diff --git a/.gitignore b/.gitignore
index 137e2d7..ea65f30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,17 +1,20 @@
 *.ipynb*
 *.ipynb
-
+models
 # local data
 output/*
 data/*
 temp*
 test*
-
+weights
 # python
 .ipynb_checkpoints
 *.ipynb
 **/__pycache__/
-
+*.filelist
+*.jsonl
+analysis
+physics_collection
 # logs
 *.log
 *.out
diff --git a/batch_running_task/README.md b/batch_running_task/README.md
new file mode 100644
index 0000000..a6d65f6
--- /dev/null
+++ b/batch_running_task/README.md
@@ -0,0 +1,91 @@
+# Inference-Accelerated PDF Parsing
+This folder contains a series of inference-acceleration modules for the original PDF parsing pipeline, including:
+- preprocessing integrated into the dataloader
+- fast postprocessing
+- torch.compile + bf16
+- tensorRT
+- [Torch-TensorRT](https://pytorch.org/TensorRT/)
+
+These engines were tested on an 80,000,000-PDF dataset and give a 5-10x speedup over the original PDF parsing engine. In practice, they reach 6-10 pages per second on a single A100 GPU.
+
+This is not a pipeline framework; it is split into three task-wise batch processing engines, which can easily be integrated into your own pipeline framework.
+- Detection (Bounding Boxes)
+- Recognition (OCR)
+- Math formula recognition (MFR)
+
+## Detection (Bounding Boxes)
+Check the unit case: 1000 PDFs take around 20-30 min
+```
+ python batch_running_task/task_layout/rough_layout.py
+```
+### LayoutLM
+ The LayoutLM model is based on `detectron2`. The main vision engine (ViT) is implemented via huggingface; the postprocessing is based on detectron.
+ There is a tensorRT version of the detectron model (https://github.com/NVIDIA/TensorRT/tree/main/samples/python/detectron2), but it only covers the Mask R-CNN backbone.
+ The tensorRT authors manually developed CUDA NMS and ROIAlign support plus tooling such as `DET2GraphSurgeon` (see https://github.com/NVIDIA/TensorRT/blob/main/samples/python/detectron2/create_onnx.py) to convert a detectron2 model into a tensorRT model.
+ For LayoutLM, there is no such tool to convert the whole model into a tensorRT engine.
+ There are several ways to accelerate the LayoutLM model:
+ - accelerate it part by part, e.g. a tensorRT ViT backbone combined with detectron ROIAlign and NMS.
+ - use torch.compile
+ - use bf16
+
+ In this repo, I use torch.compile (1.5x) and bf16 (2x) to accelerate the LayoutLM model. The tensorRT version is not implemented yet.
+
+ Another way to accelerate LayoutLM is to `avoid calling .numpy() on large GPU tensors`. The original code uses
+ ```
+ boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.numpy()
+ labels = outputs["instances"].to("cpu")._fields["pred_classes"].numpy()
+ scores = outputs["instances"].to("cpu")._fields["scores"].numpy()
+ ```
+ which copies the full tensors from GPU to CPU. Since we later only gather part of the data via `mask`, the full copy is unnecessary.
+ The better way is to slice the GPU tensor first and then copy only the sliced tensor to CPU (2x). See batch_running_task/task_layout/get_batch_layout_model.py and the sketch below.
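+ A minimal sketch of this trick (not the exact code in `get_batch_layout_model.py`; it assumes a detectron2-style `Instances` output and a hypothetical `score_threshold`):
+ ```python
+ import torch
+
+ @torch.no_grad()
+ def gather_boxes_on_gpu(outputs, score_threshold=0.5):
+     instances = outputs["instances"]            # still on the GPU
+     keep = instances.scores > score_threshold   # boolean mask computed on the GPU
+     # slice on the GPU first, then move only the kept rows to the CPU
+     boxes  = instances.pred_boxes.tensor[keep].cpu().numpy()
+     labels = instances.pred_classes[keep].cpu().numpy()
+     scores = instances.scores[keep].cpu().numpy()
+     return boxes, labels, scores
+ ```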
+
+### MFD
+ MFD (Math Formula Detection) is a simple YOLO model built with `ultralytics`, which has a good tensorRT conversion tool chain. See https://docs.ultralytics.com/modes/export/ and convension/MDF/convert.py, as well as the sketch below.
+ Download the engine via `huggingface-cli download --resume-download --local-dir-use-symlinks False LLM4SCIENCE/ultralytics-YOLO-MFD --local-dir models/MFD`. The `batchsize` and `tensorRT version==10.3.0` must match if you want to use the `trt_engine` directly.
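+ A minimal sketch of what that export could look like with the `ultralytics` API (the weight path and batch size below are placeholders; the actual conversion script is convension/MDF/convert.py):
+ ```python
+ from ultralytics import YOLO
+
+ # load the trained MFD weights (placeholder path)
+ model = YOLO("models/MFD/weights/best.pt")
+
+ # export a TensorRT engine; the export batch size and precision must match
+ # how the engine is used later, and the TensorRT version must match at runtime
+ model.export(format="engine", half=True, batch=16, device=0)
+ ```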
+
+### PaddleOCR-Det
+ PaddleOCR-Det is one of the best text detectors available.
+ However, the original Paddle detector only supports one image per batch. In our detection task every image is normalized to the same size, so the single-image Paddle detector does not fit our task. Refer to `https://github.com/WenmuZhou/PytorchOCR`: Zhou has converted PaddleOCR to pytorch, which lets us run batched detection in pytorch.
+
+ There is still a big speedup opportunity in the postprocessing of the PaddleOCR-Det module. Currently we use the DB postprocessing; see `https://github.com/PaddlePaddle/PaddleOCR/blob/main/ppocr/postprocess/db_postprocess.py`. The DB postprocessing is the slow part compared with the rest of the detection process, and there is currently no speedup solution for it.
+
+### Detection Async (experimental)
+ See `batch_running_task/task_layout/rough_layout_with_aync.py`.
+ Async detection overlaps postprocessing with GPU inference. It works well, but on a slurm system the script raises an `exit` error that can put the machine into a `CPU soft lock`, so I do not recommend using this script on slurm.
+
+## Recognition (OCR)
+ Check the unit case: 1000 PDFs take around 2-5 min
+ ```
+ python batch_running_task/task_rec/rough_rec.py
+ ```
+ PaddleOCR-Rec is one of the best text recognizers available. The original Paddle recognizer supports batched image processing and is already very fast.
+ However, I still use `PytorchOCR` in this part, simply to provide a non-paddle solution.
+ Download the engine via `huggingface-cli download --resume-download --local-dir-use-symlinks False LLM4SCIENCE/pytorch_paddle_weight --local-dir models/pytorch_paddle_weight`. The `batchsize` and `tensorRT version==10.3.0` must match if you want to use the `trt_engine` directly.
+
+## Math formula recognition (MFR)
+ Check the unit case: 1000 PDFs take around 2-5 min
+ ```
+ python batch_running_task/task_mfr/rough_mfr.py
+ ```
+ The MFR model is a `nougat`-based model named `UniMERNet`. I tried to use the Huggingface tensorRT conversion tool chain to convert the model into tensorRT, but it failed (the reshape module is not set properly). One option is to use `TensorRT-LLM`; see `https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal` and `convension/unimernet`.
+ - Notice that `TensorRT-LLM` installs `mpi4py=4.*.*` by default, which requires `mpi.so40`. The conda package from `conda install -c conda-forge openmpi` only supports `openmpi==3.*.*`, so you either need to build `openmpi` from source or simply `pip install mpi4py==3.*`.
+ - Notice that you should use `srun --mpi=pmi2` when running the script under slurm.
+
+ Download the engine via `huggingface-cli download --resume-download --local-dir-use-symlinks False LLM4SCIENCE/unimernet --local-dir models/MFR/unimernet`. The `batchsize` and `tensorRT version==10.3.0` must match if you want to use the `trt_engine` directly.
+
+ The difference between `LLM4SCIENCE/unimernet` and `wanderkid/unimernet` is that we delete the `counting` module from the weight file (it is only used during training), so it is a pure nougat model.
+
+## Batch run the task
+ Each task has a "batch_deal_with_xxx" module which automatically schedules the task.
+ For example, you can prepare a `.jsonl` file named `test.filelist` where each line looks like
+ ```
+ {"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
+ {"track_id":"64d182ba-21bf-478f-bb65-6a276aab3f4d","path":"10.1111/j.1365-2559.2006.02442.x.pdf","file_type":"pdf","content_type":"application/pdf","content_length":493629,"title":"Sensitivity and specificity of immunohistochemical antibodies used to distinguish between benign and malignant pleural disease: a systematic review of published reports","remark":{"file_id":"j.1365-2559.2006.02442.x","file_source_type":"paper","original_file_id":"10.1111/j.1365-2559.2006.02442.x","file_name":"10.1111/j.1365-2559.2006.02442.x.pdf","author":"J King; N Thatcher; C Pickering; P Hasleton"}}
+ ```
+ and then run
+ ```
+ python batch_running_task/task_layout/batch_deal_with_layout.py --root test.filelist
+ python batch_running_task/task_layout/batch_deal_with_rec.py --root test.filelist
+ python batch_running_task/task_layout/batch_deal_with_mfr.py --root test.filelist
+ ```
diff --git a/batch_running_task/batch_run.sh b/batch_running_task/batch_run.sh
new file mode 100644
index 0000000..97e5647
--- /dev/null
+++ b/batch_running_task/batch_run.sh
@@ -0,0 +1,36 @@
+
+TOTALNUM=30
+CPU_NUM=$1 # number of workers, taken from the first argument; defaults to TOTALNUM
+if [ -z "$CPU_NUM" ]; then
+    CPU_NUM=$TOTALNUM
+fi
+# check hostname: if it starts with SH, then use the SH cluster settings
+if [[ $(hostname) == SH* ]]; then
+    PARA="--quotatype=spot -p AI4Chem -N1 -c8 --gres=gpu:1"
+
+    export LD_LIBRARY_PATH=/mnt/cache/share/gcc/gcc-7.5.0/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+    export PATH=/mnt/cache/share/gcc/gcc-7.5.0/bin:$PATH
+
+else
+
+    PARA="-p vip_gpu_ailab_low -N1 -c8 --gres=gpu:1"
+fi
+SCRIPT="batch_running_task/task_rec/run_rec.sh"
+FILELIST="physics_collection/wait_for_ocr.filelist"
+
+
+START=0
+for ((CPU=0; CPU 1:
+        divided_nums = np.linspace(0, totally_paper_num , num_parts+1)
+        divided_nums = [int(s) for s in divided_nums]
+        start_index = divided_nums[index_part]
+        end_index = divided_nums[index_part + 1]
+    else:
+        start_index = 0
+        end_index = 1
+    args.start_index = start_index
+    args.end_index = end_index
+    if args.shuffle:
+        np.random.shuffle(alread_processing_file_list)
+    alread_processing_file_list = alread_processing_file_list[start_index:end_index]
+
+    return alread_processing_file_list
+
+
+def save_analysis(analysis, debug, args):
+
+    logpath = os.path.join(args.logpath,args.task_name)
+    print(logpath)
+    os.makedirs(logpath, exist_ok=True)
+    if args.num_parts > 1:
+        for key, val in analysis.items():
+            print(f"{key}=>{len(val)}")
+            fold = os.path.join(logpath,f"{key.lower()}.filelist.split")
+            os.makedirs(fold, exist_ok=True)
+            with open(os.path.join(fold,f"{args.start_index}-{args.end_index}"), 'w') as f:
+                for line in val:
+                    f.write(line+'\n')
+    else:
+        #print(analysis)
+        for key, val in analysis.items():
+            print(f"{key}=>{len(val)}")
+            if debug:
+                print(val)
+            else:
+                with open(os.path.join(logpath,f"{key.lower()}.filelist"), 'w') as f:
+                    for line in val:
+                        f.write(line+'\n')
+
\ No newline at end of file
diff --git a/batch_running_task/check_log.sh b/batch_running_task/check_log.sh
new file mode 100644
index 0000000..ab36e7a --- /dev/null +++ b/batch_running_task/check_log.sh @@ -0,0 +1,35 @@ + +for file in .log/*; +do + ### skip if it is not a file + [ -f "$file" ] || continue + ## if head -n 1 file has string `is not` then delete this file + if [ "$(tail -n 3 "$file"|head -n 1|grep -c 'is not')" -eq 1 ]; then + echo "Deleting $file" + rm -f "$file" + fi +done + +user=`whoami` +jobname='ParseSciHUB' + +runningPID=`squeue -u $user -n $jobname | awk '{print $1}'` +for log_file in .log/*; +do + ### skip if it is not a file + [ -f "$log_file" ] || continue + ## get PID from log_file, the name rule is like $PID-ParseSciHUB.out + PID=$(echo $log_file|awk -F'/' '{print $2}'|awk -F'-' '{print $1}') + ## if the PID is not in runningPID, then delete this file + if [ "$(echo $runningPID|grep -c $PID)" -eq 0 ]; then + #echo "Deleting $log_file" + rm -f "$log_file" + else + #line=$(tail -n 30 "$log_file"|grep Data|tail -n 1| sed 's/\x1B\[A//g'| tr -d '\r') + line=$(tail -n 1000 "$log_file"|grep "Images batch"|tail -n 1| sed 's/\x1B\[A//g'| tr -d '\r') + #line=$(tail -n 1000 "$log_file"|grep "[Data]"|tail -n 1| sed 's/\x1B\[A//g'| tr -d '\r') + echo $log_file $line + #grep Error "$log_file" + fi +done +#echo "$output" \ No newline at end of file diff --git a/batch_running_task/dataaccelerate.py b/batch_running_task/dataaccelerate.py new file mode 100644 index 0000000..a9ea5d7 --- /dev/null +++ b/batch_running_task/dataaccelerate.py @@ -0,0 +1,89 @@ +#pip install prefetch_generator + +# 新建DataLoaderX类 +from torch.utils.data import DataLoader +import numpy as np +import torch + +def sendall2gpu(listinlist,device): + if isinstance(listinlist,(list,tuple)): + return [sendall2gpu(_list,device) for _list in listinlist] + elif isinstance(listinlist, (dict)): + return dict([(key,sendall2gpu(val,device)) for key,val in listinlist.items()]) + elif isinstance(listinlist, np.ndarray): + return listinlist + else: + return listinlist.to(device=device, non_blocking=True) +try: + from prefetch_generator import BackgroundGenerator + class DataLoaderX(DataLoader): + def __iter__(self): + return BackgroundGenerator(super().__iter__()) +except: + pass#DataLoaderX = DataLoader +class DataSimfetcher(): + def __init__(self, loader, device='auto'): + + if device == 'auto': + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + self.device = device + self.loader = iter(loader) + + def next(self): + try: + + self.batch = next(self.loader) + self.batch = sendall2gpu(self.batch,self.device) + except StopIteration: + self.batch = None + return self.batch +class DataPrefetcher(): + def __init__(self, loader, device='auto'): + if device == 'auto': + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + self.device = device#raise NotImplementedError + self.loader = iter(loader) + self.stream = torch.cuda.Stream() + # With Amp, it isn't necessary to manually convert data to half. + # if args.fp16: + # self.mean = self.mean.half() + # self.std = self.std.half() + self.preload() + + def preload(self): + try: + self.batch = next(self.loader) + except StopIteration: + self.batch = None + return + with torch.cuda.stream(self.stream): + self.batch = sendall2gpu(self.batch,self.device) + # With Amp, it isn't necessary to manually convert data to half. 
+ # if args.fp16: + # self.next_input = self.next_input.half() + # else: + # self.next_input = self.next_input.float() + + def next(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + self.preload() + return batch + +class infinite_batcher: + def __init__(self,data_loader, device='auto'): + self.length=len(data_loader) + self.now=-1 + self.data_loader= data_loader + self.prefetcher = None + self.device = device + def next(self): + if (self.now >= self.length) or (self.now == -1): + if self.prefetcher is not None:del self.prefetcher + self.prefetcher = DataSimfetcher(self.data_loader,device=self.device) + self.now=0 + self.now+=1 + return self.prefetcher.next() + diff --git a/batch_running_task/get_data_utils.py b/batch_running_task/get_data_utils.py new file mode 100644 index 0000000..6b66318 --- /dev/null +++ b/batch_running_task/get_data_utils.py @@ -0,0 +1,615 @@ + +import json +import requests +import io +import os +import fitz +fitz.TOOLS.mupdf_display_errors(on=False) + + +def clean_pdf_path(pdf_path): + return pdf_path[len("opendata:"):] if pdf_path.startswith("opendata:") else pdf_path + +def read_json_from_path(path, client): + if "s3:" in path: + buffer = client.get(path).replace(b'\x00', b'\n') + if path.endswith('.json'): + return json.loads(buffer) + elif path.endswith('.jsonl'): + whole_data = [] + for t in io.BytesIO(buffer).readlines(): + try: + data = json.loads(t) + except: + print(t) + raise + whole_data.append(data) + return whole_data + else: + return {'content':str(buffer)} + elif path.startswith('http'): + response = requests.get(path) + if response.status_code == 200: + content = response.json()["content"] + if path.endswith('.json'): + content = json.loads(content) + elif path.endswith('.md'): + content = {'content':content} + return content + else: + return None + else: + if path.endswith('.json'): + with open(path,'r') as f: + data = json.load(f) + return data + elif path.endswith('.jsonl'): + with open(path,'r') as f: + whole_data = [] + for t in f.readlines(): + data = json.loads(t.strip()) + whole_data.append(data) + return whole_data + else: + raise NotImplementedError("please use json or jsonl file") + + + +def write_jsonj_to_path(data, path, client): + if "s3:" in path: + byte_object = json.dumps(data).encode('utf-8') + with io.BytesIO(byte_object) as f: + client.put(path, f) + else: + assert not path.startswith('http'), "why you want to save the file to a online path?" + thedir = os.path.dirname(path) + os.makedirs(thedir, exist_ok=True) + with open(path,'w') as f: + json.dump(data, f) + +def write_json_to_path(data, path, client): + if "s3:" in path: + byte_object = json.dumps(data).encode('utf-8') + with io.BytesIO(byte_object) as f: + client.put(path, f) + else: + assert not path.startswith('http'), "why you want to save the file to a online path?" + thedir = os.path.dirname(path) + os.makedirs(thedir, exist_ok=True) + with open(path,'w') as f: + json.dump(data, f) +def write_jsonl_to_path(data, path, client): + byte_object = "\n".join([json.dumps(d) for d in data]) + if "s3:" in path: + with io.BytesIO(byte_object.encode('utf-8')) as f: + client.put(path, f) + else: + assert not path.startswith('http'), "why you want to save the file to a online path?" 
+ thedir = os.path.dirname(path) + if thedir: + os.makedirs(thedir, exist_ok=True) + with open(path,'w') as f: + for d in data: + try: + byte_object = json.dumps(d) + except: + + raise NotImplementedError(f"fail to dump {d}") + f.write(byte_object+'\n') + + +import boto3 +from botocore.client import Config +class MyFastS2client: + def __init__(self, ACCESS_KEY, SECRET_KEY,ENDPOINT): + + session = boto3.Session( + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY, + region_name='' + ) + + # Create an S3 client + self.s3 = session.client( + 's3', + endpoint_url=ENDPOINT, + config=Config(signature_version='s3v4') # Ensure compatibility + ) + + def get(self, path): + # path is like opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/part-66210c190659-000026.jsonl + # obtain bucket_name and object_key + bucket_name = path.split("//")[1].split("/")[0] + object_key = "/".join(path.split("//")[1].split("/")[1:]) + response = self.s3.get_object(Bucket=bucket_name, Key=object_key) + return response['Body'].read() + + def put(self, path,data): + # path is like opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/part-66210c190659-000026.jsonl + # obtain bucket_name and object_key + bucket_name = path.split("//")[1].split("/")[0] + object_key = "/".join(path.split("//")[1].split("/")[1:]) + + self.s3.put_object(Bucket=bucket_name, Key=object_key, Body=data) + + def contains(self, path): + # path is like opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/part-66210c190659-000026.jsonl + # obtain bucket_name and object_key + bucket_name = path.split("//")[1].split("/")[0] + object_key = "/".join(path.split("//")[1].split("/")[1:]) + try: + self.s3.head_object(Bucket=bucket_name, Key=object_key) + return True + except: + return False +def build_client(): + #print(f"we will building ceph client...................") + + try: + from petrel_client.client import Client # 安装完成后才可导入 + client = Client(conf_path="~/petreloss.conf") # 实例化Petrel Client,然后就可以调用下面的APIs + except: + + ### get key and endpoint from local .client.conf + with open(".client.conf",'r') as f: + lines = f.readlines() + for line in lines: + if "key" in line: + ACCESS_KEY = line.split("=")[1].strip() + if "secret" in line: + SECRET_KEY = line.split("=")[1].strip() + if "endpoint" in line: + ENDPOINT = line.split("=")[1].strip() + client = MyFastS2client(ACCESS_KEY, SECRET_KEY, ENDPOINT) # 实例化Petrel Client,然后就可以调用下面的APIs + + + #print(f"done..................") + return client + +def check_path_exists(path,client): + #print(path) + if "s3:" in path: + return client.contains(path) + elif path.startswith('http'): + assert 'get_data' in path, "please use get_data flag for data path" + response = requests.get(path.replace('get_data','checkfile')) + if response.status_code == 200: + status = response.json()["status"] + return status + else: + return False + else: + return os.path.exists(path) + +def check_lock_exists(path, client): + if "s3:" in path: + raise NotImplementedError("s3 lock not implemented") + elif path.startswith('http'): + assert 'get_data' in path, "please use get_data flag for data path" + response = requests.get(path.replace('get_data','checklock')) + if response.status_code == 200: + status = response.json()["status"] + return status + else: + return False + else: + raise NotImplementedError("please donot use lock lock") + return os.path.exists(path) + +def check_lock_and_last_start_time(path, client): + if "s3:" in path: + raise NotImplementedError(f"s3 lock not implemented. 
Now path {path}") + elif path.startswith('http'): + assert 'checklocktime' in path, "please use `checklocktime` flag for data path" + response = requests.get(path) + if response.status_code == 200: + content = response.json() + if not content["status"]:return False + return content['start_time'] + else: + return False + else: + raise NotImplementedError("s3 lock not implemented") + +def create_last_start_time_lock(path, client): + if "s3:" in path: + raise NotImplementedError("s3 lock not implemented") + elif path.startswith('http'): + assert 'createlocktime' in path, "please use `createlocktime` flag for data path" + response = requests.get(path) + else: + raise NotImplementedError("s3 lock not implemented") + +from PIL import Image +import numpy as np +UNIFIED_WIDTH = 1472 # lets always make the oimage in such size +UNIFIED_HEIGHT = 1920 # lets always make the oimage in such size +def pad_image_to_ratio(image, output_width = UNIFIED_WIDTH,output_height=UNIFIED_HEIGHT, ): + """ + Pads the given PIL.Image object to fit the specified width-height ratio + by adding padding only to the bottom and right sides. + + :param image: PIL.Image object + :param target_ratio: Desired width/height ratio (e.g., 16/9) + :return: New PIL.Image object with the padding applied + """ + # Original dimensions + input_width, input_height = image.size + height = min(input_height, output_height) + width = min(input_width, output_width) + + if output_height == input_height and output_width == input_width: + return image + + if input_height / output_height > input_width / output_width: + # Resize to match height, width will be smaller than output_width + height = output_height + width = int(input_width * output_height / input_height) + else: + # Resize to match width, height will be smaller than output_height + width = output_width + height = int(input_height * output_width / input_width) + image= image.resize((width, height), resample=3) + # Create new image with target dimensions and a white background + new_image = Image.new("RGB", (output_width, output_height), (255, 255, 255)) + new_image.paste(image, (0, 0)) + + return new_image + +def process_pdf_page_to_image(page, dpi,output_width=UNIFIED_WIDTH,output_height=UNIFIED_HEIGHT): + pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72)) + if pix.width > 3000 or pix.height > 3000: + pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) + image = Image.frombytes('RGB', (pix.width, pix.height), pix.samples) + else: + image = Image.frombytes('RGB', (pix.width, pix.height), pix.samples) + + image = pad_image_to_ratio(image, output_width = output_width,output_height=output_height) + + image = np.array(image)[:,:,::-1] + return image.copy() + +def read_pdf_from_path(path, client): + if "s3:" in path: + buffer = client.get(path) + return fitz.open(stream = buffer, filetype="pdf") + else: + return fitz.open(path) + +import pymupdf +class DatasetUtils: + client = None + last_read_pdf_buffer={} + def smart_read_json(self, json_path): + if "s3:" in json_path and self.client is None: self.client = build_client() + if json_path.startswith("s3:"): json_path = "opendata:"+ json_path + return read_json_from_path(json_path, self.client) + + def smart_write_json(self, data, targetpath): + if "s3:" in targetpath and self.client is None: self.client = build_client() + if json_path.startswith("s3:"): json_path = "opendata:"+ json_path + write_json_to_path(data, targetpath, self.client) + + def check_path_exists(self, path): + if "s3:" in path and self.client is None: 
self.client = build_client() + if path.startswith("s3:"): path = "opendata:"+ path + return check_path_exists(path, self.client) + + def smart_load_pdf(self, pdf_path): + if "s3:" in pdf_path and self.client is None: self.client = build_client() + if pdf_path.startswith("s3:"): pdf_path = "opendata:"+ pdf_path + with self.timer("smart_load_pdf"): + try: + pdf_buffer = read_pdf_from_path(pdf_path, self.client) + except pymupdf.mupdf.FzErrorFormat: + print(f""" + ======================================== + error in loading pdf {pdf_path}, we pass + ======================================== + """) + pdf_buffer = None + except Exception as e: + print(f"error in loading pdf {pdf_path}") + print(e) + raise + return pdf_buffer + + def clean_pdf_buffer(self): + return + keys = list(self.last_read_pdf_buffer.keys()) + for key in keys: + if self.last_read_pdf_buffer[key] is not None: + self.last_read_pdf_buffer[key].close() + del self.last_read_pdf_buffer[key] + + + def get_pdf_buffer(self,path, buffer_num=1): + if "s3:" in path and self.client is None: self.client = build_client() + if path.startswith("s3:"): path = "opendata:"+ path + if path not in self.last_read_pdf_buffer: + if buffer_num is not None and len(self.last_read_pdf_buffer) >= buffer_num: + self.clean_pdf_buffer() + self.last_read_pdf_buffer[path] = self.smart_load_pdf(path) + + pdf_buffer = self.last_read_pdf_buffer[path] + return pdf_buffer + + +from tqdm.auto import tqdm +import json,os +from multiprocessing import Pool +FILEROOT = "page_num_map" +def process_file(filename): + with open(os.path.join(FILEROOT, filename)) as f: + data = json.load(f) + return data +def get_page_num_map_whole(): + + page_num_map_whole = {} + files = os.listdir(FILEROOT) + num_thread=4 + print("to get page num map whole") + with Pool(num_thread) as pool: + results = list(tqdm(pool.imap(process_file, files), total=len(files))) + + for result in results: + page_num_map_whole.update(result) + return page_num_map_whole + +output_width =1472 #pdf_metadata['width']#1472 +output_height=1920 #pdf_metadata['height']#1920 +import sys +sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(__file__)))) + +from batch_running_task.utils import convert_boxes +def build_dict(pdf_metadata_list, track_id_key = "track_id"): + pdf_metadata_dict = {} + for pdf_metadata in pdf_metadata_list: + track_id = pdf_metadata[track_id_key] + height = pdf_metadata.get('height', 1920) + width = pdf_metadata.get('width',1472) + if height == output_height and width == output_width: + pass + else: + ### lets do the bbox convertion + doc_layout_result=pdf_metadata['doc_layout_result'] + for pdf_page_metadata in doc_layout_result: + page_id = pdf_page_metadata['page_id'] + layout_dets = [] + for res in pdf_page_metadata["layout_dets"]: + new_res = res.copy() + xmin, ymin = int(res['poly'][0]), int(res['poly'][1]) + xmax, ymax = int(res['poly'][4]), int(res['poly'][5]) + bbox= [xmin, ymin, xmax, ymax] + bbox= convert_boxes([bbox], pdf_metadata['width'], pdf_metadata['height'], output_width, output_height)[0] + poly= [bbox[0], bbox[1], bbox[2], bbox[1], bbox[2], bbox[3], bbox[0], bbox[3]] + res['poly'] = poly + page_id_to_metadata = {pdf_page_metadata['page_id']: pdf_page_metadata for pdf_page_metadata in pdf_metadata['doc_layout_result']} + pdf_metadata_dict[track_id] = page_id_to_metadata + + return pdf_metadata_dict + +def read_data_with_patch(result_path, client): + if result_path.startswith("s3:"): + result_path = "opendata:"+result_path + pdf_path_map_to_page_num = [] + #assert 
"layoutV" in result_path + filename = os.path.basename(result_path) + patch_path = os.path.join(os.path.dirname(os.path.dirname(result_path)),"det_patch_good",filename) + missingpath= os.path.join(os.path.dirname(os.path.dirname(result_path)),"fix_missing_page_version2",filename) + # mfr_patchpath = os.path.join(os.path.dirname(os.path.dirname(result_path)),"mfr_patch",filename) + # mfr_patch_bf16path = os.path.join(os.path.dirname(os.path.dirname(result_path)),"mfr_patch_bf16",filename) + # rec_patchpath = os.path.join(os.path.dirname(os.path.dirname(result_path)),"rec_patch",filename) + + assert check_path_exists(result_path,client) + #tqdm.write("reading result") + result = read_json_from_path(result_path,client) + result_dict = build_dict(result) + + patch_add_dict = build_dict(read_json_from_path(patch_path,client)) if check_path_exists(patch_path,client) else {} + + missing_dict = build_dict(read_json_from_path(missingpath,client)) if check_path_exists(missingpath,client) else {} + # mfr_patch_dict = build_dict(read_json_from_path(mfr_patchpath,client)) if check_path_exists(mfr_patchpath,client) else {} + # mfr_patch_bf16_dict = build_dict(read_json_from_path(mfr_patch_bf16path,client)) if check_path_exists(mfr_patch_bf16path,client) else {} + # rec_patch_dict = build_dict(read_json_from_path(rec_patchpath,client)) if check_path_exists(rec_patchpath,client) else {} + + #tqdm.write("reading done") + if len(patch_add_dict) == 0 and len(missing_dict) == 0: + #tqdm.write(f"no patch and missing for {result_path}") + pass + else: + + for track_id, pdf_metadata in result_dict.items(): + for patch_dict in [patch_add_dict, missing_dict]: + if track_id in patch_dict: + patch_pdf_metadata = patch_dict[track_id] + for page_id, pdf_page_metadata in patch_pdf_metadata.items(): + if page_id in pdf_metadata: + ## then merge page result + pdf_metadata[page_id]["layout_dets"].extend(pdf_page_metadata["layout_dets"]) + else: + pdf_metadata[page_id] = pdf_page_metadata + for pdf_metadata in result: + track_id = pdf_metadata['track_id'] + pdf_metadata['height'] = output_height + pdf_metadata['width'] = output_width + doc_layout_result = [] + for page_id, pdf_page_metadata in result_dict[track_id].items(): + doc_layout_result.append(pdf_page_metadata) + pdf_metadata['doc_layout_result'] = doc_layout_result + return result + +def merge_rec_result(pdf_metadata, rec_patch_dict, track_id_key = "path"): + track_id = pdf_metadata[track_id_key] + if track_id in rec_patch_dict: + current_rec_patch = rec_patch_dict[track_id] + else: + return + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + bbox_count = 0 + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id'] != 15:continue + bbox_count+=1 + if bbox_count == 0: continue + patch_rec_list = current_rec_patch[page_id]["layout_dets"] + assert len(patch_rec_list) == bbox_count, f"pdf={track_id} page={page_id} => bbox count {bbox_count} not equal to patch count {len(patch_rec_list)}" + bbox_id = 0 + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id'] != 15:continue + bbox_metadata.update(patch_rec_list[bbox_id]) + bbox_id += 1 + +def merge_mfr_result(pdf_metadata, mfr_patch_dict, track_id_key = "path"): + track_id = pdf_metadata[track_id_key] + if track_id in mfr_patch_dict: + current_mfr_patch = mfr_patch_dict[track_id] + else: + return + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + 
bbox_count = 0 + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id'] not in [13, 14]:continue + bbox_count+=1 + if bbox_count == 0: continue + patch_mfr_list = current_mfr_patch[page_id]["layout_dets"] + assert len(patch_mfr_list) == bbox_count, f"pdf={track_id} page={page_id} => bbox count {bbox_count} not equal to patch count {len(patch_mfr_list)}" + bbox_id = 0 + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id'] not in [13, 14]:continue + bbox_metadata.update(patch_mfr_list[bbox_id]) + bbox_id += 1 + +def read_data_with_mfr(result_path, client): + if result_path.startswith("s3:"): + result_path = "opendata:"+result_path + + filename = os.path.basename(result_path) + + mfr_patchpath = os.path.join(os.path.dirname(os.path.dirname(result_path)),"mfr_patch",filename) + mfr_patch_bf16path = os.path.join(os.path.dirname(os.path.dirname(result_path)),"mfr_patch_bf16",filename) + rec_patchpath = os.path.join(os.path.dirname(os.path.dirname(result_path)),"rec_patch",filename) + + assert check_path_exists(result_path,client) + #tqdm.write("reading result") + result = read_json_from_path(result_path,client) + + track_id_key = 'path' + mfr_patch_dict = build_dict(read_json_from_path(mfr_patchpath,client),track_id_key = track_id_key) if check_path_exists(mfr_patchpath,client) else {} + mfr_patch_bf16_dict= build_dict(read_json_from_path(mfr_patch_bf16path,client),track_id_key = track_id_key) if check_path_exists(mfr_patch_bf16path,client) else {} + mfr_patch_dict.update(mfr_patch_bf16_dict) + if len(mfr_patch_dict)>0: + for pdf_metadata in tqdm(result, desc="adding patch and missing", leave=False, position=3): + merge_mfr_result(pdf_metadata, mfr_patch_dict) + + track_id_key = 'path' + rec_patch_dict = build_dict(read_json_from_path(rec_patchpath,client),track_id_key = track_id_key) if check_path_exists(rec_patchpath,client) else {} + if len(rec_patch_dict)>0: + for pdf_metadata in tqdm(result, desc="[REC] adding patch and missing", leave=False, position=3): + merge_rec_result(pdf_metadata, rec_patch_dict, track_id_key=track_id_key) + + return result + + +def read_data_with_version(result_path, client): + if result_path.startswith("s3:"): + result_path = "opendata:"+result_path + #assert "layoutV" in result_path + filename = os.path.basename(result_path) + rootpath = os.path.dirname(os.path.dirname(result_path)) + version1 = os.path.join(rootpath,"add_mfr",filename) + version2 = os.path.join(rootpath,"rec_fixed_final",filename) + version3 = os.path.join(rootpath,"fix_missing_page_version2",filename) + + assert check_path_exists(result_path,client) + #tqdm.write("reading result") + result = read_json_from_path(result_path,client) + result_dict = build_dict(result) + patch_version1_dict = build_dict(read_json_from_path(version1,client)) if check_path_exists(version1,client) else {} + patch_version2_dict = build_dict(read_json_from_path(version2,client)) if check_path_exists(version2,client) else {} + patch_version3_dict = build_dict(read_json_from_path(version3,client)) if check_path_exists(version3,client) else {} + + + #tqdm.write("reading done") + for track_id, pdf_metadata in result_dict.items(): + for patch_dict in [patch_version1_dict, patch_version2_dict, patch_version3_dict]: + if track_id in patch_dict: + patch_pdf_metadata = patch_dict[track_id] + for page_id, pdf_page_metadata in patch_pdf_metadata.items(): + if page_id in pdf_metadata: + assert len(pdf_page_metadata["layout_dets"]) == 
len(pdf_metadata[page_id]["layout_dets"]), f"pdf={track_id} page={page_id} => bbox count {len(pdf_metadata[page_id]['layout_dets'])} not equal to patch count {len(pdf_page_metadata['layout_dets'])}" + for box1_dict, box2_dict in zip(pdf_metadata[page_id]["layout_dets"], pdf_page_metadata["layout_dets"]): + assert box1_dict['category_id'] == box2_dict['category_id'], f"pdf={track_id} page={page_id} => category_id {box1_dict['category_id']} not equal to patch category_id {box2_dict['category_id']}" + assert box1_dict['poly'] == box2_dict['poly'], f"pdf={track_id} page={page_id} => poly {box1_dict['poly']} not equal to patch poly {box2_dict['poly']}" + if box1_dict['category_id'] == 15: + if box2_dict.get('text',"") == "":continue + if box1_dict.get('text',"") == "": + box1_dict['text'] = box2_dict.get('text',"") + + else: + assert box1_dict['text'] == box2_dict['text'], f"pdf={track_id} page={page_id} => text {box1_dict['text']} not equal to patch text {box2_dict['text']}" + + if box1_dict['category_id'] in {13, 14}: + if box2_dict.get('latex',"") == "":continue + if box1_dict.get('latex',"") == "": + box1_dict['latex'] = box2_dict['latex'] + else: + assert box1_dict['latex'] == box2_dict['latex'], f"pdf={track_id} page={page_id} => latex {box1_dict['latex']} not equal to patch latex {box2_dict['latex']}" + box1_dict.update(box2_dict) + else: + pdf_metadata[page_id] = pdf_page_metadata + + for pdf_metadata in result: + track_id = pdf_metadata['track_id'] + pdf_metadata['height'] = output_height + pdf_metadata['width'] = output_width + doc_layout_result = [] + for page_id, pdf_page_metadata in result_dict[track_id].items(): + doc_layout_result.append(pdf_page_metadata) + pdf_metadata['doc_layout_result'] = doc_layout_result + + print(len(result)) + # mfr_patch_dict = build_dict(read_json_from_path(mfr_patchpath,client)) if check_path_exists(mfr_patchpath,client) else {} + # mfr_patch_bf16_dict = build_dict(read_json_from_path(mfr_patch_bf16path,client)) if check_path_exists(mfr_patch_bf16path,client) else {} + # rec_patch_dict = build_dict(read_json_from_path(rec_patchpath,client)) if check_path_exists(rec_patchpath,client) else {} + + return result + +class PackStatus: + whole_layout_complete = 'whole_layout_complete' + whole_ocr_complete = 'whole_ocr_complete' + layout_not_complete = 'layout_not_complete' + better_redo = 'better_redo' + better_addon = 'better_addon' + check_the_page_information = 'check_the_page_information' + +packstatus= PackStatus() + +class PDFSTATUS: + layout_not_complete = 'cN' + layout_has_complete = 'cA' + layout_complete_and_ocr_finished = 'cF' + layout_complete_without_ocr = 'cT' + + +pdf_status = PDFSTATUS() + +class PAGESTATUS: + layout_complete_and_ocr_finished = 'bF' + layout_complete_and_ocr_only_for_mfd = 'bM' + layout_complete_and_ocr_only_for_rec = 'bR' + layout_complete = 'bP' + only_have_15 = 'bI' + only_have_layout = 'bK' + no012467 = 'bA' + none = 'bN' + +page_status = PAGESTATUS() +class BOXSTATUS: + has_category_layout = 'a1' + has_category_mfd_and_get_mfr = 'b1' + has_category_mfd_without_mfr = 'c1' + has_category_rec_without_rec = 'd1' + has_category_rec_and_get_rec = 'e1' +boxstatus = BOXSTATUS() diff --git a/batch_running_task/pytorchocr/__init__.py b/batch_running_task/pytorchocr/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/batch_running_task/pytorchocr/base_ocr_v20.py b/batch_running_task/pytorchocr/base_ocr_v20.py new file mode 100644 index 0000000..3c7eb1a --- /dev/null +++ 
b/batch_running_task/pytorchocr/base_ocr_v20.py @@ -0,0 +1,112 @@ +import os, sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from collections import OrderedDict +import numpy as np +import cv2 +import torch + +from pytorchocr.modeling.architectures.base_model import BaseModel + +class BaseOCRV20: + def __init__(self, config, **kwargs): + self.config = config + self.build_net(**kwargs) + self.net.eval() + + + def build_net(self, **kwargs): + self.net = BaseModel(self.config, **kwargs) + + + def load_paddle_weights(self, weights_path): + raise NotImplementedError('implemented in converter.') + print('paddle weights loading...') + import paddle.fluid as fluid + with fluid.dygraph.guard(): + para_state_dict, opti_state_dict = fluid.load_dygraph(weights_path) + + for k,v in self.net.state_dict().items(): + name = k + + if name.endswith('num_batches_tracked'): + continue + + if name.endswith('running_mean'): + ppname = name.replace('running_mean', '_mean') + elif name.endswith('running_var'): + ppname = name.replace('running_var', '_variance') + elif name.endswith('bias') or name.endswith('weight'): + ppname = name + elif 'lstm' in name: + ppname = name + + else: + print('Redundance:') + print(name) + raise ValueError + try: + if ppname.endswith('fc.weight'): + self.net.state_dict()[k].copy_(torch.Tensor(para_state_dict[ppname].T)) + else: + self.net.state_dict()[k].copy_(torch.Tensor(para_state_dict[ppname])) + except Exception as e: + print('pytorch: {}, {}'.format(k, v.size())) + print('paddle: {}, {}'.format(ppname, para_state_dict[ppname].shape)) + raise e + + print('model is loaded: {}'.format(weights_path)) + + def read_pytorch_weights(self, weights_path): + if not os.path.exists(weights_path): + raise FileNotFoundError('{} is not existed.'.format(weights_path)) + weights = torch.load(weights_path) + return weights + + def get_out_channels(self, weights): + if list(weights.keys())[-1].endswith('.weight') and len(list(weights.values())[-1].shape) == 2: + out_channels = list(weights.values())[-1].numpy().shape[1] + else: + out_channels = list(weights.values())[-1].numpy().shape[0] + return out_channels + + def load_state_dict(self, weights): + self.net.load_state_dict(weights) + print('weights is loaded.') + + def load_pytorch_weights(self, weights_path): + self.net.load_state_dict(torch.load(weights_path)) + print('model is loaded: {}'.format(weights_path)) + + + def save_pytorch_weights(self, weights_path): + try: + torch.save(self.net.state_dict(), weights_path, _use_new_zipfile_serialization=False) + except: + torch.save(self.net.state_dict(), weights_path) # _use_new_zipfile_serialization=False for torch>=1.6.0 + print('model is saved: {}'.format(weights_path)) + + + def print_pytorch_state_dict(self): + print('pytorch:') + for k,v in self.net.state_dict().items(): + print('{}----{}'.format(k,type(v))) + + def read_paddle_weights(self, weights_path): + import paddle.fluid as fluid + with fluid.dygraph.guard(): + para_state_dict, opti_state_dict = fluid.load_dygraph(weights_path) + return para_state_dict, opti_state_dict + + def print_paddle_state_dict(self, weights_path): + import paddle.fluid as fluid + with fluid.dygraph.guard(): + para_state_dict, opti_state_dict = fluid.load_dygraph(weights_path) + print('paddle"') + for k,v in para_state_dict.items(): + print('{}----{}'.format(k,type(v))) + + + def inference(self, inputs): + with torch.no_grad(): + infer = self.net(inputs) + return infer diff --git 
a/batch_running_task/pytorchocr/data/__init__.py b/batch_running_task/pytorchocr/data/__init__.py new file mode 100644 index 0000000..4bc5c34 --- /dev/null +++ b/batch_running_task/pytorchocr/data/__init__.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import sys +import numpy as np +# import paddle +import signal +import random + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) + + +import copy +# from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler +# import paddle.distributed as dist + +from pytorchocr.data.imaug import transform, create_operators +# from pytorchocr.data.simple_dataset import SimpleDataSet +# from pytorchocr.data.lmdb_dataset import LMDBDateSet + diff --git a/batch_running_task/pytorchocr/data/imaug/__init__.py b/batch_running_task/pytorchocr/data/imaug/__init__.py new file mode 100644 index 0000000..dce5011 --- /dev/null +++ b/batch_running_task/pytorchocr/data/imaug/__init__.py @@ -0,0 +1,48 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +# from .iaa_augment import IaaAugment +# from .make_border_map import MakeBorderMap +# from .make_shrink_map import MakeShrinkMap +# from .random_crop_data import EastRandomCropData, PSERandomCrop + +# from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg +# from .randaugment import RandAugment +from .operators import * +# from .label_ops import * + +# from .east_process import * +# from .sast_process import * +from .gen_table_mask import * + +def transform(data, ops=None): + """ transform """ + if ops is None: + ops = [] + for op in ops: + data = op(data) + if data is None: + return None + return data + + +def create_operators(op_param_list, global_config=None): + """ + create operators based on the config + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(op_param_list, list), ('operator config should be a list') + ops = [] + for operator in op_param_list: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + if global_config is not None: + param.update(global_config) + op = eval(op_name)(**param) + ops.append(op) + return ops \ No newline at end of file diff --git a/batch_running_task/pytorchocr/data/imaug/gen_table_mask.py b/batch_running_task/pytorchocr/data/imaug/gen_table_mask.py new file mode 100644 index 0000000..2b14e7a --- /dev/null +++ b/batch_running_task/pytorchocr/data/imaug/gen_table_mask.py @@ -0,0 +1,245 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np + + +class GenTableMask(object): + """ gen table mask """ + + def __init__(self, shrink_h_max, shrink_w_max, mask_type=0, **kwargs): + self.shrink_h_max = 5 + self.shrink_w_max = 5 + self.mask_type = mask_type + + def projection(self, erosion, h, w, spilt_threshold=0): + # 水平投影 + projection_map = np.ones_like(erosion) + project_val_array = [0 for _ in range(0, h)] + + for j in range(0, h): + for i in range(0, w): + if erosion[j, i] == 255: + project_val_array[j] += 1 + # 根据数组,获取切割点 + start_idx = 0 # 记录进入字符区的索引 + end_idx = 0 # 记录进入空白区域的索引 + in_text = False # 是否遍历到了字符区内 + box_list = [] + for i in range(len(project_val_array)): + if in_text == False and project_val_array[i] > spilt_threshold: # 进入字符区了 + in_text = True + start_idx = i + elif project_val_array[i] <= spilt_threshold and in_text == True: # 进入空白区了 + end_idx = i + in_text = False + if end_idx - start_idx <= 2: + continue + box_list.append((start_idx, end_idx + 1)) + + if in_text: + box_list.append((start_idx, h - 1)) + # 绘制投影直方图 + for j in range(0, h): + for i in range(0, project_val_array[j]): + projection_map[j, i] = 0 + return box_list, projection_map + + def projection_cx(self, box_img): + box_gray_img = cv2.cvtColor(box_img, cv2.COLOR_BGR2GRAY) + h, w = box_gray_img.shape + # 灰度图片进行二值化处理 + ret, thresh1 = cv2.threshold(box_gray_img, 200, 255, cv2.THRESH_BINARY_INV) + # 纵向腐蚀 + if h < w: + kernel = np.ones((2, 1), np.uint8) + erode = cv2.erode(thresh1, kernel, iterations=1) + else: + erode = thresh1 + # 水平膨胀 + kernel = np.ones((1, 5), np.uint8) + erosion = cv2.dilate(erode, kernel, iterations=1) + # 水平投影 + projection_map = np.ones_like(erosion) + project_val_array = [0 for _ in range(0, h)] + + for j in range(0, h): + for i in range(0, w): + if erosion[j, i] == 255: + project_val_array[j] += 1 + # 根据数组,获取切割点 + start_idx = 0 # 记录进入字符区的索引 + end_idx = 0 # 记录进入空白区域的索引 + in_text = False # 是否遍历到了字符区内 + box_list = [] + spilt_threshold = 0 + for i in range(len(project_val_array)): + if in_text == False and project_val_array[i] > spilt_threshold: # 进入字符区了 + in_text = True + start_idx = i + elif project_val_array[i] <= spilt_threshold and in_text == True: # 进入空白区了 + end_idx = i + in_text = False + if end_idx - start_idx <= 2: + continue + box_list.append((start_idx, end_idx + 1)) + + if in_text: + box_list.append((start_idx, h - 1)) + # 绘制投影直方图 + for j in range(0, h): + for i in range(0, project_val_array[j]): + projection_map[j, i] = 0 + split_bbox_list = [] + if len(box_list) > 1: + for i, (h_start, h_end) in enumerate(box_list): + if i == 0: + h_start = 0 + if i == len(box_list): + h_end = h + word_img = erosion[h_start:h_end + 1, :] + word_h, word_w = word_img.shape + w_split_list, w_projection_map = self.projection(word_img.T, word_w, word_h) + w_start, w_end = w_split_list[0][0], w_split_list[-1][1] + if h_start > 0: + h_start -= 1 + h_end += 1 + word_img = box_img[h_start:h_end + 1:, w_start:w_end + 1, :] + split_bbox_list.append([w_start, h_start, w_end, h_end]) + else: + split_bbox_list.append([0, 0, w, h]) + return split_bbox_list + + def shrink_bbox(self, bbox): + left, top, right, bottom = bbox + sh_h = min(max(int((bottom - top) * 0.1), 1), self.shrink_h_max) + sh_w = min(max(int((right - left) * 0.1), 1), self.shrink_w_max) + left_new = left + sh_w + right_new = right - sh_w + top_new = top + sh_h + bottom_new = 
bottom - sh_h + if left_new >= right_new: + left_new = left + right_new = right + if top_new >= bottom_new: + top_new = top + bottom_new = bottom + return [left_new, top_new, right_new, bottom_new] + + def __call__(self, data): + img = data['image'] + cells = data['cells'] + height, width = img.shape[0:2] + if self.mask_type == 1: + mask_img = np.zeros((height, width), dtype=np.float32) + else: + mask_img = np.zeros((height, width, 3), dtype=np.float32) + cell_num = len(cells) + for cno in range(cell_num): + if "bbox" in cells[cno]: + bbox = cells[cno]['bbox'] + left, top, right, bottom = bbox + box_img = img[top:bottom, left:right, :].copy() + split_bbox_list = self.projection_cx(box_img) + for sno in range(len(split_bbox_list)): + split_bbox_list[sno][0] += left + split_bbox_list[sno][1] += top + split_bbox_list[sno][2] += left + split_bbox_list[sno][3] += top + + for sno in range(len(split_bbox_list)): + left, top, right, bottom = split_bbox_list[sno] + left, top, right, bottom = self.shrink_bbox([left, top, right, bottom]) + if self.mask_type == 1: + mask_img[top:bottom, left:right] = 1.0 + data['mask_img'] = mask_img + else: + mask_img[top:bottom, left:right, :] = (255, 255, 255) + data['image'] = mask_img + return data + + +class ResizeTableImage(object): + def __init__(self, max_len, **kwargs): + super(ResizeTableImage, self).__init__() + self.max_len = max_len + + def get_img_bbox(self, cells): + bbox_list = [] + if len(cells) == 0: + return bbox_list + cell_num = len(cells) + for cno in range(cell_num): + if "bbox" in cells[cno]: + bbox = cells[cno]['bbox'] + bbox_list.append(bbox) + return bbox_list + + def resize_img_table(self, img, bbox_list, max_len): + height, width = img.shape[0:2] + ratio = max_len / (max(height, width) * 1.0) + resize_h = int(height * ratio) + resize_w = int(width * ratio) + img_new = cv2.resize(img, (resize_w, resize_h)) + bbox_list_new = [] + for bno in range(len(bbox_list)): + left, top, right, bottom = bbox_list[bno].copy() + left = int(left * ratio) + top = int(top * ratio) + right = int(right * ratio) + bottom = int(bottom * ratio) + bbox_list_new.append([left, top, right, bottom]) + return img_new, bbox_list_new + + def __call__(self, data): + img = data['image'] + if 'cells' not in data: + cells = [] + else: + cells = data['cells'] + bbox_list = self.get_img_bbox(cells) + img_new, bbox_list_new = self.resize_img_table(img, bbox_list, self.max_len) + data['image'] = img_new + cell_num = len(cells) + bno = 0 + for cno in range(cell_num): + if "bbox" in data['cells'][cno]: + data['cells'][cno]['bbox'] = bbox_list_new[bno] + bno += 1 + data['max_len'] = self.max_len + return data + + +class PaddingTableImage(object): + def __init__(self, **kwargs): + super(PaddingTableImage, self).__init__() + + def __call__(self, data): + img = data['image'] + max_len = data['max_len'] + padding_img = np.zeros((max_len, max_len, 3), dtype=np.float32) + height, width = img.shape[0:2] + padding_img[0:height, 0:width, :] = img.copy() + data['image'] = padding_img + return data \ No newline at end of file diff --git a/batch_running_task/pytorchocr/data/imaug/operators.py b/batch_running_task/pytorchocr/data/imaug/operators.py new file mode 100644 index 0000000..daa67a2 --- /dev/null +++ b/batch_running_task/pytorchocr/data/imaug/operators.py @@ -0,0 +1,418 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class NRTRDecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + + img = cv2.imdecode(img, 1) + + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if self.channel_first: + img = img.transpose((2, 0, 1)) + data['image'] = img + return data + + +class NormalizeImage(object): + """ normalize image such as substract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale - self.mean) / self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + 
return data + + +class Fasttext(object): + def __init__(self, path="None", **kwargs): + import fasttext + self.fast_model = fasttext.load_model(path) + + def __call__(self, data): + label = data['label'] + fast_label = self.fast_model[label] + data['fast_label'] = fast_label + return data + + +class KeepKeys(object): + def __init__(self, keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + data_list.append(data[key]) + return data_list + + +class Resize(object): + def __init__(self, size=(640, 640), **kwargs): + self.size = size + + def resize_image(self, img): + resize_h, resize_w = self.size + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + return img, [ratio_h, ratio_w] + + def __call__(self, data): + img = data['image'] + text_polys = data['polys'] + + img_resize, [ratio_h, ratio_w] = self.resize_image(img) + new_boxes = [] + for box in text_polys: + new_box = [] + for cord in box: + new_box.append([cord[0] * ratio_w, cord[1] * ratio_h]) + new_boxes.append(new_box) + data['image'] = img_resize + data['polys'] = np.array(new_boxes, dtype=np.float32) + return data + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + super(DetResizeForTest, self).__init__() + self.resize_type = 0 + if 'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = self.limit_side_len + h, w, c = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'min': + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. 
+ elif self.limit_type == 'resize_long': + ratio = float(limit_side_len) / max(h, w) + else: + raise Exception('not support limit type, image ') + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = max(int(round(resize_h / 32) * 32), 32) + resize_w = max(int(round(resize_w / 32) * 32), 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] + + +class E2EResizeForTest(object): + def __init__(self, **kwargs): + super(E2EResizeForTest, self).__init__() + self.max_side_len = kwargs['max_side_len'] + self.valid_set = kwargs['valid_set'] + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if self.valid_set == 'totaltext': + im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext( + img, max_side_len=self.max_side_len) + else: + im_resized, (ratio_h, ratio_w) = self.resize_image( + img, max_side_len=self.max_side_len) + data['image'] = im_resized + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_for_totaltext(self, im, max_side_len=512): + + h, w, _ = im.shape + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + def resize_image(self, im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + +class KieResize(object): + def __init__(self, **kwargs): + super(KieResize, self).__init__() + self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[ + 'img_scale'][1] + + def __call__(self, data): + img = data['image'] + points = data['points'] 
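+        # KIE preprocessing: the image is resized with a preserved aspect ratio,
+        # padded into a fixed 1024x1024 canvas, and the annotation boxes are
+        # rescaled (and clipped) with the same factors below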
+ src_h, src_w, _ = img.shape + im_resized, scale_factor, [ratio_h, ratio_w + ], [new_h, new_w] = self.resize_image(img) + resize_points = self.resize_boxes(img, points, scale_factor) + data['ori_image'] = img + data['ori_boxes'] = points + data['points'] = resize_points + data['image'] = im_resized + data['shape'] = np.array([new_h, new_w]) + return data + + def resize_image(self, img): + norm_img = np.zeros([1024, 1024, 3], dtype='float32') + scale = [512, 1024] + h, w = img.shape[:2] + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float( + scale_factor) + 0.5) + max_stride = 32 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(img, (resize_w, resize_h)) + new_h, new_w = im.shape[:2] + w_scale = new_w / w + h_scale = new_h / h + scale_factor = np.array( + [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) + norm_img[:new_h, :new_w, :] = im + return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w] + + def resize_boxes(self, im, points, scale_factor): + points = points * scale_factor + img_shape = im.shape[:2] + points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1]) + points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0]) + return points diff --git a/batch_running_task/pytorchocr/modeling/__init__.py b/batch_running_task/pytorchocr/modeling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/batch_running_task/pytorchocr/modeling/architectures/__init__.py b/batch_running_task/pytorchocr/modeling/architectures/__init__.py new file mode 100644 index 0000000..a93052f --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/architectures/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +__all__ = ['build_model'] + + +def build_model(config, **kwargs): + from .base_model import BaseModel + + config = copy.deepcopy(config) + module_class = BaseModel(config, **kwargs) + return module_class \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/architectures/base_model.py b/batch_running_task/pytorchocr/modeling/architectures/base_model.py new file mode 100644 index 0000000..b0ba725 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/architectures/base_model.py @@ -0,0 +1,125 @@ +import os, sys +# import torch +import torch.nn as nn +# import torch.nn.functional as F +# from pytorchocr.modeling.common import Activation + +from pytorchocr.modeling.transforms import build_transform +from pytorchocr.modeling.backbones import build_backbone +from pytorchocr.modeling.necks import build_neck +from pytorchocr.modeling.heads import build_head + +class BaseModel(nn.Module): + def __init__(self, config, **kwargs): + """ + the module for OCR. 
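+        It chains the optional Transform -> Backbone -> Neck -> Head stages;
+        each stage is built from (and may be omitted in) the corresponding
+        config section, so det, rec and cls models share this one container.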
+        args:
+            config (dict): the hyperparameters used to build the module.
+        """
+        super(BaseModel, self).__init__()
+
+        in_channels = config.get('in_channels', 3)
+        model_type = config['model_type']
+        # build transform
+        # for rec, transform can be TPS or None
+        # for det and cls, transform should be None
+        # if you design the model differently, you can also use transform in det and cls
+        if 'Transform' not in config or config['Transform'] is None:
+            self.use_transform = False
+        else:
+            self.use_transform = True
+            config['Transform']['in_channels'] = in_channels
+            self.transform = build_transform(config['Transform'])
+            in_channels = self.transform.out_channels
+            # raise NotImplementedError
+
+        # build backbone; a backbone is needed for det, rec and cls
+        if 'Backbone' not in config or config['Backbone'] is None:
+            self.use_backbone = False
+        else:
+            self.use_backbone = True
+            config["Backbone"]['in_channels'] = in_channels
+            self.backbone = build_backbone(config["Backbone"], model_type)
+            in_channels = self.backbone.out_channels
+
+        # build neck
+        # for rec, neck can be cnn, rnn or reshape (None)
+        # for det, neck can be FPN, BIFPN and so on
+        # for cls, neck should be None
+        if 'Neck' not in config or config['Neck'] is None:
+            self.use_neck = False
+        else:
+            self.use_neck = True
+            config['Neck']['in_channels'] = in_channels
+            self.neck = build_neck(config['Neck'])
+            in_channels = self.neck.out_channels
+
+        # build head; a head is needed for det, rec and cls
+        if 'Head' not in config or config['Head'] is None:
+            self.use_head = False
+        else:
+            self.use_head = True
+            config["Head"]['in_channels'] = in_channels
+            self.head = build_head(config["Head"], **kwargs)
+
+        self.return_all_feats = config.get("return_all_feats", False)
+
+        self._initialize_weights()
+
+    def _initialize_weights(self):
+        # weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.ConvTranspose2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+
+    def forward(self, x):
+        y = dict()
+        if self.use_transform:
+            x = self.transform(x)
+        if self.use_backbone:
+            x = self.backbone(x)
+        if isinstance(x, dict):
+            y.update(x)
+        else:
+            y["backbone_out"] = x
+        final_name = "backbone_out"
+        if self.use_neck:
+            x = self.neck(x)
+            if isinstance(x, dict):
+                y.update(x)
+            else:
+                y["neck_out"] = x
+            final_name = "neck_out"
+        if self.use_head:
+            x = self.head(x)
+            # for multi head, save ctc neck out for udml
+            if isinstance(x, dict) and 'ctc_neck' in x.keys():
+                y['neck_out'] = x['ctc_neck']
+                y['head_out'] = x
+            elif isinstance(x, dict):
+                y.update(x)
+            else:
+                y["head_out"] = x
+        if self.return_all_feats:
+            if self.training:
+                return y
+            elif isinstance(x, dict):
+                return x
+            else:
+                return {final_name: x}
+        else:
+            return x
\ No newline at end of file
diff --git a/batch_running_task/pytorchocr/modeling/backbones/__init__.py b/batch_running_task/pytorchocr/modeling/backbones/__init__.py
new file mode 100644
index 0000000..7abdb0e
--- /dev/null
+++ b/batch_running_task/pytorchocr/modeling/backbones/__init__.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['build_backbone'] + + +def build_backbone(config, model_type): + if model_type == 'det': + from .det_mobilenet_v3 import MobileNetV3 + from .det_resnet import ResNet + from .det_resnet_vd import ResNet_vd + from .det_resnet_vd_sast import ResNet_SAST + from .rec_lcnetv3 import PPLCNetV3 + from .rec_hgnet import PPHGNet_small + support_dict = ['MobileNetV3', 'ResNet', 'ResNet_vd', 'ResNet_SAST', 'PPLCNetV3', 'PPHGNet_small'] + elif model_type == 'rec' or model_type == 'cls': + from .rec_mobilenet_v3 import MobileNetV3 + from .rec_resnet_vd import ResNet + from .rec_resnet_fpn import ResNetFPN + from .rec_mv1_enhance import MobileNetV1Enhance + from .rec_nrtr_mtb import MTB + from .rec_resnet_31 import ResNet31 + from .rec_svtrnet import SVTRNet + from .rec_vitstr import ViTSTR + from .rec_densenet import DenseNet + from .rec_lcnetv3 import PPLCNetV3 + from .rec_hgnet import PPHGNet_small + support_dict = ['MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB', + 'ResNet31', 'SVTRNet', 'ViTSTR', 'DenseNet', 'PPLCNetV3', 'PPHGNet_small'] + elif model_type == 'e2e': + from .e2e_resnet_vd_pg import ResNet + support_dict = ['ResNet'] + elif model_type == "table": + from .table_resnet_vd import ResNet + from .table_mobilenet_v3 import MobileNetV3 + support_dict = ["ResNet", "MobileNetV3"] + else: + raise NotImplementedError + + module_name = config.pop('name') + assert module_name in support_dict, Exception( + 'when model typs is {}, backbone only support {}'.format(model_type, + support_dict)) + module_class = eval(module_name)(**config) + return module_class \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/det_mobilenet_v3.py b/batch_running_task/pytorchocr/modeling/backbones/det_mobilenet_v3.py new file mode 100644 index 0000000..795f5d1 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/det_mobilenet_v3.py @@ -0,0 +1,256 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d( + out_channels, + ) + if self.if_act: + self.act = Activation(act_type=act, inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class SEModule(nn.Module): + def __init__(self, 
in_channels, reduction=4, name=""): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels=in_channels, + out_channels=in_channels // reduction, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.relu1 = Activation(act_type='relu', inplace=True) + self.conv2 = nn.Conv2d( + in_channels=in_channels // reduction, + out_channels=in_channels, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.hard_sigmoid = Activation(act_type='hard_sigmoid', inplace=True) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = self.relu1(outputs) + outputs = self.conv2(outputs) + outputs = self.hard_sigmoid(outputs) + outputs = inputs * outputs + return outputs + + +class ResidualUnit(nn.Module): + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size, + stride, + use_se, + act=None, + name=''): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_channels == out_channels + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=True, + act=act, + name=name + "_expand") + self.bottleneck_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + groups=mid_channels, + if_act=True, + act=act, + name=name + "_depthwise") + if self.if_se: + self.mid_se = SEModule(mid_channels, name=name + "_se") + self.linear_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name=name + "_linear") + + def forward(self, inputs): + x = self.expand_conv(inputs) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = inputs + x + return x + + +class MobileNetV3(nn.Module): + def __init__(self, + in_channels=3, + model_name='large', + scale=0.5, + disable_se=False, + **kwargs): + """ + the MobilenetV3 backbone network for detection module. 
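+        forward() returns one feature map per downsampling stage, so a
+        detection neck (e.g. a DB-style FPN) can fuse multi-scale features.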
+ Args: + params(dict): the super parameters for build network + """ + super(MobileNetV3, self).__init__() + + self.disable_se = disable_se + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', 1], + [3, 64, 24, False, 'relu', 2], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', 2], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hard_swish', 2], + [3, 200, 80, False, 'hard_swish', 1], + [3, 184, 80, False, 'hard_swish', 1], + [3, 184, 80, False, 'hard_swish', 1], + [3, 480, 112, True, 'hard_swish', 1], + [3, 672, 112, True, 'hard_swish', 1], + [5, 672, 160, True, 'hard_swish', 2], + [5, 960, 160, True, 'hard_swish', 1], + [5, 960, 160, True, 'hard_swish', 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', 2], + [3, 72, 24, False, 'relu', 2], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hard_swish', 2], + [5, 240, 40, True, 'hard_swish', 1], + [5, 240, 40, True, 'hard_swish', 1], + [5, 120, 48, True, 'hard_swish', 1], + [5, 144, 48, True, 'hard_swish', 1], + [5, 288, 96, True, 'hard_swish', 2], + [5, 576, 96, True, 'hard_swish', 1], + [5, 576, 96, True, 'hard_swish', 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError("mode[" + model_name + + "_model] is not implemented!") + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert scale in supported_scale, \ + "supported scale are {} but input scale is {}".format(supported_scale, scale) + inplanes = 16 + # conv1 + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act='hard_swish', + name='conv1') + + self.stages = nn.ModuleList() + self.out_channels = [] + block_list = [] + i = 0 + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in cfg: + se = se and not self.disable_se + if s == 2 and i > 2: + self.out_channels.append(inplanes) + self.stages.append(nn.Sequential(*block_list)) + block_list = [] + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl, + name="conv" + str(i + 2))) + inplanes = make_divisible(scale * c) + i += 1 + block_list.append( + ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act='hard_swish', + name='conv_last')) + self.stages.append(nn.Sequential(*block_list)) + self.out_channels.append(make_divisible(scale * cls_ch_squeeze)) + # for i, stage in enumerate(self.stages): + # self.add_sublayer(sublayer=stage, name="stage{}".format(i)) + + def forward(self, x): + x = self.conv(x) + out_list = [] + for stage in self.stages: + x = stage(x) + out_list.append(x) + return out_list diff --git a/batch_running_task/pytorchocr/modeling/backbones/det_resnet.py b/batch_running_task/pytorchocr/modeling/backbones/det_resnet.py new file mode 100644 index 0000000..f925d74 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/det_resnet.py @@ -0,0 +1,210 @@ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from .det_resnet_vd import DeformableConvV2, ConvBNLayer + + +class 
BottleneckBlock(nn.Module): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + is_dcn=False): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + act="relu", ) + self.conv1 = ConvBNLayer( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + stride=stride, + act="relu", + is_dcn=is_dcn, + # dcn_groups=1, + ) + self.conv2 = ConvBNLayer( + in_channels=num_filters, + out_channels=num_filters * 4, + kernel_size=1, + act=None, ) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters * 4, + kernel_size=1, + stride=stride, ) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = torch.add(short, conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Module): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=3, + stride=stride, + act="relu") + self.conv1 = ConvBNLayer( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + stride=stride) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Module): + def __init__(self, + in_channels=3, + layers=50, + out_indices=None, + dcn_stage=None): + super(ResNet, self).__init__() + + self.layers = layers + self.input_image_channel = in_channels + + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.dcn_stage = dcn_stage if dcn_stage is not None else [ + False, False, False, False + ] + self.out_indices = out_indices if out_indices is not None else [ + 0, 1, 2, 3 + ] + + self.conv = ConvBNLayer( + in_channels=self.input_image_channel, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", ) + self.pool2d_max = nn.MaxPool2d( + kernel_size=3, + stride=2, + padding=1, ) + + self.stages = nn.ModuleList() + self.out_channels = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + block_list = nn.Sequential() + is_dcn = self.dcn_stage[block] + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 
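+                        # stride 2 only at the first block of each stage after
+                        # stage 0, which keeps the resolution left by the stem
+                        # conv and max-pool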
and block != 0 else 1, + shortcut=shortcut, + is_dcn=is_dcn) + block_list.add_module(conv_name, bottleneck_block) + shortcut = True + if block in self.out_indices: + self.out_channels.append(num_filters[block] * 4) + self.stages.append(block_list) + else: + for block in range(len(depth)): + shortcut = False + block_list = nn.Sequential() + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut) + block_list.add_module(conv_name, basic_block) + shortcut = True + if block in self.out_indices: + self.out_channels.append(num_filters[block]) + self.stages.append(block_list) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + out = [] + for i, block in enumerate(self.stages): + y = block(y) + if i in self.out_indices: + out.append(y) + return out diff --git a/batch_running_task/pytorchocr/modeling/backbones/det_resnet_vd.py b/batch_running_task/pytorchocr/modeling/backbones/det_resnet_vd.py new file mode 100644 index 0000000..2330bc7 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/det_resnet_vd.py @@ -0,0 +1,360 @@ + + +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +import torchvision + +class DeformableConvV2(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + lr_scale=1, + regularizer=None, + skip_quant=False, + dcn_bias_regularizer=None, + dcn_bias_lr_scale=2.): + super(DeformableConvV2, self).__init__() + self.offset_channel = 2 * kernel_size**2 * groups + self.mask_channel = kernel_size**2 * groups + + if bias_attr: + # in FCOS-DCN head, specifically need learning_rate and regularizer + dcn_bias_attr = True + else: + # in ResNet backbone, do not need bias + dcn_bias_attr = False + self.conv_dcn = torchvision.ops.DeformConv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 * dilation, + dilation=dilation, + groups=groups//2 if groups > 1 else 1, + bias=dcn_bias_attr) + + self.conv_offset = nn.Conv2d( + in_channels, + groups * 3 * kernel_size**2, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + bias=True) + if skip_quant: + self.conv_offset.skip_quant = True + + def forward(self, x): + offset_mask = self.conv_offset(x) + offset, mask = torch.split( + offset_mask, + split_size_or_sections=[self.offset_channel, self.mask_channel], + dim=1) + mask = torch.sigmoid(mask) + y = self.conv_dcn(x, offset, mask=mask) + return y + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + dcn_groups=1, + is_vd_mode=False, + act=None, + name=None, + is_dcn=False, + ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self.act = act + self._pool2d_avg = nn.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + if not is_dcn: + self._conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + else: + self._conv = DeformableConvV2( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + 
padding=(kernel_size - 1) // 2, + groups=dcn_groups, + bias_attr=False) + + self._batch_norm = nn.BatchNorm2d( + out_channels, + track_running_stats=True, + ) + + if act is not None: + self._act = Activation(act_type=act, inplace=True) + + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class BottleneckBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None, + is_dcn=False, + ): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b", + is_dcn=is_dcn, + dcn_groups=2, + ) + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = short + conv1 + y = F.relu(y) + return y + + +class ResNet_vd(nn.Module): + def __init__(self, + in_channels=3, + layers=50, + dcn_stage=None, + out_indices=None, + **kwargs): + super(ResNet_vd, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.dcn_stage = dcn_stage if dcn_stage is not None else [ + False, False, False, False + ] + self.out_indices = out_indices if out_indices is not None else [ + 0, 1, 2, 3 + ] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + 
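+            # ResNet-vd deep stem: three stacked 3x3 convs (conv1_1..conv1_3)
+            # replace the single 7x7 stem conv of the vanilla ResNet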
stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.stages = nn.ModuleList() + self.out_channels = [] + if layers >= 50: + for block in range(len(depth)): + # block_list = [] + block_list = nn.Sequential() + shortcut = False + is_dcn = self.dcn_stage[block] + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name, + is_dcn=is_dcn, + ) + + shortcut = True + block_list.add_module('bb_%d_%d' % (block, i), bottleneck_block) + if block in self.out_indices: + self.out_channels.append(num_filters[block] * 4) + # self.stages.append(nn.Sequential(*block_list)) + self.stages.append(block_list) + else: + for block in range(len(depth)): + # block_list = [] + block_list = nn.Sequential() + shortcut = False + # is_dcn = self.dcn_stage[block] + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name) + + shortcut = True + block_list.add_module('bb_%d_%d' % (block, i), basic_block) + # block_list.append(basic_block) + if block in self.out_indices: + self.out_channels.append(num_filters[block]) + self.stages.append(block_list) + + # self.stages.append(nn.Sequential(*block_list)) + + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + out = [] + for i, block in enumerate(self.stages): + y = block(y) + if i in self.out_indices: + out.append(y) + return out \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/det_resnet_vd_sast.py b/batch_running_task/pytorchocr/modeling/backbones/det_resnet_vd_sast.py new file mode 100644 index 0000000..0f49643 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/det_resnet_vd_sast.py @@ -0,0 +1,279 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +# import paddle +# from paddle import ParamAttr +# import paddle.nn as nn +# import paddle.nn.functional as F + +__all__ = ["ResNet_SAST"] + + +class ConvBNLayer(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + 
stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm2d( + out_channels,) + self.act = act + if act is not None: + self._act = Activation(act_type=act) + + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act: + y = self._act(y) + return y + + +class BottleneckBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv1) + y = F.relu(y) + return y + + +class ResNet_SAST(nn.Module): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet_SAST, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + # depth = [3, 4, 6, 3] + depth = [3, 4, 6, 3, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + # num_channels = [64, 256, 512, + # 1024] if layers >= 50 else [64, 64, 128, 256] + # num_filters = [64, 128, 256, 512] + num_channels = [64, 256, 512, + 1024, 2048] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = 
ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + # self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.stages = nn.ModuleList() + self.out_channels = [3, 64] + if layers >= 50: + for block in range(len(depth)): + # block_list = [] + block_list = nn.Sequential() + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = BottleneckBlock( + in_channels=num_channels[block] if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name + ) + shortcut = True + # block_list.append(bottleneck_block) + block_list.add_module('bb_%d_%d' % (block, i), bottleneck_block) + self.out_channels.append(num_filters[block] * 4) + # self.stages.append(nn.Sequential(*block_list)) + self.stages.append(block_list) + else: + for block in range(len(depth)): + # block_list = [] + block_list = nn.Sequential() + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name) + shortcut = True + # block_list.append(basic_block) + block_list.add_module('bb_%d_%d' % (block, i), basic_block) + self.out_channels.append(num_filters[block]) + # self.stages.append(nn.Sequential(*block_list)) + self.stages.append(block_list) + + def forward(self, inputs): + out = [inputs] + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + out.append(y) + y = self.pool2d_max(y) + for block in self.stages: + y = block(y) + out.append(y) + return out \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/e2e_resnet_vd_pg.py b/batch_running_task/pytorchocr/modeling/backbones/e2e_resnet_vd_pg.py new file mode 100644 index 0000000..d6ff30f --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/e2e_resnet_vd_pg.py @@ -0,0 +1,247 @@ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + +__all__ = ["ResNet"] + + +class ConvBNLayer(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm2d(out_channels) + self.act = act + if self.act is not None: + 
self._act = Activation(act_type=self.act, inplace=True) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class BottleneckBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Module): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + # depth = [3, 4, 6, 3] + depth = [3, 4, 6, 3, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024, + 2048] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=7, + stride=2, + act='relu', + name="conv1_1") + self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.stages = nn.ModuleList() + self.out_channels = [3, 64] + # num_filters = [64, 128, 256, 512, 512] + if layers >= 50: + for block in range(len(depth)): + block_list = nn.Sequential() + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = 
"res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneckBlock = BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name) + shortcut = True + block_list.add_module('bb_%d_%d' % (block, i), bottleneckBlock) + self.out_channels.append(num_filters[block] * 4) + self.stages.append(block_list) + else: + for block in range(len(depth)): + block_list = nn.Sequential() + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basicBlock = BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name) + shortcut = True + block_list.add_module('bb_%d_%d' % (block, i), basicBlock) + self.out_channels.append(num_filters[block]) + self.stages.append(block_list) + + + def forward(self, inputs): + out = [inputs] + y = self.conv1_1(inputs) + out.append(y) + y = self.pool2d_max(y) + for block in self.stages: + y = block(y) + out.append(y) + return out diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_densenet.py b/batch_running_task/pytorchocr/modeling/backbones/rec_densenet.py new file mode 100644 index 0000000..8c1989f --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_densenet.py @@ -0,0 +1,133 @@ +""" +This code is refer from: +https://github.com/LBH1024/CAN/models/densenet.py + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Bottleneck(nn.Module): + def __init__(self, nChannels, growthRate, use_dropout): + super(Bottleneck, self).__init__() + interChannels = 4 * growthRate + self.bn1 = nn.BatchNorm2d(interChannels) + self.conv1 = nn.Conv2d( + nChannels, interChannels, kernel_size=1, + bias=True) # Xavier initialization + self.bn2 = nn.BatchNorm2d(growthRate) + self.conv2 = nn.Conv2d( + interChannels, growthRate, kernel_size=3, padding=1, + bias=True) # Xavier initialization + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + if self.use_dropout: + out = self.dropout(out) + out = F.relu(self.bn2(self.conv2(out))) + if self.use_dropout: + out = self.dropout(out) + out = torch.cat([x, out], 1) + return out + + +class SingleLayer(nn.Module): + def __init__(self, nChannels, growthRate, use_dropout): + super(SingleLayer, self).__init__() + self.bn1 = nn.BatchNorm2d(nChannels) + self.conv1 = nn.Conv2d( + nChannels, growthRate, kernel_size=3, padding=1, bias=False) + + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = self.conv1(F.relu(x)) + if self.use_dropout: + out = self.dropout(out) + + out = torch.cat([x, out], 1) + return out + + +class Transition(nn.Module): + def __init__(self, nChannels, out_channels, use_dropout): + super(Transition, self).__init__() + self.bn1 = nn.BatchNorm2d(out_channels) + self.conv1 = nn.Conv2d( + nChannels, out_channels, kernel_size=1, bias=False) + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + if self.use_dropout: + out 
= self.dropout(out) + out = F.avg_pool2d(out, 2, ceil_mode=True, count_include_pad=False) + return out + + +class DenseNet(nn.Module): + def __init__(self, growthRate, reduction, bottleneck, use_dropout, + input_channel, **kwargs): + super(DenseNet, self).__init__() + + nDenseBlocks = 16 + nChannels = 2 * growthRate + + self.conv1 = nn.Conv2d( + input_channel, + nChannels, + kernel_size=7, + padding=3, + stride=2, + bias=False) + self.dense1 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + nChannels += nDenseBlocks * growthRate + out_channels = int(math.floor(nChannels * reduction)) + self.trans1 = Transition(nChannels, out_channels, use_dropout) + + nChannels = out_channels + self.dense2 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + nChannels += nDenseBlocks * growthRate + out_channels = int(math.floor(nChannels * reduction)) + self.trans2 = Transition(nChannels, out_channels, use_dropout) + + nChannels = out_channels + self.dense3 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + self.out_channels = out_channels + + def _make_dense(self, nChannels, growthRate, nDenseBlocks, bottleneck, + use_dropout): + layers = [] + for i in range(int(nDenseBlocks)): + if bottleneck: + layers.append(Bottleneck(nChannels, growthRate, use_dropout)) + else: + layers.append(SingleLayer(nChannels, growthRate, use_dropout)) + nChannels += growthRate + return nn.Sequential(*layers) + + def forward(self, inputs): + x, x_m, y = inputs + out = self.conv1(x) + out = F.relu(out, inplace=True) + out = F.max_pool2d(out, 2, ceil_mode=True) + out = self.dense1(out) + out = self.trans1(out) + out = self.dense2(out) + out = self.trans2(out) + out = self.dense3(out) + return out, x_m, y diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_hgnet.py b/batch_running_task/pytorchocr/modeling/backbones/rec_hgnet.py new file mode 100644 index 0000000..fecb1bc --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_hgnet.py @@ -0,0 +1,324 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ConvBNAct(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + if self.use_act: + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class ESEModule(nn.Module): + def __init__(self, channels): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv = nn.Conv2d( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv(x) + x = self.sigmoid(x) + return x * identity + + +class HG_Block(nn.Module): + def __init__( + self, + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False, ): + super().__init__() + self.identity = identity + + self.layers = nn.ModuleList() + self.layers.append( + ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=1)) + for _ in range(layer_num - 1): + self.layers.append( + ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + 
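+                    # every extra 3x3 layer refines the previous output; the
+                    # input and all intermediate maps are concatenated for the
+                    # aggregation conv below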
stride=1)) + + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels, + kernel_size=1, + stride=1) + self.att = ESEModule(out_channels) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = torch.cat(output, dim=1) + x = self.aggregation_conv(x) + x = self.att(x) + if self.identity: + x += identity + return x + + +class HG_Stage(nn.Module): + def __init__(self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample=True, + stride=[2, 1]): + super().__init__() + self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + groups=in_channels, + use_act=False) + + blocks_list = [] + blocks_list.append( + HG_Block( + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False)) + for _ in range(block_num - 1): + blocks_list.append( + HG_Block( + out_channels, + mid_channels, + out_channels, + layer_num, + identity=True)) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNet(nn.Module): + """ + PPHGNet + Args: + stem_channels: list. Stem channel list of PPHGNet. + stage_config: dict. The configuration of each stage of PPHGNet. such as the number of channels, stride, etc. + layer_num: int. Number of layers of HG_Block. + use_last_conv: boolean. Whether to use a 1x1 convolutional layer before the classification layer. + class_expand: int=2048. Number of channels for the last 1x1 convolutional layer. + dropout_prob: float. Parameters of dropout, 0.0 means dropout is not used. + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific PPHGNet model depends on args. 
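+    When det=True, the stage outputs selected by out_indices are returned as a
+    feature pyramid for detection; otherwise only the pooled last-stage feature
+    is returned for recognition.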
+ """ + + def __init__( + self, + stem_channels, + stage_config, + layer_num, + in_channels=3, + det=False, + out_indices=None): + super().__init__() + self.det = det + self.out_indices = out_indices if out_indices is not None else [ + 0, 1, 2, 3 + ] + + # stem + stem_channels.insert(0, in_channels) + self.stem = nn.Sequential(* [ + ConvBNAct( + in_channels=stem_channels[i], + out_channels=stem_channels[i + 1], + kernel_size=3, + stride=2 if i == 0 else 1) for i in range( + len(stem_channels) - 1) + ]) + + if self.det: + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + # stages + self.stages = nn.ModuleList() + self.out_channels = [] + for block_id, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, downsample, stride = stage_config[ + k] + self.stages.append( + HG_Stage(in_channels, mid_channels, out_channels, block_num, + layer_num, downsample, stride)) + if block_id in self.out_indices: + self.out_channels.append(out_channels) + + if not self.det: + self.out_channels = stage_config["stage4"][2] + + self._init_weights() + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + if self.det: + x = self.pool(x) + + out = [] + for i, stage in enumerate(self.stages): + x = stage(x) + if self.det and i in self.out_indices: + out.append(x) + if self.det: + return out + + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x + + +def PPHGNet_tiny(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNet_tiny + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_tiny` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [96, 96, 224, 1, False, [2, 1]], + "stage2": [224, 128, 448, 1, True, [1, 2]], + "stage3": [448, 160, 512, 2, True, [2, 1]], + "stage4": [512, 192, 768, 1, True, [2, 1]], + } + + model = PPHGNet( + stem_channels=[48, 48, 96], + stage_config=stage_config, + layer_num=5, + **kwargs) + return model + + +def PPHGNet_small(pretrained=False, use_ssld=False, det=False, **kwargs): + """ + PPHGNet_small + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_small` model depends on args. 
+ """ + stage_config_det = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [128, 128, 256, 1, False, 2], + "stage2": [256, 160, 512, 1, True, 2], + "stage3": [512, 192, 768, 2, True, 2], + "stage4": [768, 224, 1024, 1, True, 2], + } + + stage_config_rec = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [128, 128, 256, 1, True, [2, 1]], + "stage2": [256, 160, 512, 1, True, [1, 2]], + "stage3": [512, 192, 768, 2, True, [2, 1]], + "stage4": [768, 224, 1024, 1, True, [2, 1]], + } + + model = PPHGNet( + stem_channels=[64, 64, 128], + stage_config=stage_config_det if det else stage_config_rec, + layer_num=6, + det=det, + **kwargs) + return model + + +def PPHGNet_base(pretrained=False, use_ssld=True, **kwargs): + """ + PPHGNet_base + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_base` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [160, 192, 320, 1, False, [2, 1]], + "stage2": [320, 224, 640, 2, True, [1, 2]], + "stage3": [640, 256, 960, 3, True, [2, 1]], + "stage4": [960, 288, 1280, 2, True, [2, 1]], + } + + model = PPHGNet( + stem_channels=[96, 96, 160], + stage_config=stage_config, + layer_num=7, + **kwargs) + return model diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_lcnetv3.py b/batch_running_task/pytorchocr/modeling/backbones/rec_lcnetv3.py new file mode 100644 index 0000000..a25bfeb --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_lcnetv3.py @@ -0,0 +1,474 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + +# from paddle.nn.initializer import Constant, KaimingNormal +# from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Dropout, Hardsigmoid, Hardswish, Identity, Linear, ReLU +# from paddle.regularizer import L2Decay + +NET_CONFIG_det = { + "blocks2": + #k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True], + [5, 512, 512, 1, False], [5, 512, 512, 1, False]] +} + +NET_CONFIG_rec = { + "blocks2": + #k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, (1, 2), False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, (2, 1), True], [5, 512, 512, 1, True], + [5, 512, 512, (2, 1), False], [5, 512, 512, 1, False]] +} + + +def make_divisible(v, divisor=16, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class LearnableAffineBlock(nn.Module): + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.scale = nn.Parameter(torch.Tensor([scale_value])) + self.bias = nn.Parameter(torch.Tensor([bias_value])) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + lr_mult=1.0): + super().__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d( + out_channels, + ) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class Act(nn.Module): + def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1): + super().__init__() + if act == "hswish": + self.act = nn.Hardswish(inplace=True) + else: + assert act == "relu" + self.act = Activation(act) + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + return self.lab(self.act(x)) + + +class LearnableRepLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + num_conv_branches=1, + lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.is_repped = False + self.groups = groups + self.stride = stride + self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + self.num_conv_branches = num_conv_branches + self.padding = (kernel_size - 1) // 2 + + self.identity = nn.BatchNorm2d( + num_features=in_channels, + ) if out_channels == in_channels and stride == 1 else None + + self.conv_kxk = nn.ModuleList([ + ConvBNLayer( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + lr_mult=lr_mult) for _ in 
range(self.num_conv_branches) + ]) + + self.conv_1x1 = ConvBNLayer( + in_channels, + out_channels, + 1, + stride, + groups=groups, + lr_mult=lr_mult) if kernel_size > 1 else None + + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + # for export + if self.is_repped: + out = self.lab(self.reparam_conv(x)) + if self.stride != 2: + out = self.act(out) + return out + + out = 0 + if self.identity is not None: + out += self.identity(x) + + if self.conv_1x1 is not None: + out += self.conv_1x1(x) + + for conv in self.conv_kxk: + out += conv(x) + + out = self.lab(out) + if self.stride != 2: + out = self.act(out) + return out + + def rep(self): + if self.is_repped: + return + kernel, bias = self._get_kernel_bias() + self.reparam_conv = nn.Conv2d( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + groups=self.groups) + self.reparam_conv.weight.data = kernel + self.reparam_conv.bias.data = bias + self.is_repped = True + + def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad): + if not isinstance(kernel1x1, torch.Tensor): + return 0 + else: + return nn.functional.pad(kernel1x1, [pad, pad, pad, pad]) + + def _get_kernel_bias(self): + kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1) + kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk(kernel_conv_1x1, + self.kernel_size // 2) + + kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity) + + kernel_conv_kxk = 0 + bias_conv_kxk = 0 + for conv in self.conv_kxk: + kernel, bias = self._fuse_bn_tensor(conv) + kernel_conv_kxk += kernel + bias_conv_kxk += bias + + kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity + bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity + return kernel_reparam, bias_reparam + + def _fuse_bn_tensor(self, branch): + if not branch: + return 0, 0 + elif isinstance(branch, ConvBNLayer): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = torch.zeros( + (self.in_channels, input_dim, self.kernel_size, + self.kernel_size), + dtype=branch.weight.dtype) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, self.kernel_size // 2, + self.kernel_size // 2] = 1 + self.id_tensor = kernel_value + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=4, lr_mult=1.0): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + ) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + ) + self.hardsigmoid = nn.Hardsigmoid(inplace=True) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = identity * x + 
return x + + +class LCNetV3Block(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + dw_size, + use_se=False, + conv_kxk_num=4, + lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.use_se = use_se + self.dw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr) + if use_se: + self.se = SELayer(in_channels, lr_mult=lr_mult) + self.pw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class PPLCNetV3(nn.Module): + def __init__(self, + scale=1.0, + conv_kxk_num=4, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + lab_lr=0.1, + det=False, + **kwargs): + super().__init__() + self.scale = scale + self.lr_mult_list = lr_mult_list + self.det = det + + self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 6, "lr_mult_list length should be 6 but got {}".format( + len(self.lr_mult_list)) + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=make_divisible(16 * scale), + kernel_size=3, + stride=2, + lr_mult=self.lr_mult_list[0]) + + self.blocks2 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[1], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[2], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[3], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks4"]) + ]) + + self.blocks5 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[4], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks5"]) + ]) + + self.blocks6 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[5], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks6"]) + ]) + self.out_channels = make_divisible(512 * scale) + + if self.det: + mv_c = [16, 24, 56, 480] + self.out_channels = [ + make_divisible(self.net_config["blocks3"][-1][2] * scale), + make_divisible(self.net_config["blocks4"][-1][2] * scale), + 
make_divisible(self.net_config["blocks5"][-1][2] * scale), + make_divisible(self.net_config["blocks6"][-1][2] * scale), + ] + + self.layer_list = nn.ModuleList([ + nn.Conv2d(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0), + nn.Conv2d(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0), + nn.Conv2d(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0), + nn.Conv2d(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0) + ]) + self.out_channels = [ + int(mv_c[0] * scale), int(mv_c[1] * scale), + int(mv_c[2] * scale), int(mv_c[3] * scale) + ] + + def forward(self, x): + out_list = [] + x = self.conv1(x) + x = self.blocks2(x) + x = self.blocks3(x) + out_list.append(x) + x = self.blocks4(x) + out_list.append(x) + x = self.blocks5(x) + out_list.append(x) + x = self.blocks6(x) + out_list.append(x) + + if self.det: + out_list[0] = self.layer_list[0](out_list[0]) + out_list[1] = self.layer_list[1](out_list[1]) + out_list[2] = self.layer_list[2](out_list[2]) + out_list[3] = self.layer_list[3](out_list[3]) + return out_list + + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_lcnetv3_bak.py b/batch_running_task/pytorchocr/modeling/backbones/rec_lcnetv3_bak.py new file mode 100644 index 0000000..2c5d18b --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_lcnetv3_bak.py @@ -0,0 +1,444 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + +NET_CONFIG_det = { + "blocks2": + # k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True], + [5, 512, 512, 1, False], [5, 512, 512, 1, False]] +} + +NET_CONFIG_rec = { + "blocks2": + # k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, (1, 2), False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, (2, 1), True], [5, 512, 512, 1, True], + [5, 512, 512, (2, 1), False], [5, 512, 512, 1, False]] +} + + +def make_divisible(v, divisor=16, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class LearnableAffineBlock(nn.Module): + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.scale = nn.Parameter(torch.Tensor([scale_value])) + self.bias = nn.Parameter(torch.Tensor([bias_value])) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + lr_mult=1.0): + super().__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x 
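+
+# NOTE: the classes below (Act, LearnableRepLayer, SELayer, LCNetV3Block,
+# PPLCNetV3) are near-identical copies of those in rec_lcnetv3.py; this *_bak
+# module appears to be kept only as a backup/reference copy.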
+ + +class Act(nn.Module): + def __init__(self, act="hard_swish", lr_mult=1.0, lab_lr=0.1): + super().__init__() + assert act in ['hard_swish', 'relu'] + self.act = Activation(act) + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + return self.lab(self.act(x)) + + +class LearnableRepLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + num_conv_branches=1, + lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.is_repped = False + self.groups = groups + self.stride = stride + self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + self.num_conv_branches = num_conv_branches + self.padding = (kernel_size - 1) // 2 + self.identity = nn.BatchNorm2d(in_channels) if out_channels == in_channels and stride == 1 else None + + self.conv_kxk = nn.ModuleList([ + ConvBNLayer( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + lr_mult=lr_mult) for _ in range(self.num_conv_branches) + ]) + + self.conv_1x1 = ConvBNLayer( + in_channels, + out_channels, + 1, + stride, + groups=groups, + lr_mult=lr_mult) if kernel_size > 1 else None + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + # for export + if self.is_repped: + out = self.lab(self.reparam_conv(x)) + if self.stride != 2: + out = self.act(out) + return out + + out = 0 + if self.identity is not None: + out += self.identity(x) + + if self.conv_1x1 is not None: + out += self.conv_1x1(x) + + for conv in self.conv_kxk: + out += conv(x) + + out = self.lab(out) + if self.stride != 2: + out = self.act(out) + return out + + def rep(self): + if self.is_repped: + return + kernel, bias = self._get_kernel_bias() + self.reparam_conv = nn.Conv2d( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + groups=self.groups) + self.reparam_conv.weight.data = kernel + self.reparam_conv.bias.data = bias + self.is_repped = True + + def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad): + if not isinstance(kernel1x1, torch.Tensor): + return 0 + else: + return nn.functional.pad(kernel1x1, [pad, pad, pad, pad]) + + def _get_kernel_bias(self): + kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1) + kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk(kernel_conv_1x1, + self.kernel_size // 2) + + kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity) + + kernel_conv_kxk = 0 + bias_conv_kxk = 0 + for conv in self.conv_kxk: + kernel, bias = self._fuse_bn_tensor(conv) + kernel_conv_kxk += kernel + bias_conv_kxk += bias + + kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity + bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity + return kernel_reparam, bias_reparam + + def _fuse_bn_tensor(self, branch): + if not branch: + return 0, 0 + elif isinstance(branch, ConvBNLayer): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = torch.zeros( + (self.in_channels, input_dim, self.kernel_size, + self.kernel_size), + dtype=branch.weight.dtype) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 
self.kernel_size // 2, + self.kernel_size // 2] = 1 + self.id_tensor = kernel_value + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=4, lr_mult=1.0): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = Activation('hard_sigmoid') + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = x * identity + return x + + +class LCNetV3Block(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + dw_size, + use_se=False, + conv_kxk_num=4, + lr_mult=1.0, + lab_lr=0.1): + super().__init__() + self.use_se = use_se + self.dw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr) + if use_se: + self.se = SELayer(in_channels, lr_mult=lr_mult) + self.pw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class PPLCNetV3(nn.Module): + def __init__(self, + scale=1.0, + conv_kxk_num=4, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + lab_lr=0.1, + det=False, + **kwargs): + super().__init__() + self.scale = scale + self.lr_mult_list = lr_mult_list + self.det = det + + self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 6, "lr_mult_list length should be 6 but got {}".format( + len(self.lr_mult_list)) + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=make_divisible(16 * scale), + kernel_size=3, + stride=2, + lr_mult=self.lr_mult_list[0]) + + self.blocks2 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[1], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[2], + lab_lr=lab_lr) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(*[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[3], + lab_lr=lab_lr) + for i, (k, 
in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks4"])
+        ])
+
+        self.blocks5 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[4],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks5"])
+        ])
+
+        self.blocks6 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[5],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks6"])
+        ])
+        self.out_channels = make_divisible(512 * scale)
+
+        if self.det:
+            mv_c = [16, 24, 56, 480]
+            self.out_channels = [
+                make_divisible(self.net_config["blocks3"][-1][2] * scale),
+                make_divisible(self.net_config["blocks4"][-1][2] * scale),
+                make_divisible(self.net_config["blocks5"][-1][2] * scale),
+                make_divisible(self.net_config["blocks6"][-1][2] * scale),
+            ]
+
+            self.layer_list = nn.ModuleList([
+                nn.Conv2d(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0)
+            ])
+            self.out_channels = [
+                int(mv_c[0] * scale), int(mv_c[1] * scale),
+                int(mv_c[2] * scale), int(mv_c[3] * scale)
+            ]
+
+    def forward(self, x):
+        out_list = []
+        x = self.conv1(x)
+
+        x = self.blocks2(x)
+        x = self.blocks3(x)
+        out_list.append(x)
+        x = self.blocks4(x)
+        out_list.append(x)
+        x = self.blocks5(x)
+        out_list.append(x)
+        # import numpy as np  # debug-only
+        # x = torch.Tensor(np.load('../PaddleOCR4debug/tmp.npy'))  # leftover debug input override, disabled
+        x = self.blocks6(x)
+        out_list.append(x)
+
+        if self.det:
+            out_list[0] = self.layer_list[0](out_list[0])
+            out_list[1] = self.layer_list[1](out_list[1])
+            out_list[2] = self.layer_list[2](out_list[2])
+            out_list[3] = self.layer_list[3](out_list[3])
+            return out_list
+
+        if self.training:
+            x = F.adaptive_avg_pool2d(x, [1, 40])
+        else:
+            x = F.avg_pool2d(x, [3, 2])
+        return x
\ No newline at end of file
diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_mobilenet_v3.py b/batch_running_task/pytorchocr/modeling/backbones/rec_mobilenet_v3.py
new file mode 100644
index 0000000..52cc7a1
--- /dev/null
+++ b/batch_running_task/pytorchocr/modeling/backbones/rec_mobilenet_v3.py
@@ -0,0 +1,125 @@
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+
+from .det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible
+
+class MobileNetV3(nn.Module):
+    def __init__(self,
+                 in_channels=3,
+                 model_name='small',
+                 scale=0.5,
+                 large_stride=None,
+                 small_stride=None,
+                 **kwargs):
+        super(MobileNetV3, self).__init__()
+        if small_stride is None:
+            small_stride = [2, 2, 2, 2]
+        if large_stride is None:
+            large_stride = [1, 2, 2, 2]
+
+        assert isinstance(large_stride, list), "large_stride type must " \
+            "be list but got {}".format(type(large_stride))
+        assert isinstance(small_stride, list), "small_stride type must " \
+            "be list but got {}".format(type(small_stride))
+        assert len(large_stride) == 4, "large_stride length must be " \
+            "4 but got {}".format(len(large_stride))
+        assert len(small_stride) == 4, "small_stride length must be " \
+            "4 but got {}".format(len(small_stride))
+
+        if model_name ==
"large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', large_stride[0]], + [3, 64, 24, False, 'relu', (large_stride[1], 1)], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', (large_stride[2], 1)], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hard_swish', 1], + [3, 200, 80, False, 'hard_swish', 1], + [3, 184, 80, False, 'hard_swish', 1], + [3, 184, 80, False, 'hard_swish', 1], + [3, 480, 112, True, 'hard_swish', 1], + [3, 672, 112, True, 'hard_swish', 1], + [5, 672, 160, True, 'hard_swish', (large_stride[3], 1)], + [5, 960, 160, True, 'hard_swish', 1], + [5, 960, 160, True, 'hard_swish', 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', (small_stride[0], 1)], + [3, 72, 24, False, 'relu', (small_stride[1], 1)], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hard_swish', (small_stride[2], 1)], + [5, 240, 40, True, 'hard_swish', 1], + [5, 240, 40, True, 'hard_swish', 1], + [5, 120, 48, True, 'hard_swish', 1], + [5, 144, 48, True, 'hard_swish', 1], + [5, 288, 96, True, 'hard_swish', (small_stride[3], 1)], + [5, 576, 96, True, 'hard_swish', 1], + [5, 576, 96, True, 'hard_swish', 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError("mode[" + model_name + + "_model] is not implemented!") + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert scale in supported_scale, \ + "supported scales are {} but input scale is {}".format(supported_scale, scale) + + inplanes = 16 + # conv1 + self.conv1 = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act='hard_swish', + name='conv1') + i = 0 + block_list = [] + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in cfg: + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl, + name='conv' + str(i + 2))) + inplanes = make_divisible(scale * c) + i += 1 + self.blocks = nn.Sequential(*block_list) + + self.conv2 = ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act='hard_swish', + name='conv_last') + + self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + self.out_channels = make_divisible(scale * cls_ch_squeeze) + + def forward(self, x): + x = self.conv1(x) + x = self.blocks(x) + x = self.conv2(x) + x = self.pool(x) + return x \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_mv1_enhance.py b/batch_running_task/pytorchocr/modeling/backbones/rec_mv1_enhance.py new file mode 100644 index 0000000..b4f9643 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_mv1_enhance.py @@ -0,0 +1,233 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + + +class ConvBNLayer(nn.Module): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='hard_swish'): + super(ConvBNLayer, self).__init__() + self.act = act + self._conv = nn.Conv2d( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias=False) + + 
self._batch_norm = nn.BatchNorm2d( + num_filters, + ) + if self.act is not None: + self._act = Activation(act_type=act, inplace=True) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class DepthwiseSeparable(nn.Module): + def __init__(self, + num_channels, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + dw_size=3, + padding=1, + use_se=False): + super(DepthwiseSeparable, self).__init__() + self.use_se = use_se + self._depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=dw_size, + stride=stride, + padding=padding, + num_groups=int(num_groups * scale)) + if use_se: + self._se = SEModule(int(num_filters1 * scale)) + self._pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, inputs): + y = self._depthwise_conv(inputs) + if self.use_se: + y = self._se(y) + y = self._pointwise_conv(y) + return y + + +class MobileNetV1Enhance(nn.Module): + def __init__(self, + in_channels=3, + scale=0.5, + last_conv_stride=1, + last_pool_type='max', + **kwargs): + super().__init__() + self.scale = scale + self.block_list = [] + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + conv2_1 = DepthwiseSeparable( + num_channels=int(32 * scale), + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + self.block_list.append(conv2_1) + + conv2_2 = DepthwiseSeparable( + num_channels=int(64 * scale), + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=1, + scale=scale) + self.block_list.append(conv2_2) + + conv3_1 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + self.block_list.append(conv3_1) + + conv3_2 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=(2, 1), + scale=scale) + self.block_list.append(conv3_2) + + conv4_1 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + self.block_list.append(conv4_1) + + conv4_2 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=(2, 1), + scale=scale) + self.block_list.append(conv4_2) + + for _ in range(5): + conv5 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + dw_size=5, + padding=2, + scale=scale, + use_se=False) + self.block_list.append(conv5) + + conv5_6 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=(2, 1), + dw_size=5, + padding=2, + scale=scale, + use_se=True) + self.block_list.append(conv5_6) + + conv6 = DepthwiseSeparable( + num_channels=int(1024 * scale), + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=last_conv_stride, + dw_size=5, + padding=2, + use_se=True, + scale=scale) + self.block_list.append(conv6) + + self.block_list = nn.Sequential(*self.block_list) + if last_pool_type == 'avg': + self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0) + else: + self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + self.out_channels = int(1024 * 
scale) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.block_list(y) + y = self.pool(y) + return y + +def hardsigmoid(x): + return F.relu6(x + 3., inplace=True) / 6. + +class SEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.conv2 = nn.Conv2d( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + bias=True) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hardsigmoid(outputs) + x = torch.mul(inputs, outputs) + + return x diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_nrtr_mtb.py b/batch_running_task/pytorchocr/modeling/backbones/rec_nrtr_mtb.py new file mode 100644 index 0000000..33e375c --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_nrtr_mtb.py @@ -0,0 +1,36 @@ + +import torch +from torch import nn + + +class MTB(nn.Module): + def __init__(self, cnn_num, in_channels): + super(MTB, self).__init__() + self.block = nn.Sequential() + self.out_channels = in_channels + self.cnn_num = cnn_num + if self.cnn_num == 2: + for i in range(self.cnn_num): + self.block.add_module( + 'conv_{}'.format(i), + nn.Conv2d( + in_channels=in_channels + if i == 0 else 32 * (2**(i - 1)), + out_channels=32 * (2**i), + kernel_size=3, + stride=2, + padding=1)) + self.block.add_module('relu_{}'.format(i), nn.ReLU()) + self.block.add_module('bn_{}'.format(i), + nn.BatchNorm2d(32 * (2**i))) + + + def forward(self, images): + x = self.block(images) + if self.cnn_num == 2: + # (b, w, h, c) + x = x.permute(0, 3, 2, 1) + x_shape = x.shape + x = torch.reshape( + x, (x_shape[0], x_shape[1], x_shape[2] * x_shape[3])) + return x diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_31.py b/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_31.py new file mode 100644 index 0000000..ab7c30f --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_31.py @@ -0,0 +1,200 @@ +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +# import paddle +# from paddle import ParamAttr +# import paddle.nn as nn +# import paddle.nn.functional as F + + +__all__ = ["ResNet31"] + + +def conv3x3(in_channel, out_channel, stride=1): + return nn.Conv2d( + in_channel, + out_channel, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_channels, channels, stride=1, downsample=False): + super().__init__() + self.conv1 = conv3x3(in_channels, channels, stride) + self.bn1 = nn.BatchNorm2d(channels) + self.relu = nn.ReLU() + self.conv2 = conv3x3(channels, channels) + self.bn2 = nn.BatchNorm2d(channels) + self.downsample = downsample + if downsample: + self.downsample = nn.Sequential( + nn.Conv2d( + in_channels, + channels * self.expansion, + 1, + stride, + bias=False), + nn.BatchNorm2d(channels * self.expansion), ) 
+ else: + self.downsample = nn.Sequential() + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet31(nn.Module): + ''' + Args: + in_channels (int): Number of channels of input image tensor. + layers (list[int]): List of BasicBlock number for each stage. + channels (list[int]): List of out_channels of Conv2d layer. + out_indices (None | Sequence[int]): Indices of output stages. + last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage. + ''' + + def __init__(self, + in_channels=3, + layers=[1, 2, 5, 3], + channels=[64, 128, 256, 256, 512, 512, 512], + out_indices=None, + last_stage_pool=False): + super(ResNet31, self).__init__() + assert isinstance(in_channels, int) + assert isinstance(last_stage_pool, bool) + + self.out_indices = out_indices + self.last_stage_pool = last_stage_pool + + # conv 1 (Conv Conv) + self.conv1_1 = nn.Conv2d( + in_channels, channels[0], kernel_size=3, stride=1, padding=1) + self.bn1_1 = nn.BatchNorm2d(channels[0]) + self.relu1_1 = nn.ReLU(inplace=True) + + self.conv1_2 = nn.Conv2d( + channels[0], channels[1], kernel_size=3, stride=1, padding=1) + self.bn1_2 = nn.BatchNorm2d(channels[1]) + self.relu1_2 = nn.ReLU(inplace=True) + + # conv 2 (Max-pooling, Residual block, Conv) + self.pool2 = nn.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block2 = self._make_layer(channels[1], channels[2], layers[0]) + self.conv2 = nn.Conv2d( + channels[2], channels[2], kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2d(channels[2]) + self.relu2 = nn.ReLU(inplace=True) + + # conv 3 (Max-pooling, Residual block, Conv) + self.pool3 = nn.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block3 = self._make_layer(channels[2], channels[3], layers[1]) + self.conv3 = nn.Conv2d( + channels[3], channels[3], kernel_size=3, stride=1, padding=1) + self.bn3 = nn.BatchNorm2d(channels[3]) + self.relu3 = nn.ReLU(inplace=True) + + # conv 4 (Max-pooling, Residual block, Conv) + self.pool4 = nn.MaxPool2d( + kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True) + self.block4 = self._make_layer(channels[3], channels[4], layers[2]) + self.conv4 = nn.Conv2d( + channels[4], channels[4], kernel_size=3, stride=1, padding=1) + self.bn4 = nn.BatchNorm2d(channels[4]) + self.relu4 = nn.ReLU(inplace=True) + + # conv 5 ((Max-pooling), Residual block, Conv) + self.pool5 = None + if self.last_stage_pool: + self.pool5 = nn.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block5 = self._make_layer(channels[4], channels[5], layers[3]) + self.conv5 = nn.Conv2d( + channels[5], channels[5], kernel_size=3, stride=1, padding=1) + self.bn5 = nn.BatchNorm2d(channels[5]) + self.relu5 = nn.ReLU(inplace=True) + + self.out_channels = channels[-1] + + def _make_layer(self, input_channels, output_channels, blocks): + layers = [] + for _ in range(blocks): + downsample = None + if input_channels != output_channels: + downsample = nn.Sequential( + nn.Conv2d( + input_channels, + output_channels, + kernel_size=1, + stride=1, + bias=False), + nn.BatchNorm2d(output_channels), ) + + layers.append( + BasicBlock( + input_channels, output_channels, downsample=downsample)) + input_channels = output_channels + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1_1(x) + x 
= self.bn1_1(x) + x = self.relu1_1(x) + + x = self.conv1_2(x) + x = self.bn1_2(x) + x = self.relu1_2(x) + + outs = [] + for i in range(4): + layer_index = i + 2 + pool_layer = getattr(self, 'pool{}'.format(layer_index)) + block_layer = getattr(self, 'block{}'.format(layer_index)) + conv_layer = getattr(self, 'conv{}'.format(layer_index)) + bn_layer = getattr(self, 'bn{}'.format(layer_index)) + relu_layer = getattr(self, 'relu{}'.format(layer_index)) + + if pool_layer is not None: + x = pool_layer(x) + x = block_layer(x) + x = conv_layer(x) + x = bn_layer(x) + x = relu_layer(x) + + outs.append(x) + + if self.out_indices is not None: + return tuple([outs[i] for i in self.out_indices]) + + return x \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_fpn.py b/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_fpn.py new file mode 100644 index 0000000..e9bad7b --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_fpn.py @@ -0,0 +1,278 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, sys +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + +__all__ = ["ResNetFPN"] + +class ResNetFPN(nn.Module): + def __init__(self, in_channels=1, layers=50, **kwargs): + super(ResNetFPN, self).__init__() + supported_layers = { + 18: { + 'depth': [2, 2, 2, 2], + 'block_class': BasicBlock + }, + 34: { + 'depth': [3, 4, 6, 3], + 'block_class': BasicBlock + }, + 50: { + 'depth': [3, 4, 6, 3], + 'block_class': BottleneckBlock + }, + 101: { + 'depth': [3, 4, 23, 3], + 'block_class': BottleneckBlock + }, + 152: { + 'depth': [3, 8, 36, 3], + 'block_class': BottleneckBlock + } + } + stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)] + num_filters = [64, 128, 256, 512] + self.depth = supported_layers[layers]['depth'] + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1") + self.block_list = nn.ModuleList() + in_ch = 64 + if layers >= 50: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + + bottlenectBlock = BottleneckBlock( + in_channels=in_ch, + out_channels=num_filters[block], + stride=stride_list[block] if i == 0 else 1, + name=conv_name) + + in_ch = num_filters[block] * 4 + self.block_list.add_module("bottleneckBlock_{}_{}".format(block, i), bottlenectBlock) + + else: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + + basicBlock = BasicBlock( + in_channels=in_ch, + out_channels=num_filters[block], + stride=stride_list[block] if i == 0 else 1, + is_first=block == i == 0, + name=conv_name) + in_ch = basicBlock.out_channels + self.block_list.add_module(conv_name, basicBlock) + out_ch_list = [in_ch // 4, in_ch // 2, in_ch] + self.base_block = nn.ModuleList() + self.conv_trans = [] + self.bn_block = [] + for i in [-2, -3]: + in_channels = out_ch_list[i + 1] + out_ch_list[i] + bb_0 = nn.Conv2d( + in_channels=in_channels, + out_channels=out_ch_list[i], + kernel_size=1, + bias=True) + 
self.base_block.add_module("F_{}_base_block_0".format(i), bb_0) + bb_1 = nn.Conv2d( + in_channels=out_ch_list[i], + out_channels=out_ch_list[i], + kernel_size=3, + padding=1, + bias=True) + self.base_block.add_module("F_{}_base_block_1".format(i), bb_1) + bb_2 = nn.Sequential( + nn.BatchNorm2d(out_ch_list[i]), + Activation("relu") + ) + self.base_block.add_module("F_{}_base_block_2".format(i), bb_2) + + bb_3 = nn.Conv2d( + in_channels=out_ch_list[i], + out_channels=512, + kernel_size=1, + bias=True) + self.base_block.add_module("F_{}_base_block_3".format(i), bb_3) + self.out_channels = 512 + + def __call__(self, x): + x = self.conv(x) + fpn_list = [] + F = [] + for i in range(len(self.depth)): + fpn_list.append(np.sum(self.depth[:i + 1])) + + for i, block in enumerate(self.block_list): + x = block(x) + for number in fpn_list: + if i + 1 == number: + F.append(x) + base = F[-1] + + j = 0 + for i, block in enumerate(self.base_block): + if i % 3 == 0 and i < 6: + j = j + 1 + b, c, w, h = F[-j - 1].shape + if [w, h] == list(base.shape[2:]): + base = base + else: + base = self.conv_trans[j - 1](base) + base = self.bn_block[j - 1](base) + base = torch.cat([base, F[-j - 1]], dim=1) + base = block(base) + return base + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=2 if stride == (1, 1) else kernel_size, + dilation=2 if stride == (1, 1) else 1, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False, ) + + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + self.bn = nn.BatchNorm2d(out_channels) + self.act = act + if self.act is not None: + self._act = Activation(act_type=self.act, inplace=True) + + def __call__(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class ShortCut(nn.Module): + def __init__(self, in_channels, out_channels, stride, name, is_first=False): + super(ShortCut, self).__init__() + self.use_conv = True + + if in_channels != out_channels or stride != 1 or is_first == True: + if stride == (1, 1): + self.conv = ConvBNLayer( + in_channels, out_channels, 1, 1, name=name) + else: # stride==(2,2) + self.conv = ConvBNLayer( + in_channels, out_channels, 1, stride, name=name) + else: + self.use_conv = False + + def forward(self, x): + if self.use_conv: + x = self.conv(x) + return x + + +class BottleneckBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, name): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels * 4, + stride=stride, + is_first=False, + name=name + "_branch1") + self.out_channels = out_channels * 4 + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = self.conv2(y) + y = y + self.short(x) + y = F.relu(y) + return y + + +class BasicBlock(nn.Module): + def __init__(self, 
in_channels, out_channels, stride, name, is_first): + super(BasicBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + act='relu', + stride=stride, + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels, + stride=stride, + is_first=is_first, + name=name + "_branch1") + self.out_channels = out_channels + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = y + self.short(x) + return F.relu(y) diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_vd.py b/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_vd.py new file mode 100644 index 0000000..b8a2ed3 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_resnet_vd.py @@ -0,0 +1,260 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + +class ConvBNLayer(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + self.act = act + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2d( + kernel_size=stride, stride=stride, padding=0, ceil_mode=True) + + self._conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1 if is_vd_mode else stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + + self._batch_norm = nn.BatchNorm2d( + out_channels,) + if self.act is not None: + self._act = Activation(act_type=act, inplace=True) + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class BottleneckBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + is_vd_mode=not if_first and stride[0] != 1, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = short + conv2 + y = F.relu(y) + return y + + +class BasicBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, 
+ out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + is_vd_mode=not if_first and stride[0] != 1, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = short + conv1 + y = F.relu(y) + return y + + +class ResNet(nn.Module): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # self.block_list = list() + self.block_list = nn.Sequential() + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + + bottleneck_block = BottleneckBlock(in_channels=num_channels[block] if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=stride, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name) + shortcut = True + # self.block_list.append(bottleneck_block) + self.block_list.add_module('bb_%d_%d' % (block, i), bottleneck_block) + self.out_channels = num_filters[block] + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + + basic_block = BasicBlock(in_channels=num_channels[block] if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=stride, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name) + + shortcut = True + # self.block_list.append(basic_block) + self.block_list.add_module('bb_%d_%d' % (block, i), basic_block) + self.out_channels = num_filters[block] + self.out_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.out_pool(y) + + return y \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_svtrnet.py 
b/batch_running_task/pytorchocr/modeling/backbones/rec_svtrnet.py new file mode 100644 index 0000000..1ba3d4f --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_svtrnet.py @@ -0,0 +1,582 @@ +import torch +import torch.nn as nn +from pytorchocr.modeling.common import Activation +import numpy as np + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = torch.as_tensor(1 - drop_prob) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype) + random_tensor = torch.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act='gelu'): + super().__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias_attr) + self.norm = nn.BatchNorm2d(out_channels) + self.act = Activation(act_type=act, inplace=True) + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Module): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer='gelu', + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = Activation(act_type=act_layer, inplace=True) + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class ConvMixer(nn.Module): + def __init__( + self, + dim, + num_heads=8, + HW=[8, 25], + local_k=[3, 3], ): + super().__init__() + self.HW = HW + self.dim = dim + self.local_mixer = nn.Conv2d( + dim, + dim, + local_k, + 1, [local_k[0] // 2, local_k[1] // 2], + groups=num_heads, + ) + + def forward(self, x): + h = self.HW[0] + w = self.HW[1] + x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) + x = self.local_mixer(x) + x = x.flatten(2).permute(0, 2, 1) + return x + + +class Attention(nn.Module): + def __init__(self, + dim, + num_heads=8, + mixer='Global', + HW=[8, 25], + local_k=[7, 11], + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = 
nn.Dropout(proj_drop) + self.HW = HW + if HW is not None: + H = HW[0] + W = HW[1] + self.N = H * W + self.C = dim + if mixer == 'Local' and HW is not None: + hk = local_k[0] + wk = local_k[1] + mask = torch.ones(H * W, H + hk - 1, W + wk - 1, dtype=torch.float32) + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h:h + hk, w:w + wk] = 0. + mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // + 2].flatten(1) + mask_inf = torch.full([H * W, H * W], fill_value=float("-Inf"), dtype=torch.float32) + mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf) + self.mask = mask.unsqueeze(0).unsqueeze(1) + # self.mask = mask[None, None, :] + self.mixer = mixer + + def forward(self, x): + if self.HW is not None: + N = self.N + C = self.C + else: + _, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q.matmul(k.permute(0, 1, 3, 2))) + if self.mixer == 'Local': + attn += self.mask + attn = nn.functional.softmax(attn, dim=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).permute(0, 2, 1, 3).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__(self, + dim, + num_heads, + mixer='Global', + local_mixer=[7, 11], + HW=None, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer='gelu', + norm_layer='nn.LayerNorm', + epsilon=1e-6, + prenorm=True): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, eps=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == 'Global' or mixer == 'Local': + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + elif mixer == 'Conv': + self.mixer = ConvMixer( + dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, eps=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.prenorm = prenorm + + def forward(self, x): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=[32, 100], + in_channels=3, + embed_dim=768, + sub_num=2, + patch_size=[4, 4], + mode='pope', + ): + super().__init__() + num_patches = (img_size[1] // (2 ** sub_num)) * \ + (img_size[0] // (2 ** sub_num)) + self.img_size = img_size + self.num_patches = num_patches + self.embed_dim = embed_dim + self.norm = None + if mode == 'pope': + if sub_num == 2: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act='gelu', + bias_attr=True), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act='gelu', + bias_attr=True)) + if sub_num == 3: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, + act='gelu', + bias_attr=True), + ConvBNLayer( + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act='gelu', + bias_attr=True), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act='gelu', + bias_attr=True)) + elif mode == 'linear': + self.proj = nn.Conv2d( + 1, embed_dim, kernel_size=patch_size, stride=patch_size) + self.num_patches = img_size[0] // patch_size[0] * img_size[ + 1] // patch_size[1] + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + "Input image size ({}*{}) doesn't match model ({}*{}).".format( + H,W,self.img_size[0],self.img_size[1] + ) + x = self.proj(x).flatten(2).permute(0, 2, 1) + return x + + +class SubSample(nn.Module): + def __init__(self, + in_channels, + out_channels, + types='Pool', + stride=[2, 1], + sub_norm='nn.LayerNorm', + act=None): + super().__init__() + self.types = types + if types == 'Pool': + self.avgpool = nn.AvgPool2d( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.maxpool = nn.MaxPool2d( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.proj = nn.Linear(in_channels, out_channels) + else: + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + ) + self.norm = eval(sub_norm)(out_channels) + if act is not None: + self.act = act() + else: + self.act = None + + def forward(self, x): + + if self.types == 'Pool': + x1 = self.avgpool(x) + x2 = self.maxpool(x) + x = (x1 + x2) * 0.5 + out = self.proj(x.flatten(2).permute(0, 2, 1)) + else: + x = self.conv(x) + out = x.flatten(2).permute(0, 2, 1) + out = self.norm(out) + if self.act is not None: + out = self.act(out) + + return out + + +class SVTRNet(nn.Module): + def __init__( + self, + img_size=[32, 100], + in_channels=3, + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=['Local'] * 6 + ['Global'] * + 6, # Local 
atten, Global atten, Conv + local_mixer=[[7, 11], [7, 11], [7, 11]], + patch_merging='Conv', # Conv, Pool, None + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + last_drop=0.0, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + sub_norm='nn.LayerNorm', + epsilon=1e-6, + out_channels=192, + out_char_num=25, + block_unit='Block', + act='gelu', + last_stage=True, + sub_num=2, + prenorm=True, + use_lenhead=False, + **kwargs): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + self.out_channels = out_channels + self.prenorm = prenorm + patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim[0], + sub_num=sub_num) + num_patches = self.patch_embed.num_patches + self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0])) + self.pos_drop = nn.Dropout(p=drop_rate) + Block_unit = eval(block_unit) + + dpr = np.linspace(0, drop_path_rate, sum(depth)) + self.blocks1 = nn.ModuleList([ + Block_unit( + dim=embed_dim[0], + num_heads=num_heads[0], + mixer=mixer[0:depth[0]][i], + HW=self.HW, + local_mixer=local_mixer[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=act, + attn_drop=attn_drop_rate, + drop_path=dpr[0:depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[0]) + ]) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = patch_merging + self.blocks2 = nn.ModuleList([ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0]:depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=act, + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0]:depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[1]) + ]) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + self.blocks3 = nn.ModuleList([ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1]:][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=act, + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1]:][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[2]) + ]) + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2d([1, out_char_num]) + self.last_conv = nn.Conv2d( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False) + self.hardswish = Activation('hard_swish', inplace=True) #nn.Hardswish() + # self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") + self.dropout = nn.Dropout(p=last_drop) + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], eps=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + 
self.hardswish_len = Activation('hard_swish', inplace=True)# nn.Hardswish() + self.dropout_len = nn.Dropout( + p=last_drop) + + torch.nn.init.xavier_normal_(self.pos_embed) + self.apply(self._init_weights) + + def _init_weights(self, m): + # weight initialization + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + def forward_features(self, x): + x = self.patch_embed(x) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample1( + x.permute(0, 2, 1).reshape( + [-1, self.embed_dim[0], self.HW[0], self.HW[1]])) + for blk in self.blocks2: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample2( + x.permute(0, 2, 1).reshape( + [-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]])) + for blk in self.blocks3: + x = blk(x) + if not self.prenorm: + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + if self.use_lenhead: + len_x = self.len_conv(x.mean(1)) + len_x = self.dropout_len(self.hardswish_len(len_x)) + if self.last_stage: + if self.patch_merging is not None: + h = self.HW[0] // 4 + else: + h = self.HW[0] + x = self.avg_pool( + x.permute(0, 2, 1).reshape( + [-1, self.embed_dim[2], h, self.HW[1]])) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + if self.use_lenhead: + return x, len_x + return x \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/rec_vitstr.py b/batch_running_task/pytorchocr/modeling/backbones/rec_vitstr.py new file mode 100644 index 0000000..1e22501 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/rec_vitstr.py @@ -0,0 +1,120 @@ +""" +This code is refer from: +https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py +""" + +import numpy as np +import torch +import torch.nn as nn +from pytorchocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed + +# import paddle +# import paddle.nn as nn +# from ppocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed, zeros_, trunc_normal_, ones_ + +scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]} + + +class ViTSTR(nn.Module): + def __init__(self, + img_size=[224, 224], + in_channels=1, + scale='tiny', + seqlen=27, + patch_size=[16, 16], + embed_dim=None, + depth=12, + num_heads=None, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_path_rate=0., + drop_rate=0., + attn_drop_rate=0., + norm_layer='nn.LayerNorm', + act_layer='gelu', + epsilon=1e-6, + out_channels=None, + **kwargs): + super().__init__() + self.seqlen = seqlen + embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[ + scale][0] + num_heads = num_heads if num_heads is not None else scale_dim_heads[ + scale][1] + out_channels = out_channels if out_channels is not None else embed_dim + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim, + patch_size=patch_size, + mode='linear') + num_patches = 
self.patch_embed.num_patches + + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, depth) + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + epsilon=epsilon, + prenorm=False) for i in range(depth) + ]) + self.norm = eval(norm_layer)(embed_dim, eps=epsilon) + + self.out_channels = out_channels + + torch.nn.init.xavier_normal_(self.pos_embed) + torch.nn.init.xavier_normal_(self.cls_token) + self.apply(self._init_weights) + + def _init_weights(self, m): + # weight initialization + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + # cls_tokens = paddle.tile(self.cls_token, repeat_times=[B, 1, 1]) + cls_tokens = self.cls_token.repeat(B, 1, 1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = x[:, :self.seqlen] + return x.permute(0, 2, 1).unsqueeze(2) diff --git a/batch_running_task/pytorchocr/modeling/backbones/table_mobilenet_v3.py b/batch_running_task/pytorchocr/modeling/backbones/table_mobilenet_v3.py new file mode 100644 index 0000000..fa61e05 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/table_mobilenet_v3.py @@ -0,0 +1,270 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F + +__all__ = ['MobileNetV3'] + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + +def hard_sigmoid(x, slope=0.1666667, offset=0.5,): + return torch.clamp(slope * x + offset, 0., 1.) + +def hard_swish(x, inplace=True): + return x * F.relu6(x + 3., inplace=inplace) / 6. + +class MobileNetV3(nn.Module): + def __init__(self, + in_channels=3, + model_name='large', + scale=0.5, + disable_se=False, + **kwargs): + """ + the MobilenetV3 backbone network for detection module. 
+ Args: + params(dict): the super parameters for build network + """ + super(MobileNetV3, self).__init__() + + self.disable_se = disable_se + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', 1], + [3, 64, 24, False, 'relu', 2], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', 2], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hardswish', 2], + [3, 200, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 480, 112, True, 'hardswish', 1], + [3, 672, 112, True, 'hardswish', 1], + [5, 672, 160, True, 'hardswish', 2], + [5, 960, 160, True, 'hardswish', 1], + [5, 960, 160, True, 'hardswish', 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', 2], + [3, 72, 24, False, 'relu', 2], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hardswish', 2], + [5, 240, 40, True, 'hardswish', 1], + [5, 240, 40, True, 'hardswish', 1], + [5, 120, 48, True, 'hardswish', 1], + [5, 144, 48, True, 'hardswish', 1], + [5, 288, 96, True, 'hardswish', 2], + [5, 576, 96, True, 'hardswish', 1], + [5, 576, 96, True, 'hardswish', 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError("mode[" + model_name + + "_model] is not implemented!") + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert scale in supported_scale, \ + "supported scale are {} but input scale is {}".format(supported_scale, scale) + inplanes = 16 + # conv1 + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act='hardswish', + name='conv1') + + self.stages = nn.ModuleList() + self.out_channels = [] + block_list = [] + i = 0 + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in cfg: + se = se and not self.disable_se + start_idx = 2 if model_name == 'large' else 0 + if s == 2 and i > start_idx: + self.out_channels.append(inplanes) + self.stages.append(nn.Sequential(*block_list)) + block_list = [] + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl, + name="conv" + str(i + 2))) + inplanes = make_divisible(scale * c) + i += 1 + block_list.append( + ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act='hardswish', + name='conv_last')) + self.stages.append(nn.Sequential(*block_list)) + self.out_channels.append(make_divisible(scale * cls_ch_squeeze)) + # for i, stage in enumerate(self.stages): + # self.add_module(module=stage, name="stage{}".format(i)) + + def forward(self, x): + x = self.conv(x) + out_list = [] + for stage in self.stages: + x = stage(x) + out_list.append(x) + return out_list + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d( + out_channels, + ) + + def forward(self, x): + x = self.conv(x) + x = 
self.bn(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = hard_swish(x) + else: + print("The activation function({}) is selected incorrectly.". + format(self.act)) + exit() + return x + + +class ResidualUnit(nn.Module): + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size, + stride, + use_se, + act=None, + name=''): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_channels == out_channels + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=True, + act=act, + name=name + "_expand") + self.bottleneck_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + groups=mid_channels, + if_act=True, + act=act, + name=name + "_depthwise") + if self.if_se: + self.mid_se = SEModule(mid_channels, name=name + "_se") + self.linear_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name=name + "_linear") + + def forward(self, inputs): + x = self.expand_conv(inputs) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = torch.add(inputs, x) + return x + + +class SEModule(nn.Module): + def __init__(self, in_channels, reduction=4, name=""): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels=in_channels, + out_channels=in_channels // reduction, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.conv2 = nn.Conv2d( + in_channels=in_channels // reduction, + out_channels=in_channels, + kernel_size=1, + stride=1, + padding=0, + bias=True) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hard_sigmoid(outputs, slope=0.2, offset=0.5) + return inputs * outputs \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/backbones/table_resnet_vd.py b/batch_running_task/pytorchocr/modeling/backbones/table_resnet_vd.py new file mode 100644 index 0000000..77bfe4a --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/backbones/table_resnet_vd.py @@ -0,0 +1,269 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + +__all__ = ["ResNet"] + + +class ConvBNLayer(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm2d( + out_channels, + ) + self.act = act + if self.act is not None: + self._act = Activation(act, inplace=True) + + + def forward(self, inputs): + if self.is_vd_mode: + inputs = 
self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class BottleneckBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = torch.add(short, conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Module): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.stages = nn.ModuleList() + self.out_channels = [] + if layers >= 50: + for block in range(len(depth)): + block_list = nn.Sequential() + shortcut = False + for i in range(depth[block]): + if layers in 
[101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name + ) + shortcut = True + # block_list.append(bottleneck_block) + block_list.add_module('bb_%d_%d' % (block, i), bottleneck_block) + self.out_channels.append(num_filters[block] * 4) + # self.stages.append(nn.Sequential(*block_list)) + self.stages.append(block_list) + else: + for block in range(len(depth)): + block_list = nn.Sequential() + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name + ) + shortcut = True + # block_list.append(basic_block) + block_list.add_module('bb_%d_%d' % (block, i), basic_block) + self.out_channels.append(num_filters[block]) + # self.stages.append(nn.Sequential(*block_list)) + self.stages.append(block_list) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + out = [] + for block in self.stages: + y = block(y) + out.append(y) + return out \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/common.py b/batch_running_task/pytorchocr/modeling/common.py new file mode 100644 index 0000000..2d6e3d9 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/common.py @@ -0,0 +1,73 @@ + + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Hswish(nn.Module): + def __init__(self, inplace=True): + super(Hswish, self).__init__() + self.inplace = inplace + + def forward(self, x): + return x * F.relu6(x + 3., inplace=self.inplace) / 6. + +# out = max(0, min(1, slop*x+offset)) +# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None) +class Hsigmoid(nn.Module): + def __init__(self, inplace=True): + super(Hsigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + # torch: F.relu6(x + 3., inplace=self.inplace) / 6. + # paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6. + return F.relu6(1.2 * x + 3., inplace=self.inplace) / 6. 
+ +class GELU(nn.Module): + def __init__(self, inplace=True): + super(GELU, self).__init__() + self.inplace = inplace + + def forward(self, x): + return torch.nn.functional.gelu(x) + + +class Swish(nn.Module): + def __init__(self, inplace=True): + super(Swish, self).__init__() + self.inplace = inplace + + def forward(self, x): + if self.inplace: + x.mul_(torch.sigmoid(x)) + return x + else: + return x*torch.sigmoid(x) + + +class Activation(nn.Module): + def __init__(self, act_type, inplace=True): + super(Activation, self).__init__() + act_type = act_type.lower() + if act_type == 'relu': + self.act = nn.ReLU(inplace=inplace) + elif act_type == 'relu6': + self.act = nn.ReLU6(inplace=inplace) + elif act_type == 'sigmoid': + raise NotImplementedError + elif act_type == 'hard_sigmoid': + self.act = Hsigmoid(inplace)#nn.Hardsigmoid(inplace=inplace)#Hsigmoid(inplace)# + elif act_type == 'hard_swish' or act_type == 'hswish': + self.act = Hswish(inplace=inplace) + elif act_type == 'leakyrelu': + self.act = nn.LeakyReLU(inplace=inplace) + elif act_type == 'gelu': + self.act = GELU(inplace=inplace) + elif act_type == 'swish': + self.act = Swish(inplace=inplace) + else: + raise NotImplementedError + + def forward(self, inputs): + return self.act(inputs) \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/heads/__init__.py b/batch_running_task/pytorchocr/modeling/heads/__init__.py new file mode 100644 index 0000000..ca9fe89 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/__init__.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = ['build_head'] + + +def build_head(config, **kwargs): + # det head + from .det_db_head import DBHead, PFHeadLocal + from .det_east_head import EASTHead + from .det_sast_head import SASTHead + from .det_pse_head import PSEHead + from .det_fce_head import FCEHead + from .e2e_pg_head import PGHead + + # rec head + from .rec_ctc_head import CTCHead + from .rec_att_head import AttentionHead + from .rec_srn_head import SRNHead + from .rec_nrtr_head import Transformer + from .rec_sar_head import SARHead + from .rec_can_head import CANHead + from .rec_multi_head import MultiHead + + # cls head + from .cls_head import ClsHead + support_dict = [ + 'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead', + 'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead','SARHead', 'FCEHead', + 'CANHead', 'MultiHead', 'PFHeadLocal', + + ] + + from .table_att_head import TableAttentionHead + + module_name = config.pop('name') + assert module_name in support_dict, Exception('head only support {}'.format( + support_dict)) + module_class = eval(module_name)(**config, **kwargs) + return module_class \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/heads/cls_head.py b/batch_running_task/pytorchocr/modeling/heads/cls_head.py new file mode 100644 index 0000000..476a212 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/cls_head.py @@ -0,0 +1,28 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F + +class ClsHead(nn.Module): + """ + Class orientation + Args: + params(dict): super parameters for build Class network + """ + + def __init__(self, in_channels, class_dim, **kwargs): + super(ClsHead, self).__init__() + self.training = False + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Linear( + in_channels, + class_dim, + bias=True) + + def forward(self, x): + x = self.pool(x) + x = torch.reshape(x, shape=[x.shape[0], x.shape[1]]) + x = self.fc(x) + if not self.training: + x = F.softmax(x, dim=1) + return x \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/heads/det_db_head.py b/batch_running_task/pytorchocr/modeling/heads/det_db_head.py new file mode 100644 index 0000000..dbefbb4 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/det_db_head.py @@ -0,0 +1,121 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +from pytorchocr.modeling.backbones.det_mobilenet_v3 import ConvBNLayer + +class Head(nn.Module): + def __init__(self, in_channels, **kwargs): + super(Head, self).__init__() + self.conv1 = nn.Conv2d( + in_channels=in_channels, + out_channels=in_channels // 4, + kernel_size=3, + padding=1, + bias=False) + self.conv_bn1 = nn.BatchNorm2d( + in_channels // 4) + self.relu1 = Activation(act_type='relu') + + self.conv2 = nn.ConvTranspose2d( + in_channels=in_channels // 4, + out_channels=in_channels // 4, + kernel_size=2, + stride=2) + self.conv_bn2 = nn.BatchNorm2d( + in_channels // 4) + self.relu2 = Activation(act_type='relu') + + self.conv3 = nn.ConvTranspose2d( + in_channels=in_channels // 4, + out_channels=1, + kernel_size=2, + stride=2) + + def forward(self, x, return_f=False): + x = self.conv1(x) + x = self.conv_bn1(x) + x = self.relu1(x) + x = self.conv2(x) + x = self.conv_bn2(x) + x = self.relu2(x) + if return_f is True: + f = x + x = self.conv3(x) + x = torch.sigmoid(x) + if return_f is True: + return x, f + return x + + +class DBHead(nn.Module): + """ + 
Differentiable Binarization (DB) for text detection: + see https://arxiv.org/abs/1911.08947 + args: + params(dict): super parameters for build DB network + """ + + def __init__(self, in_channels, k=50, **kwargs): + super(DBHead, self).__init__() + self.k = k + binarize_name_list = [ + 'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48', + 'conv2d_transpose_1', 'binarize' + ] + thresh_name_list = [ + 'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50', + 'conv2d_transpose_3', 'thresh' + ] + self.binarize = Head(in_channels, **kwargs)# binarize_name_list) + self.thresh = Head(in_channels, **kwargs)#thresh_name_list) + + def step_function(self, x, y): + return torch.reciprocal(1 + torch.exp(-self.k * (x - y))) + + def forward(self, x): + shrink_maps = self.binarize(x) + if not self.training: + return {'maps': shrink_maps} + + threshold_maps = self.thresh(x) + binary_maps = self.step_function(shrink_maps, threshold_maps) + y = torch.cat([shrink_maps, threshold_maps, binary_maps], dim=1) + return {'maps': y} + +class LocalModule(nn.Module): + def __init__(self, in_c, mid_c, use_distance=True): + super(self.__class__, self).__init__() + self.last_3 = ConvBNLayer(in_c + 1, mid_c, 3, 1, 1, act='relu') + self.last_1 = nn.Conv2d(mid_c, 1, 1, 1, 0) + + def forward(self, x, init_map, distance_map): + outf = torch.cat([init_map, x], dim=1) + # last Conv + out = self.last_1(self.last_3(outf)) + return out + +class PFHeadLocal(DBHead): + def __init__(self, in_channels, k=50, mode='small', **kwargs): + super(PFHeadLocal, self).__init__(in_channels, k, **kwargs) + self.mode = mode + + self.up_conv = nn.Upsample(scale_factor=2, mode="nearest") + if self.mode == 'large': + self.cbn_layer = LocalModule(in_channels // 4, in_channels // 4) + elif self.mode == 'small': + self.cbn_layer = LocalModule(in_channels // 4, in_channels // 8) + + def forward(self, x, targets=None): + shrink_maps, f = self.binarize(x, return_f=True) + base_maps = shrink_maps + cbn_maps = self.cbn_layer(self.up_conv(f), shrink_maps, None) + cbn_maps = F.sigmoid(cbn_maps) + if not self.training: + return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps} + + threshold_maps = self.thresh(x) + binary_maps = self.step_function(shrink_maps, threshold_maps) + y = torch.cat([cbn_maps, threshold_maps, binary_maps], dim=1) + return {'maps': y, 'distance_maps': cbn_maps, 'cbn_maps': binary_maps} \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/heads/det_east_head.py b/batch_running_task/pytorchocr/modeling/heads/det_east_head.py new file mode 100644 index 0000000..0df9c48 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/det_east_head.py @@ -0,0 +1,112 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +# import paddle +# from paddle import nn +# import paddle.nn.functional as F +# from paddle import ParamAttr + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + + self.bn = 
nn.BatchNorm2d( + out_channels,) + self.act = act + if act is not None: + self._act = Activation(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class EASTHead(nn.Module): + """ + """ + def __init__(self, in_channels, model_name, **kwargs): + super(EASTHead, self).__init__() + self.model_name = model_name + if self.model_name == "large": + num_outputs = [128, 64, 1, 8] + else: + num_outputs = [64, 32, 1, 8] + + self.det_conv1 = ConvBNLayer( + in_channels=in_channels, + out_channels=num_outputs[0], + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="det_head1") + self.det_conv2 = ConvBNLayer( + in_channels=num_outputs[0], + out_channels=num_outputs[1], + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="det_head2") + self.score_conv = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[2], + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name="f_score") + self.geo_conv = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[3], + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name="f_geo") + + def forward(self, x): + f_det = self.det_conv1(x) + f_det = self.det_conv2(f_det) + f_score = self.score_conv(f_det) + f_score = torch.sigmoid(f_score) + f_geo = self.geo_conv(f_det) + f_geo = (torch.sigmoid(f_geo) - 0.5) * 2 * 800 + + pred = {'f_score': f_score, 'f_geo': f_geo} + return pred \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/heads/det_fce_head.py b/batch_running_task/pytorchocr/modeling/heads/det_fce_head.py new file mode 100644 index 0000000..22773c9 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/det_fce_head.py @@ -0,0 +1,82 @@ +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# from paddle import nn +# from paddle import ParamAttr +# import paddle.nn.functional as F +# from paddle.nn.initializer import Normal +# import paddle +from functools import partial + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +class FCEHead(nn.Module): + """The class for implementing FCENet head. + FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text + Detection. + + [https://arxiv.org/abs/2104.10442] + + Args: + in_channels (int): The number of input channels. + scales (list[int]) : The scale of each layer. + fourier_degree (int) : The maximum Fourier transform degree k. 
+ """ + + def __init__(self, in_channels, fourier_degree=5): + super().__init__() + assert isinstance(in_channels, int) + + self.downsample_ratio = 1.0 + self.in_channels = in_channels + self.fourier_degree = fourier_degree + self.out_channels_cls = 4 + self.out_channels_reg = (2 * self.fourier_degree + 1) * 2 + + self.out_conv_cls = nn.Conv2d( + in_channels=self.in_channels, + out_channels=self.out_channels_cls, + kernel_size=3, + stride=1, + padding=1, + groups=1, + bias=True) + self.out_conv_reg = nn.Conv2d( + in_channels=self.in_channels, + out_channels=self.out_channels_reg, + kernel_size=3, + stride=1, + padding=1, + groups=1, + bias=True) + + def forward(self, feats, targets=None): + cls_res, reg_res = multi_apply(self.forward_single, feats) + level_num = len(cls_res) + outs = {} + if not self.training: + for i in range(level_num): + tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], dim=1) + tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], dim=1) + outs['level_{}'.format(i)] = torch.cat( + [tr_pred, tcl_pred, reg_res[i]], dim=1) + else: + preds = [[cls_res[i], reg_res[i]] for i in range(level_num)] + outs['levels'] = preds + return outs + + def forward_single(self, x): + cls_predict = self.out_conv_cls(x) + reg_predict = self.out_conv_reg(x) + return cls_predict, reg_predict diff --git a/batch_running_task/pytorchocr/modeling/heads/det_pse_head.py b/batch_running_task/pytorchocr/modeling/heads/det_pse_head.py new file mode 100644 index 0000000..c032212 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/det_pse_head.py @@ -0,0 +1,25 @@ +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +# from paddle import nn +from torch import nn + + +class PSEHead(nn.Module): + def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs): + super(PSEHead, self).__init__() + self.conv1 = nn.Conv2d( + in_channels, hidden_dim, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2d(hidden_dim) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv2d( + hidden_dim, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x, **kwargs): + out = self.conv1(x) + out = self.relu1(self.bn1(out)) + out = self.conv2(out) + return {'maps': out} diff --git a/batch_running_task/pytorchocr/modeling/heads/det_sast_head.py b/batch_running_task/pytorchocr/modeling/heads/det_sast_head.py new file mode 100644 index 0000000..5b104e6 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/det_sast_head.py @@ -0,0 +1,118 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +# import paddle +# from paddle import nn +# import paddle.nn.functional as F +# from paddle import ParamAttr + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d( + out_channels,) + self.act = act + if act is not None: + self._act = Activation(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + x = 
self._act(x) + return x + + +class SAST_Header1(nn.Module): + def __init__(self, in_channels, **kwargs): + super(SAST_Header1, self).__init__() + out_channels = [64, 64, 128] + self.score_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_score1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_score2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_score3'), + ConvBNLayer(out_channels[2], 1, 3, 1, act=None, name='f_score4') + ) + self.border_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_border1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_border2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_border3'), + ConvBNLayer(out_channels[2], 4, 3, 1, act=None, name='f_border4') + ) + + def forward(self, x): + f_score = self.score_conv(x) + f_score = torch.sigmoid(f_score) + f_border = self.border_conv(x) + return f_score, f_border + + +class SAST_Header2(nn.Module): + def __init__(self, in_channels, **kwargs): + super(SAST_Header2, self).__init__() + out_channels = [64, 64, 128] + self.tvo_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tvo1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tvo2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tvo3'), + ConvBNLayer(out_channels[2], 8, 3, 1, act=None, name='f_tvo4') + ) + self.tco_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tco1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tco2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tco3'), + ConvBNLayer(out_channels[2], 2, 3, 1, act=None, name='f_tco4') + ) + + def forward(self, x): + f_tvo = self.tvo_conv(x) + f_tco = self.tco_conv(x) + return f_tvo, f_tco + + +class SASTHead(nn.Module): + """ + """ + def __init__(self, in_channels, **kwargs): + super(SASTHead, self).__init__() + + self.head1 = SAST_Header1(in_channels) + self.head2 = SAST_Header2(in_channels) + + def forward(self, x): + f_score, f_border = self.head1(x) + f_tvo, f_tco = self.head2(x) + + predicts = {} + predicts['f_score'] = f_score + predicts['f_border'] = f_border + predicts['f_tvo'] = f_tvo + predicts['f_tco'] = f_tco + return predicts \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/heads/e2e_pg_head.py b/batch_running_task/pytorchocr/modeling/heads/e2e_pg_head.py new file mode 100644 index 0000000..e3e2130 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/e2e_pg_head.py @@ -0,0 +1,234 @@ + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d(out_channels) + self.act = act + if self.act is not None: + self._act = 
Activation(act_type=self.act, inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class PGHead(nn.Module): + """ + """ + + def __init__(self, in_channels, **kwargs): + super(PGHead, self).__init__() + self.conv_f_score1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_score{}".format(1)) + self.conv_f_score2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_score{}".format(2)) + self.conv_f_score3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_score{}".format(3)) + + self.conv1 = nn.Conv2d( + in_channels=128, + out_channels=1, + kernel_size=3, + stride=1, + padding=1, + groups=1, + bias=False) + + self.conv_f_boder1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_boder{}".format(1)) + self.conv_f_boder2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_boder{}".format(2)) + self.conv_f_boder3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_boder{}".format(3)) + self.conv2 = nn.Conv2d( + in_channels=128, + out_channels=4, + kernel_size=3, + stride=1, + padding=1, + groups=1, + bias=False) + self.conv_f_char1 = ConvBNLayer( + in_channels=in_channels, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(1)) + self.conv_f_char2 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_char{}".format(2)) + self.conv_f_char3 = ConvBNLayer( + in_channels=128, + out_channels=256, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(3)) + self.conv_f_char4 = ConvBNLayer( + in_channels=256, + out_channels=256, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_char{}".format(4)) + self.conv_f_char5 = ConvBNLayer( + in_channels=256, + out_channels=256, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(5)) + self.conv3 = nn.Conv2d( + in_channels=256, + out_channels=37, + kernel_size=3, + stride=1, + padding=1, + groups=1, + bias=False) + + self.conv_f_direc1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_direc{}".format(1)) + self.conv_f_direc2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_direc{}".format(2)) + self.conv_f_direc3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_direc{}".format(3)) + self.conv4 = nn.Conv2d( + in_channels=128, + out_channels=2, + kernel_size=3, + stride=1, + padding=1, + groups=1, + bias=False) + + def forward(self, x): + f_score = self.conv_f_score1(x) + f_score = self.conv_f_score2(f_score) + f_score = self.conv_f_score3(f_score) + f_score = self.conv1(f_score) + f_score = torch.sigmoid(f_score) + + # f_border + f_border = self.conv_f_boder1(x) + f_border = self.conv_f_boder2(f_border) + f_border = self.conv_f_boder3(f_border) + f_border = self.conv2(f_border) + + f_char = 
self.conv_f_char1(x) + f_char = self.conv_f_char2(f_char) + f_char = self.conv_f_char3(f_char) + f_char = self.conv_f_char4(f_char) + f_char = self.conv_f_char5(f_char) + f_char = self.conv3(f_char) + + f_direction = self.conv_f_direc1(x) + f_direction = self.conv_f_direc2(f_direction) + f_direction = self.conv_f_direc3(f_direction) + f_direction = self.conv4(f_direction) + + predicts = {} + predicts['f_score'] = f_score + predicts['f_border'] = f_border + predicts['f_char'] = f_char + predicts['f_direction'] = f_direction + return predicts diff --git a/batch_running_task/pytorchocr/modeling/heads/multiheadAttention.py b/batch_running_task/pytorchocr/modeling/heads/multiheadAttention.py new file mode 100644 index 0000000..cbd887c --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/multiheadAttention.py @@ -0,0 +1,150 @@ +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn import Linear +from torch.nn.init import xavier_uniform_ + + +class MultiheadAttention(nn.Module): + """Allows the model to jointly attend to information + from different representation subspaces. + See reference: Attention Is All You Need + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) + + Args: + embed_dim: total dimension of the model + num_heads: parallel attention layers, or heads + + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + bias=True, + add_bias_kv=False, + add_zero_attn=False): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim**-0.5 + self.out_proj = Linear(embed_dim, embed_dim, bias=bias) + self._reset_parameters() + self.conv1 = torch.nn.Conv2d( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + self.conv2 = torch.nn.Conv2d( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + self.conv3 = torch.nn.Conv2d( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + + def _reset_parameters(self): + xavier_uniform_(self.out_proj.weight) + + def forward(self, + query, + key, + value, + key_padding_mask=None, + incremental_state=None, + attn_mask=None): + """ + Inputs of forward function + query: [target length, batch size, embed dim] + key: [sequence length, batch size, embed dim] + value: [sequence length, batch size, embed dim] + key_padding_mask: if True, mask padding based on batch size + incremental_state: if provided, previous time steps are cashed + need_weights: output attn_output_weights + static_kv: key and value are static + + Outputs of forward function + attn_output: [target length, batch size, embed dim] + attn_output_weights: [batch size, target length, sequence length] + """ + q_shape = query.shape + src_shape = key.shape + q = self._in_proj_q(query) + k = self._in_proj_k(key) + v = self._in_proj_v(value) + q *= self.scaling + # q = paddle.transpose( + # paddle.reshape( + # q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]), + # [1, 2, 0, 3]) + q = torch.reshape(q, (q_shape[0], q_shape[1], self.num_heads, self.head_dim)) + q = q.permute(1, 2, 0, 3) + # k = paddle.transpose( + # paddle.reshape( + # k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + # [1, 2, 0, 3]) + k = torch.reshape(k, (src_shape[0], q_shape[1], 
self.num_heads, self.head_dim)) + k = k.permute(1, 2, 0, 3) + # v = paddle.transpose( + # paddle.reshape( + # v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + # [1, 2, 0, 3]) + v = torch.reshape(v, (src_shape[0], q_shape[1], self.num_heads, self.head_dim)) + v = v.permute(1, 2, 0, 3) + if key_padding_mask is not None: + assert key_padding_mask.shape[0] == q_shape[1] + assert key_padding_mask.shape[1] == src_shape[0] + attn_output_weights = torch.matmul(q, + k.permute(0, 1, 3, 2)) + if attn_mask is not None: + attn_mask = torch.unsqueeze(torch.unsqueeze(attn_mask, 0), 0) + attn_output_weights += attn_mask + if key_padding_mask is not None: + attn_output_weights = torch.reshape( + attn_output_weights, + [q_shape[1], self.num_heads, q_shape[0], src_shape[0]]) + key = torch.unsqueeze(torch.unsqueeze(key_padding_mask, 1), 2) + key = key.type(torch.float32) + y = torch.full( + size=key.shape, fill_value=float("-Inf"), dtype=torch.float32) + y = torch.where(key == 0., key, y) + attn_output_weights += y + attn_output_weights = F.softmax( + attn_output_weights.type(torch.float32), + dim=-1, + dtype=torch.float32 if attn_output_weights.dtype == torch.float16 + else attn_output_weights.dtype) + attn_output_weights = F.dropout( + attn_output_weights, p=self.dropout, training=self.training) + + attn_output = torch.matmul(attn_output_weights, v) + attn_output = torch.reshape( + attn_output.permute(2, 0, 1, 3), + [q_shape[0], q_shape[1], self.embed_dim]) + attn_output = self.out_proj(attn_output) + + return attn_output + + def _in_proj_q(self, query): + query = query.permute(1, 2, 0) + query = torch.unsqueeze(query, dim=2) + res = self.conv1(query) + res = torch.squeeze(res, dim=2) + res = res.permute(2, 0, 1) + return res + + def _in_proj_k(self, key): + key = key.permute(1, 2, 0) + key = torch.unsqueeze(key, dim=2) + res = self.conv2(key) + res = torch.squeeze(res, dim=2) + res = res.permute(2, 0, 1) + return res + + def _in_proj_v(self, value): + value = value.permute(1, 2, 0) #(1, 2, 0) + value = torch.unsqueeze(value, dim=2) + res = self.conv3(value) + res = torch.squeeze(res, dim=2) + res = res.permute(2, 0, 1) + return res diff --git a/batch_running_task/pytorchocr/modeling/heads/rec_att_head.py b/batch_running_task/pytorchocr/modeling/heads/rec_att_head.py new file mode 100644 index 0000000..a057c52 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/rec_att_head.py @@ -0,0 +1,190 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + + +class AttentionHead(nn.Module): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionHead, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionGRUCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char.type(torch.int64), onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.size()[0] + num_steps = batch_max_length + + hidden = torch.zeros((batch_size, self.hidden_size)) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + char_onehots = 
self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + output_hiddens.append(torch.unsqueeze(outputs, dim=1)) + output = torch.cat(output_hiddens, dim=1) + probs = self.generator(output) + + else: + targets = torch.zeros([batch_size], dtype=torch.int32) + probs = None + char_onehots = None + outputs = None + alpha = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(outputs) + if probs is None: + probs = torch.unsqueeze(probs_step, dim=1) + else: + probs = torch.cat( + [probs, torch.unsqueeze( + probs_step, dim=1)], dim=1) + next_input = probs_step.argmax(dim=1) + targets = next_input + + return probs + + +class AttentionGRUCell(nn.Module): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias=False) + + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size, bias=True) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = torch.unsqueeze(self.h2h(prev_hidden), dim=1) + + res = torch.add(batch_H_proj, prev_hidden_proj) + res = torch.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, dim=1) + alpha = alpha.permute(0, 2, 1) + context = torch.squeeze(torch.matmul(alpha, batch_H), dim=1) + concat_context = torch.cat([context, char_onehots.float()], 1) + + cur_hidden = self.rnn(concat_context, prev_hidden) + + return (cur_hidden, cur_hidden), alpha + + +class AttentionLSTM(nn.Module): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionLSTM, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char.type(torch.int64), onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = (torch.zeros((batch_size, self.hidden_size)), torch.zeros( + (batch_size, self.hidden_size))) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + # one-hot vectors for a i-th char + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + + hidden = (hidden[1][0], hidden[1][1]) + output_hiddens.append(torch.unsqueeze(hidden[0], dim=1)) + output = torch.cat(output_hiddens, dim=1) + probs = self.generator(output) + + else: + targets = torch.zeros([batch_size], dtype=torch.int32) + probs = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(hidden[0]) + hidden = (hidden[1][0], hidden[1][1]) + if probs is None: + probs = torch.unsqueeze(probs_step, dim=1) + else: + 
probs = torch.cat( + [probs, torch.unsqueeze( + probs_step, dim=1)], dim=1) + + next_input = probs_step.argmax(dim=1) + + targets = next_input + + return probs + + +class AttentionLSTMCell(nn.Module): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = torch.unsqueeze(self.h2h(prev_hidden[0]), dim=1) + res = torch.add(batch_H_proj, prev_hidden_proj) + res = torch.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, dim=1) + alpha = alpha.permute(0, 2, 1) + context = torch.squeeze(torch.matmul(alpha, batch_H), dim=1) + concat_context = torch.cat([context, char_onehots.float()], 1) + + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha diff --git a/batch_running_task/pytorchocr/modeling/heads/rec_can_head.py b/batch_running_task/pytorchocr/modeling/heads/rec_can_head.py new file mode 100644 index 0000000..4d097f5 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/rec_can_head.py @@ -0,0 +1,288 @@ +""" +This code is refer from: +https://github.com/LBH1024/CAN/models/can.py +https://github.com/LBH1024/CAN/models/counting.py +https://github.com/LBH1024/CAN/models/decoder.py +https://github.com/LBH1024/CAN/models/attention.py + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch.nn as nn +import torch +import math +''' +Counting Module +''' + + +class ChannelAtt(nn.Module): + def __init__(self, channel, reduction): + super(ChannelAtt, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(), nn.Linear(channel // reduction, channel), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.shape + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class CountingDecoder(nn.Module): + def __init__(self, in_channel, out_channel, kernel_size): + super(CountingDecoder, self).__init__() + self.in_channel = in_channel + self.out_channel = out_channel + + self.trans_layer = nn.Sequential( + nn.Conv2d( + self.in_channel, + 512, + kernel_size=kernel_size, + padding=kernel_size // 2, + bias=False), + nn.BatchNorm2d(512)) + + self.channel_att = ChannelAtt(512, 16) + + self.pred_layer = nn.Sequential( + nn.Conv2d( + 512, self.out_channel, kernel_size=1, bias=False), + nn.Sigmoid()) + + def forward(self, x, mask): + b, _, h, w = x.shape + x = self.trans_layer(x) + x = self.channel_att(x) + x = self.pred_layer(x) + + if mask is not None: + x = x * mask + x = x.view(b, self.out_channel, -1) + x1 = torch.sum(x, dim=-1) + + return x1, x.view(b, self.out_channel, h, w) + + +''' +Attention Decoder +''' + + +class PositionEmbeddingSine(nn.Module): + def __init__(self, + num_pos_feats=64, + temperature=10000, + normalize=False, + scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and 
normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask): + y_embed = mask.cumsum(1, dtype=torch.float32) + x_embed = mask.cumsum(2, dtype=torch.float32) + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + # dim_d = paddle.expand(paddle.to_tensor(2), dim_t.shape) + # dim_t = self.temperature**(2 * (dim_t / dim_d).astype('int64') / + # self.num_pos_feats) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = torch.unsqueeze(x_embed, 3) / dim_t + pos_y = torch.unsqueeze(y_embed, 3) / dim_t + + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + + return pos + + +class AttDecoder(nn.Module): + def __init__(self, ratio, is_train, input_size, hidden_size, + encoder_out_channel, dropout, dropout_ratio, word_num, + counting_decoder_out_channel, attention): + super(AttDecoder, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.out_channel = encoder_out_channel + self.attention_dim = attention['attention_dim'] + self.dropout_prob = dropout + self.ratio = ratio + self.word_num = word_num + + self.counting_num = counting_decoder_out_channel + self.is_train = is_train + + self.init_weight = nn.Linear(self.out_channel, self.hidden_size) + self.embedding = nn.Embedding(self.word_num, self.input_size) + self.word_input_gru = nn.GRUCell(self.input_size, self.hidden_size) + self.word_attention = Attention(hidden_size, attention['attention_dim']) + + self.encoder_feature_conv = nn.Conv2d( + self.out_channel, + self.attention_dim, + kernel_size=attention['word_conv_kernel'], + padding=attention['word_conv_kernel'] // 2) + + self.word_state_weight = nn.Linear(self.hidden_size, self.hidden_size) + self.word_embedding_weight = nn.Linear(self.input_size, + self.hidden_size) + self.word_context_weight = nn.Linear(self.out_channel, self.hidden_size) + self.counting_context_weight = nn.Linear(self.counting_num, + self.hidden_size) + self.word_convert = nn.Linear(self.hidden_size, self.word_num) + + if dropout: + self.dropout = nn.Dropout(dropout_ratio) + + def forward(self, cnn_features, labels, counting_preds, images_mask): + if self.is_train: + _, num_steps = labels.shape + else: + num_steps = 36 + + batch_size, _, height, width = cnn_features.shape + images_mask = images_mask[:, :, ::self.ratio, ::self.ratio] + + word_probs = torch.zeros((batch_size, num_steps, self.word_num)).to(device=cnn_features.device) + word_alpha_sum = torch.zeros((batch_size, 1, height, width)).to(device=cnn_features.device) + + hidden = self.init_hidden(cnn_features, images_mask) + counting_context_weighted = self.counting_context_weight(counting_preds) + cnn_features_trans = self.encoder_feature_conv(cnn_features) + + position_embedding = PositionEmbeddingSine(256, normalize=True) + pos = position_embedding(cnn_features_trans, images_mask[:, 0, :, :]) + + cnn_features_trans = cnn_features_trans + pos + + word = torch.ones([batch_size]).long().to(device=cnn_features.device) # init word as sos + for i in range(num_steps): + word_embedding = self.embedding(word) + 
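# advance the GRU hidden state with the embedding of the previously predicted word +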
hidden = self.word_input_gru(word_embedding, hidden) + word_context_vec, _, word_alpha_sum = self.word_attention( + cnn_features, cnn_features_trans, hidden, word_alpha_sum, + images_mask) + + current_state = self.word_state_weight(hidden) + word_weighted_embedding = self.word_embedding_weight(word_embedding) + word_context_weighted = self.word_context_weight(word_context_vec) + + if self.dropout_prob: + word_out_state = self.dropout( + current_state + word_weighted_embedding + + word_context_weighted + counting_context_weighted) + else: + word_out_state = current_state + word_weighted_embedding + word_context_weighted + counting_context_weighted + + word_prob = self.word_convert(word_out_state) + word_probs[:, i] = word_prob + + if self.is_train: + word = labels[:, i] + else: + word = word_prob.argmax(1) + word = torch.mul( + word, labels[:, i] + ) # labels are oneslike tensor in infer/predict mode, torch.multiply + + return word_probs + + def init_hidden(self, features, feature_mask): + average = torch.sum(torch.sum(features * feature_mask, dim=-1), + dim=-1) / torch.sum( + (torch.sum(feature_mask, dim=-1)), dim=-1) + average = self.init_weight(average) + return torch.tanh(average) + + +''' +Attention Module +''' + + +class Attention(nn.Module): + def __init__(self, hidden_size, attention_dim): + super(Attention, self).__init__() + self.hidden = hidden_size + self.attention_dim = attention_dim + self.hidden_weight = nn.Linear(self.hidden, self.attention_dim) + self.attention_conv = nn.Conv2d( + 1, 512, kernel_size=11, padding=5, bias=False) + self.attention_weight = nn.Linear( + 512, self.attention_dim, bias=False) + self.alpha_convert = nn.Linear(self.attention_dim, 1) + + def forward(self, + cnn_features, + cnn_features_trans, + hidden, + alpha_sum, + image_mask=None): + query = self.hidden_weight(hidden) + alpha_sum_trans = self.attention_conv(alpha_sum) + coverage_alpha = self.attention_weight(alpha_sum_trans.permute(0, 2, 3, 1)) + alpha_score = torch.tanh( + query[:, None, None, :] + coverage_alpha + cnn_features_trans.permute(0, 2, 3, 1) + ) + energy = self.alpha_convert(alpha_score) + energy = energy - energy.max() + energy_exp = torch.exp(torch.squeeze(energy, -1)) + + if image_mask is not None: + energy_exp = energy_exp * torch.squeeze(image_mask, 1) + alpha = energy_exp / (energy_exp.sum(-1).sum(-1)[:,None,None] + 1e-10) + alpha_sum = torch.unsqueeze(alpha, 1) + alpha_sum + context_vector = torch.sum( + torch.sum((torch.unsqueeze(alpha, 1) * cnn_features), -1), -1) + + return context_vector, alpha, alpha_sum + + +class CANHead(nn.Module): + def __init__(self, in_channel, out_channel, ratio, attdecoder, **kwargs): + super(CANHead, self).__init__() + + self.in_channel = in_channel + self.out_channel = out_channel + + self.counting_decoder1 = CountingDecoder(self.in_channel, + self.out_channel, 3) # mscm + self.counting_decoder2 = CountingDecoder(self.in_channel, + self.out_channel, 5) + + self.decoder = AttDecoder(ratio, **attdecoder) + + self.ratio = ratio + + def forward(self, inputs, targets=None): + cnn_features, images_mask, labels = inputs + + counting_mask = images_mask[:, :, ::self.ratio, ::self.ratio] + counting_preds1, _ = self.counting_decoder1(cnn_features, counting_mask) + counting_preds2, _ = self.counting_decoder2(cnn_features, counting_mask) + counting_preds = (counting_preds1 + counting_preds2) / 2 + + word_probs = self.decoder(cnn_features, labels, counting_preds, + images_mask) + return word_probs, counting_preds, counting_preds1, counting_preds2 diff --git 
a/batch_running_task/pytorchocr/modeling/heads/rec_ctc_head.py b/batch_running_task/pytorchocr/modeling/heads/rec_ctc_head.py new file mode 100644 index 0000000..f4f870b --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/rec_ctc_head.py @@ -0,0 +1,53 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F + +class CTCHead(nn.Module): + def __init__(self, + in_channels, + out_channels=6625, + fc_decay=0.0004, + mid_channels=None, + return_feats=False, + **kwargs): + super(CTCHead, self).__init__() + if mid_channels is None: + self.fc = nn.Linear( + in_channels, + out_channels, + bias=True,) + else: + self.fc1 = nn.Linear( + in_channels, + mid_channels, + bias=True, + ) + self.fc2 = nn.Linear( + mid_channels, + out_channels, + bias=True, + ) + + self.out_channels = out_channels + self.mid_channels = mid_channels + self.return_feats = return_feats + + + def forward(self, x, labels=None): + if self.mid_channels is None: + predicts = self.fc(x) + else: + x = self.fc1(x) + predicts = self.fc2(x) + + if self.return_feats: + result = (x, predicts) + else: + result = predicts + + if not self.training: + predicts = F.softmax(predicts, dim=2) + result = predicts + + return result \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/heads/rec_multi_head.py b/batch_running_task/pytorchocr/modeling/heads/rec_multi_head.py new file mode 100644 index 0000000..a79714d --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/rec_multi_head.py @@ -0,0 +1,88 @@ +import torch +import torch.nn as nn + +from pytorchocr.modeling.necks.rnn import Im2Seq, SequenceEncoder +from .rec_nrtr_head import Transformer +from .rec_ctc_head import CTCHead +from .rec_sar_head import SARHead + +class FCTranspose(nn.Module): + def __init__(self, in_channels, out_channels, only_transpose=False): + super().__init__() + self.only_transpose = only_transpose + if not self.only_transpose: + self.fc = nn.Linear(in_channels, out_channels, bias=False) + + def forward(self, x): + if self.only_transpose: + return x.permute([0, 2, 1]) + else: + return self.fc(x.permute([0, 2, 1])) + + +class MultiHead(nn.Module): + def __init__(self, in_channels, out_channels_list, **kwargs): + super().__init__() + self.head_list = kwargs.pop('head_list') + + self.gtc_head = 'sar' + assert len(self.head_list) >= 2 + for idx, head_name in enumerate(self.head_list): + name = list(head_name)[0] + if name == 'SARHead': + pass + # # sar head + # sar_args = self.head_list[idx][name] + # self.sar_head = eval(name)(in_channels=in_channels, \ + # out_channels=out_channels_list['SARLabelDecode'], **sar_args) + elif name == 'NRTRHead': + pass + # gtc_args = self.head_list[idx][name] + # max_text_length = gtc_args.get('max_text_length', 25) + # nrtr_dim = gtc_args.get('nrtr_dim', 256) + # num_decoder_layers = gtc_args.get('num_decoder_layers', 4) + # self.before_gtc = nn.Sequential( + # nn.Flatten(2), FCTranspose(in_channels, nrtr_dim)) + # self.gtc_head = Transformer( + # d_model=nrtr_dim, + # nhead=nrtr_dim // 32, + # num_encoder_layers=-1, + # beam_size=-1, + # num_decoder_layers=num_decoder_layers, + # max_len=max_text_length, + # dim_feedforward=nrtr_dim * 4, + # out_channels=out_channels_list['NRTRLabelDecode']) + elif name == 'CTCHead': + # ctc neck + self.encoder_reshape = Im2Seq(in_channels) + neck_args = self.head_list[idx][name]['Neck'] + encoder_type = neck_args.pop('name') + self.ctc_encoder = SequenceEncoder(in_channels=in_channels, \ + encoder_type=encoder_type, 
**neck_args) + # ctc head + head_args = self.head_list[idx][name].get('Head', {}) + if head_args is None: + head_args = {} + self.ctc_head = eval(name)(in_channels=self.ctc_encoder.out_channels, \ + out_channels=out_channels_list['CTCLabelDecode'], **head_args) + else: + raise NotImplementedError( + '{} is not supported in MultiHead yet'.format(name)) + + def forward(self, x, data=None): + ctc_encoder = self.ctc_encoder(x) + ctc_out = self.ctc_head(ctc_encoder) + head_out = dict() + head_out['ctc'] = ctc_out + head_out['res'] = ctc_out + head_out['ctc_neck'] = ctc_encoder + # eval mode + if not self.training: + return ctc_out + if self.gtc_head == 'sar': + sar_out = self.sar_head(x, data[1:])['res'] + head_out['sar'] = sar_out + else: + gtc_out = self.gtc_head(self.before_gtc(x), data[1:])['res'] + head_out['nrtr'] = gtc_out + return head_out diff --git a/batch_running_task/pytorchocr/modeling/heads/rec_nrtr_head.py b/batch_running_task/pytorchocr/modeling/heads/rec_nrtr_head.py new file mode 100644 index 0000000..13cb09a --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/rec_nrtr_head.py @@ -0,0 +1,812 @@ +import math +import torch +import copy +from torch import nn +import torch.nn.functional as F +from torch.nn import ModuleList as LayerList +from torch.nn.init import xavier_uniform_ +from torch.nn import Dropout, LayerNorm, Conv2d +import numpy as np +from pytorchocr.modeling.heads.multiheadAttention import MultiheadAttention +from torch.nn.init import xavier_normal_ + + +class Transformer(nn.Module): + """A transformer model. User is able to modify the attributes as needed. The architechture + is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. + + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). 
+ + """ + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + beam_size=0, + num_decoder_layers=6, + max_len=25, + dim_feedforward=1024, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1, + custom_encoder=None, + custom_decoder=None, + in_channels=0, + out_channels=0, + scale_embedding=True): + super(Transformer, self).__init__() + self.out_channels = out_channels # out_channels + 1 + self.max_len = max_len + self.embedding = Embeddings( + d_model=d_model, + vocab=self.out_channels, + padding_idx=0, + scale_embedding=scale_embedding) + self.positional_encoding = PositionalEncoding( + dropout=residual_dropout_rate, + dim=d_model, ) + if custom_encoder is not None: + self.encoder = custom_encoder + else: + if num_encoder_layers > 0: + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, attention_dropout_rate, + residual_dropout_rate) + self.encoder = TransformerEncoder(encoder_layer, + num_encoder_layers) + else: + self.encoder = None + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, attention_dropout_rate, + residual_dropout_rate) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers) + + self._reset_parameters() + self.beam_size = beam_size + self.d_model = d_model + self.nhead = nhead + self.tgt_word_prj = nn.Linear( + d_model, self.out_channels, bias=False) + w0 = np.random.normal(0.0, d_model ** -0.5, + (self.out_channels, d_model)).astype(np.float32) + self.tgt_word_prj.weight.data = torch.from_numpy(w0) + self.apply(self._init_weights) + + def _init_weights(self, m): + + if isinstance(m, nn.Conv2d): + xavier_normal_(m.weight) + if m.bias is not None: + torch.nn.init.zeros_(m.bias) + + def forward_train(self, src, tgt): + tgt = tgt[:, :-1] + + tgt_key_padding_mask = self.generate_padding_mask(tgt) + tgt = self.embedding(tgt).permute(1, 0, 2) + tgt = self.positional_encoding(tgt) + tgt_mask = self.generate_square_subsequent_mask(tgt.shape[0], tgt.device) + + if self.encoder is not None: + src = self.positional_encoding(src.permute(1, 0, 2)) + memory = self.encoder(src) + else: + memory = src.squeeze(2).permute(2, 0, 1) + output = self.decoder( + tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=None, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=None) + output = output.permute(1, 0, 2) + logit = self.tgt_word_prj(output) + return logit + + def forward(self, src, targets=None): + """Take in and process masked source/target sequences. + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + Shape: + - src: :math:`(S, N, E)`. + - tgt: :math:`(T, N, E)`. 
+ Examples: + >>> output = transformer_model(src, tgt) + """ + + if self.training: + max_len = targets[1].max() + tgt = targets[0][:, :2 + max_len] + return self.forward_train(src, tgt) + else: + if self.beam_size > 0: + return self.forward_beam(src) + else: + return self.forward_test(src) + + def forward_test(self, src): + bs = src.shape[0] + if self.encoder is not None: + src = self.positional_encoding(src.permute(1, 0, 2)) + memory = self.encoder(src) + else: + memory = torch.squeeze(src, 2).permute(2, 0, 1) + dec_seq = torch.full((bs, 1), 2, dtype=torch.int64) + dec_prob = torch.full((bs, 1), 1., dtype=torch.float32) + for len_dec_seq in range(1, 25): + dec_seq_embed = self.embedding(dec_seq).permute(1, 0, 2) + dec_seq_embed = self.positional_encoding(dec_seq_embed) + tgt_mask = self.generate_square_subsequent_mask( + dec_seq_embed.shape[0]) + output = self.decoder( + dec_seq_embed, + memory, + tgt_mask=tgt_mask, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None) + dec_output = output.permute(1, 0, 2) + dec_output = dec_output[:, -1, :] + tgt_word_prj = self.tgt_word_prj(dec_output) + word_prob = F.softmax(tgt_word_prj, dim=1) + preds_idx = word_prob.argmax(dim=1) + if torch.equal( + preds_idx, + torch.full( + preds_idx.shape, 3, dtype=torch.int64)): + break + preds_prob = torch.max(word_prob, dim=1).values + dec_seq = torch.cat( + [dec_seq, torch.reshape(preds_idx, (-1, 1))], dim=1) + dec_prob = torch.cat( + [dec_prob, torch.reshape(preds_prob, (-1, 1))], dim=1) + return [dec_seq, dec_prob] + + def forward_beam(self, images): + ''' Translation work in one batch ''' + + def get_inst_idx_to_tensor_position_map(inst_idx_list): + ''' Indicate the position of an instance in a tensor. ''' + return { + inst_idx: tensor_position + for tensor_position, inst_idx in enumerate(inst_idx_list) + } + + def collect_active_part(beamed_tensor, curr_active_inst_idx, + n_prev_active_inst, n_bm): + ''' Collect tensor parts associated to active instances. ''' + + beamed_tensor_shape = beamed_tensor.shape + n_curr_active_inst = len(curr_active_inst_idx) + new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1], + beamed_tensor_shape[2]) + + beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1]) + beamed_tensor = beamed_tensor.index_select( + curr_active_inst_idx, axis=0) + beamed_tensor = beamed_tensor.reshape(new_shape) + + return beamed_tensor + + def collate_active_info(src_enc, inst_idx_to_position_map, + active_inst_idx_list): + # Sentences which are still active are collected, + # so the decoder will not run on completed sentences. 
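+            # only the encoder outputs of beams that are still active are kept for the next decode step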
+ + n_prev_active_inst = len(inst_idx_to_position_map) + active_inst_idx = [ + inst_idx_to_position_map[k] for k in active_inst_idx_list + ] + active_inst_idx = torch.tensor(active_inst_idx, dtype=torch.int64) + active_src_enc = collect_active_part( + src_enc.permute(1, 0, 2), active_inst_idx, + n_prev_active_inst, n_bm).permute(1, 0, 2) + active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + return active_src_enc, active_inst_idx_to_position_map + + def beam_decode_step(inst_dec_beams, len_dec_seq, enc_output, + inst_idx_to_position_map, n_bm, + memory_key_padding_mask): + ''' Decode and update beam status, and then return active beam idx ''' + + def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): + dec_partial_seq = [ + b.get_current_state() for b in inst_dec_beams if not b.done + ] + dec_partial_seq = torch.stack(dec_partial_seq) + dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq]) + return dec_partial_seq + + def predict_word(dec_seq, enc_output, n_active_inst, n_bm, + memory_key_padding_mask): + dec_seq = self.embedding(dec_seq).permute(1, 0, 2) + dec_seq = self.positional_encoding(dec_seq) + tgt_mask = self.generate_square_subsequent_mask( + dec_seq.shape[0]) + dec_output = self.decoder( + dec_seq, + enc_output, + tgt_mask=tgt_mask, + tgt_key_padding_mask=None, + memory_key_padding_mask=memory_key_padding_mask, ) + dec_output = dec_output.permute(1, 0, 2) + dec_output = dec_output[:, + -1, :] # Pick the last step: (bh * bm) * d_h + word_prob = F.softmax(self.tgt_word_prj(dec_output), dim=1) + word_prob = torch.reshape(word_prob, (n_active_inst, n_bm, -1)) + return word_prob + + def collect_active_inst_idx_list(inst_beams, word_prob, + inst_idx_to_position_map): + active_inst_idx_list = [] + for inst_idx, inst_position in inst_idx_to_position_map.items(): + is_inst_complete = inst_beams[inst_idx].advance(word_prob[ + inst_position]) + if not is_inst_complete: + active_inst_idx_list += [inst_idx] + + return active_inst_idx_list + + n_active_inst = len(inst_idx_to_position_map) + dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) + word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm, + None) + # Update the beam with predicted word prob information and collect incomplete instances + active_inst_idx_list = collect_active_inst_idx_list( + inst_dec_beams, word_prob, inst_idx_to_position_map) + return active_inst_idx_list + + def collect_hypothesis_and_scores(inst_dec_beams, n_best): + all_hyp, all_scores = [], [] + for inst_idx in range(len(inst_dec_beams)): + scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() + all_scores += [scores[:n_best]] + hyps = [ + inst_dec_beams[inst_idx].get_hypothesis(i) + for i in tail_idxs[:n_best] + ] + all_hyp += [hyps] + return all_hyp, all_scores + + with torch.no_grad(): + #-- Encode + if self.encoder is not None: + src = self.positional_encoding(images.permute(1, 0, 2)) + src_enc = self.encoder(src) + else: + src_enc = images.squeeze(2).transpose([0, 2, 1]) + + n_bm = self.beam_size + src_shape = src_enc.shape + inst_dec_beams = [Beam(n_bm) for _ in range(1)] + active_inst_idx_list = list(range(1)) + # Repeat data for beam search + # src_enc = paddle.tile(src_enc, [1, n_bm, 1]) + src_enc = src_enc.repeat(1, n_bm, 1) + inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + # Decode + for len_dec_seq in range(1, 25): + src_enc_copy = src_enc.clone() + active_inst_idx_list = beam_decode_step( + inst_dec_beams, len_dec_seq, src_enc_copy, + 
inst_idx_to_position_map, n_bm, None) + if not active_inst_idx_list: + break # all instances have finished their path to + src_enc, inst_idx_to_position_map = collate_active_info( + src_enc_copy, inst_idx_to_position_map, + active_inst_idx_list) + batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, + 1) + result_hyp = [] + hyp_scores = [] + for bs_hyp, score in zip(batch_hyp, batch_scores): + l = len(bs_hyp[0]) + bs_hyp_pad = bs_hyp[0] + [3] * (25 - l) + result_hyp.append(bs_hyp_pad) + score = float(score) / l + hyp_score = [score for _ in range(25)] + hyp_scores.append(hyp_score) + return [ + torch.tensor( + np.array(result_hyp), dtype=torch.int64), + torch.tensor(hyp_scores) + ] + + def generate_square_subsequent_mask(self, sz): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + mask = torch.zeros([sz, sz], dtype=torch.float32) + mask_inf = torch.triu( + torch.full( + size=[sz, sz], fill_value=float('-Inf'), dtype=torch.float32), + diagonal=1) + mask = mask + mask_inf + return mask + + def generate_padding_mask(self, x): + # padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype)) + padding_mask = (x == torch.tensor(0, dtype=x.dtype)) + return padding_mask + + def _reset_parameters(self): + """Initiate parameters in the transformer model.""" + + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + +class TransformerEncoder(nn.Module): + """TransformerEncoder is a stack of N encoder layers + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + """ + + def __init__(self, encoder_layer, num_layers): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + def forward(self, src): + """Pass the input through the endocder layers in turn. + Args: + src: the sequnce to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + """ + output = src + + for i in range(self.num_layers): + output = self.layers[i](output, + src_mask=None, + src_key_padding_mask=None) + + return output + + +class TransformerDecoder(nn.Module): + """TransformerDecoder is a stack of N decoder layers + + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). + + """ + + def __init__(self, decoder_layer, num_layers): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + """Pass the inputs (and mask) through the decoder layer in turn. + + Args: + tgt: the sequence to the decoder (required). + memory: the sequnce from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). 
+ """ + output = tgt + for i in range(self.num_layers): + output = self.layers[i]( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + return output + + +class TransformerEncoderLayer(nn.Module): + """TransformerEncoderLayer is made up of self-attn and feedforward network. + This standard encoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1): + super(TransformerEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + + self.conv1 = nn.Conv2d( + in_channels=d_model, + out_channels=dim_feedforward, + kernel_size=(1, 1)) + self.conv2 = nn.Conv2d( + in_channels=dim_feedforward, + out_channels=d_model, + kernel_size=(1, 1)) + + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.dropout1 = Dropout(residual_dropout_rate) + self.dropout2 = Dropout(residual_dropout_rate) + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + """Pass the input through the endocder layer. + Args: + src: the sequnce to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + """ + src2 = self.self_attn( + src, + src, + src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + src = src.permute(1, 2, 0) + src = torch.unsqueeze(src, 2) + src2 = self.conv2(F.relu(self.conv1(src))) + src2 = torch.squeeze(src2, 2) + src2 = src2.permute(2, 0, 1) + src = torch.squeeze(src, 2) + src = src.permute(2, 0, 1) + + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class TransformerDecoderLayer(nn.Module): + """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. + This standard decoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). 
+ + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1): + super(TransformerDecoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + self.multihead_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + + self.conv1 = nn.Conv2d( + in_channels=d_model, + out_channels=dim_feedforward, + kernel_size=(1, 1)) + self.conv2 = nn.Conv2d( + in_channels=dim_feedforward, + out_channels=d_model, + kernel_size=(1, 1)) + + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.norm3 = LayerNorm(d_model) + self.dropout1 = Dropout(residual_dropout_rate) + self.dropout2 = Dropout(residual_dropout_rate) + self.dropout3 = Dropout(residual_dropout_rate) + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + """Pass the inputs (and mask) through the decoder layer. + + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequnce from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + + """ + tgt2 = self.self_attn( + tgt, + tgt, + tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + tgt, + memory, + memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # default + tgt = tgt.permute(1, 2, 0) + tgt = torch.unsqueeze(tgt, 2) + tgt2 = self.conv2(F.relu(self.conv1(tgt))) + tgt2 = torch.squeeze(tgt2, 2) + tgt2 = tgt2.permute(2, 0, 1) + tgt = torch.squeeze(tgt, 2) + tgt = tgt.permute(2, 0, 1) + + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + +def _get_clones(module, N): + return LayerList([copy.deepcopy(module) for i in range(N)]) + + +class PositionalEncoding(nn.Module): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). 
+ Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros([max_len, dim]) + position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, dim, 2).type(torch.float32) * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = torch.unsqueeze(pe, 0) + pe = pe.permute(1, 0, 2) + self.register_buffer('pe', pe) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x + self.pe[:x.shape[0], :] + return self.dropout(x) + + +class PositionalEncoding_2d(nn.Module): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding_2d, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros([max_len, dim]) + position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, dim, 2).type(torch.float32) * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = torch.unsqueeze(pe, 0).permute(1, 0, 2) + self.register_buffer('pe', pe) + + self.avg_pool_1 = nn.AdaptiveAvgPool2d((1, 1)) + self.linear1 = nn.Linear(dim, dim) + self.linear1.weight.data.fill_(1.) + self.avg_pool_2 = nn.AdaptiveAvgPool2d((1, 1)) + self.linear2 = nn.Linear(dim, dim) + self.linear2.weight.data.fill_(1.) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). 
+ Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + w_pe = self.pe[:x.shape[-1], :] + w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0) + w_pe = w_pe * w1 + w_pe = w_pe.permute(1, 2, 0) + w_pe = torch.unsqueeze(w_pe, 2) + + h_pe = self.pe[:x.shape[-2], :] + w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0) + h_pe = h_pe * w2 + h_pe = h_pe.permute(1, 2, 0) + h_pe = torch.unsqueeze(h_pe, 3) + + x = x + w_pe + h_pe + x = torch.reshape( + x, [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]] + ).permute(2,0,1) + + return self.dropout(x) + + +class Embeddings(nn.Module): + def __init__(self, d_model, vocab, padding_idx, scale_embedding): + super(Embeddings, self).__init__() + self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx) + w0 = np.random.normal(0.0, d_model**-0.5, + (vocab, d_model)).astype(np.float32) + self.embedding.weight.data = torch.from_numpy(w0) + self.d_model = d_model + self.scale_embedding = scale_embedding + + def forward(self, x): + if self.scale_embedding: + x = self.embedding(x) + return x * math.sqrt(self.d_model) + return self.embedding(x) + + +class Beam(): + ''' Beam search ''' + + def __init__(self, size, device=False): + + self.size = size + self._done = False + # The score for each translation on the beam. + self.scores = torch.zeros((size, ), dtype=torch.float32) + self.all_scores = [] + # The backpointers at each time-step. + self.prev_ks = [] + # The outputs at each time-step. + self.next_ys = [torch.full((size, ), 0, dtype=torch.int64)] + self.next_ys[0][0] = 2 + + def get_current_state(self): + "Get the outputs for the current timestep." + return self.get_tentative_hypothesis() + + def get_current_origin(self): + "Get the backpointers for the current timestep." + return self.prev_ks[-1] + + @property + def done(self): + return self._done + + def advance(self, word_prob): + "Update beam status and check if finished or not." + num_words = word_prob.shape[1] + + # Sum the previous scores. + if len(self.prev_ks) > 0: + beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) + else: + beam_lk = word_prob[0] + + flat_beam_lk = beam_lk.reshape([-1]) + best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, + True) # 1st sort + self.all_scores.append(self.scores) + self.scores = best_scores + # bestScoresId is flattened as a (beam x word) array, + # so we need to calculate which word and beam each score came from + prev_k = best_scores_id // num_words + self.prev_ks.append(prev_k) + self.next_ys.append(best_scores_id - prev_k * num_words) + # End condition is when top-of-beam is EOS. + if self.next_ys[-1][0] == 3: + self._done = True + self.all_scores.append(self.scores) + + return self._done + + def sort_scores(self): + "Sort the scores." + return self.scores, torch.tensor( + [i for i in range(int(self.scores.shape[0]))], dtype=torch.int32) + + def get_the_best_score_and_idx(self): + "Get the score of the best in the beam." + scores, ids = self.sort_scores() + return scores[1], ids[1] + + def get_tentative_hypothesis(self): + "Get the decoded sequence for the current timestep." 
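+        # step 0 holds only the start symbol (id 2); later steps are rebuilt by walking the stored back-pointers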
+ if len(self.next_ys) == 1: + dec_seq = self.next_ys[0].unsqueeze(1) + else: + _, keys = self.sort_scores() + hyps = [self.get_hypothesis(k) for k in keys] + hyps = [[2] + h for h in hyps] + dec_seq = torch.tensor(hyps, dtype=torch.int64) + return dec_seq + + def get_hypothesis(self, k): + """ Walk back to construct the full hypothesis. """ + hyp = [] + for j in range(len(self.prev_ks) - 1, -1, -1): + hyp.append(self.next_ys[j + 1][k]) + k = self.prev_ks[j][k] + return list(map(lambda x: x.item(), hyp[::-1])) diff --git a/batch_running_task/pytorchocr/modeling/heads/rec_sar_head.py b/batch_running_task/pytorchocr/modeling/heads/rec_sar_head.py new file mode 100644 index 0000000..128081c --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/rec_sar_head.py @@ -0,0 +1,403 @@ +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/sar_encoder.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/sar_decoder.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# import math +import torch +import torch.nn as nn +import torch.nn.functional as F +# import paddle +# from paddle import ParamAttr +# import paddle.nn as nn +# import paddle.nn.functional as F + + +class SAREncoder(nn.Module): + """ + Args: + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + enc_drop_rnn (float): Dropout probability of RNN layer in encoder. + enc_gru (bool): If True, use GRU, else LSTM in encoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + mask (bool): If True, mask padding in RNN sequence. + """ + + def __init__(self, + enc_bi_rnn=False, + enc_drop_rnn=0.0, + enc_gru=False, + d_model=512, + d_enc=512, + mask=True, + **kwargs): + super().__init__() + assert isinstance(enc_bi_rnn, bool) + assert isinstance(enc_drop_rnn, (int, float)) + assert 0 <= enc_drop_rnn < 1.0 + assert isinstance(enc_gru, bool) + assert isinstance(d_model, int) + assert isinstance(d_enc, int) + assert isinstance(mask, bool) + + self.enc_bi_rnn = enc_bi_rnn + self.enc_drop_rnn = enc_drop_rnn + self.mask = mask + + # LSTM Encoder + # if enc_bi_rnn: + # direction = 'bidirectional' + # else: + # direction = 'forward' + kwargs = dict( + input_size=d_model, + hidden_size=d_enc, + num_layers=2, + batch_first=True, + dropout=enc_drop_rnn, + bidirectional=enc_bi_rnn) + if enc_gru: + self.rnn_encoder = nn.GRU(**kwargs) + else: + self.rnn_encoder = nn.LSTM(**kwargs) + + # global feature transformation + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + self.linear = nn.Linear(encoder_rnn_out_size, encoder_rnn_out_size) + + def forward(self, feat, img_metas=None): + if img_metas is not None: + assert len(img_metas[0]) == feat.size(0) + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + h_feat = feat.shape[2] # bsz c h w + feat_v = F.max_pool2d( + feat, kernel_size=(h_feat, 1), stride=1, padding=0) + feat_v = feat_v.squeeze(2) # bsz * C * W + feat_v = feat_v.permute(0, 2, 1).contiguous() # bsz * W * C + holistic_feat = self.rnn_encoder(feat_v)[0] # bsz * T * C + + if valid_ratios is not None: + valid_hf = [] + T = holistic_feat.size(1) + for i in range(valid_ratios.size(0)): + valid_step = torch.min(T, torch.ceil(T * valid_ratios[i])) - 1 + # valid_step = paddle.minimum( + # T, paddle.ceil(valid_ratios[i] * T).astype('int32')) - 1 + valid_hf.append(holistic_feat[i, valid_step, :]) + 
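# stack the per-sample features taken at their last valid timestep into a (bsz, C) tensor +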
valid_hf = torch.stack(valid_hf, dim=0) + else: + valid_hf = holistic_feat[:, -1, :] # bsz * C + holistic_feat = self.linear(valid_hf) # bsz * C + + return holistic_feat + + +class BaseDecoder(nn.Module): + def __init__(self, **kwargs): + super().__init__() + + def forward_train(self, feat, out_enc, targets, img_metas): + raise NotImplementedError + + def forward_test(self, feat, out_enc, img_metas): + raise NotImplementedError + + def forward(self, + feat, + out_enc, + label=None, + img_metas=None, + train_mode=True): + self.train_mode = train_mode + + if train_mode: + return self.forward_train(feat, out_enc, label, img_metas) + return self.forward_test(feat, out_enc, img_metas) + + +class ParallelSARDecoder(BaseDecoder): + """ + Args: + out_channels (int): Output class number. + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + dec_bi_rnn (bool): If True, use bidirectional RNN in decoder. + dec_drop_rnn (float): Dropout of RNN layer in decoder. + dec_gru (bool): If True, use GRU, else LSTM in decoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + d_k (int): Dim of channels of attention module. + pred_dropout (float): Dropout probability of prediction layer. + max_seq_len (int): Maximum sequence length for decoding. + mask (bool): If True, mask padding in feature map. + start_idx (int): Index of start token. + padding_idx (int): Index of padding token. + pred_concat (bool): If True, concat glimpse feature from + attention with holistic feature and hidden state. + """ + + def __init__( + self, + out_channels, # 90 + unknown + start + padding + enc_bi_rnn=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_model=512, + d_enc=512, + d_k=64, + pred_dropout=0.0, + max_text_length=30, + mask=True, + pred_concat=True, + **kwargs): + super().__init__() + + self.num_classes = out_channels + self.enc_bi_rnn = enc_bi_rnn + self.d_k = d_k + self.start_idx = out_channels - 2 + self.padding_idx = out_channels - 1 + self.max_seq_len = max_text_length + self.mask = mask + self.pred_concat = pred_concat + + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + decoder_rnn_out_size = encoder_rnn_out_size * (int(dec_bi_rnn) + 1) + + # 2D attention layer + self.conv1x1_1 = nn.Linear(decoder_rnn_out_size, d_k) + self.conv3x3_1 = nn.Conv2d( + d_model, d_k, kernel_size=3, stride=1, padding=1) + self.conv1x1_2 = nn.Linear(d_k, 1) + + # Decoder RNN layer + # if dec_bi_rnn: + # direction = 'bidirectional' + # else: + # direction = 'forward' + + kwargs = dict( + input_size=encoder_rnn_out_size, + hidden_size=encoder_rnn_out_size, + num_layers=2, + batch_first=True, + dropout=dec_drop_rnn, + bidirectional=dec_bi_rnn) + if dec_gru: + self.rnn_decoder = nn.GRU(**kwargs) + else: + self.rnn_decoder = nn.LSTM(**kwargs) + + # Decoder input embedding + self.embedding = nn.Embedding( + self.num_classes, + encoder_rnn_out_size, + padding_idx=self.padding_idx) + + # Prediction layer + self.pred_dropout = nn.Dropout(pred_dropout) + pred_num_classes = self.num_classes - 1 + if pred_concat: + fc_in_channel = decoder_rnn_out_size + d_model + encoder_rnn_out_size + else: + fc_in_channel = d_model + self.prediction = nn.Linear(fc_in_channel, pred_num_classes) + + def _2d_attention(self, + decoder_input, + feat, + holistic_feat, + valid_ratios=None): + + y = self.rnn_decoder(decoder_input)[0] + # y: bsz * (seq_len + 1) * hidden_size + + attn_query = self.conv1x1_1(y) # bsz * (seq_len + 1) * attn_size + bsz, seq_len, attn_size = attn_query.shape + # 
attn_query = paddle.unsqueeze(attn_query, axis=[3, 4]) + attn_query = attn_query.view(bsz, seq_len, attn_size, 1, 1) + # attn_query = attn_query.unsqueeze(3).unsqueeze(4) + # (bsz, seq_len + 1, attn_size, 1, 1) + + attn_key = self.conv3x3_1(feat) + # bsz * attn_size * h * w + attn_key = attn_key.unsqueeze(1) + # bsz * 1 * attn_size * h * w + + attn_weight = torch.tanh(torch.add(attn_key, attn_query)) + + # bsz * (seq_len + 1) * attn_size * h * w + attn_weight = attn_weight.permute(0, 1, 3, 4, 2).contiguous() + # bsz * (seq_len + 1) * h * w * attn_size + attn_weight = self.conv1x1_2(attn_weight) + # bsz * (seq_len + 1) * h * w * 1 + bsz, T, h, w, c = attn_weight.size() + assert c == 1 + + if valid_ratios is not None: + # cal mask of attention weight + for i in range(valid_ratios.size(0)): + valid_width = torch.min(w, torch.ceil(w * valid_ratios[i])) + # valid_width = paddle.minimum( + # w, paddle.ceil(valid_ratios[i] * w).astype("int32")) + if valid_width < w: + attn_weight[i, :, :, valid_width:, :] = float('-inf') + + # attn_weight = paddle.reshape(attn_weight, [bsz, T, -1]) + attn_weight = attn_weight.view(bsz, T, -1) + attn_weight = F.softmax(attn_weight, dim=-1) + + attn_weight = attn_weight.view(bsz, T, h, w, + c).permute(0, 1, 4, 2, 3).contiguous() + # attn_weight: bsz * T * c * h * w + # feat: bsz * c * h * w + attn_feat = torch.sum( + torch.mul(feat.unsqueeze(1), attn_weight), (3, 4), keepdim=False) + # bsz * (seq_len + 1) * C + + # Linear transformation + if self.pred_concat: + hf_c = holistic_feat.shape[-1] + holistic_feat = holistic_feat.expand(bsz, seq_len, hf_c) + y = self.prediction(torch.cat((y, attn_feat, holistic_feat), 2)) + else: + y = self.prediction(attn_feat) + # bsz * (seq_len + 1) * num_classes + if self.train_mode: + y = self.pred_dropout(y) + + return y + + def forward_train(self, feat, out_enc, label, img_metas): + ''' + img_metas: [label, valid_ratio] + ''' + if img_metas is not None: + assert img_metas[0].size(0) == feat.size(0) + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + lab_embedding = self.embedding(label) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + in_dec = torch.cat((out_enc, lab_embedding), dim=1) + # bsz * (seq_len + 1) * C + out_dec = self._2d_attention( + in_dec, feat, out_enc, valid_ratios=valid_ratios) + + return out_dec[:, 1:, :] # bsz * seq_len * num_classes + + def forward_test(self, feat, out_enc, img_metas): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + seq_len = self.max_seq_len + bsz = feat.size(0) + start_token = torch.full( + (bsz, ), fill_value=self.start_idx, device=feat.device,dtype=torch.long) + # bsz + start_token = self.embedding(start_token) + # bsz * emb_dim + emb_dim = start_token.shape[1] + # start_token = start_token.unsqueeze(1).expand(-1, seq_len, -1) + start_token = start_token.unsqueeze(1).expand(bsz, seq_len, emb_dim) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + decoder_input = torch.cat((out_enc, start_token), dim=1) + # bsz * (seq_len + 1) * emb_dim + + outputs = [] + for i in range(1, seq_len + 1): + decoder_output = self._2d_attention( + decoder_input, feat, out_enc, valid_ratios=valid_ratios) + char_output = decoder_output[:, i, :] # bsz * num_classes + char_output = F.softmax(char_output, -1) + outputs.append(char_output) + _, max_idx = torch.max(char_output, 
dim=1, keepdim=False) + char_embedding = self.embedding(max_idx) # bsz * emb_dim + if i < seq_len: + decoder_input[:, i + 1, :] = char_embedding + + outputs = torch.stack(outputs, 1) # bsz * seq_len * num_classes + + return outputs + + +class SARHead(nn.Module): + def __init__(self, + in_channels, + out_channels, + enc_dim=512, + max_text_length=30, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_k=512, + pred_dropout=0.1, + pred_concat=True, + **kwargs): + super(SARHead, self).__init__() + + # encoder module + self.encoder = SAREncoder( + enc_bi_rnn=enc_bi_rnn, + enc_drop_rnn=enc_drop_rnn, + enc_gru=enc_gru, + d_model=in_channels, + d_enc=enc_dim) + + # decoder module + self.decoder = ParallelSARDecoder( + out_channels=out_channels, + enc_bi_rnn=enc_bi_rnn, + dec_bi_rnn=dec_bi_rnn, + dec_drop_rnn=dec_drop_rnn, + dec_gru=dec_gru, + d_model=in_channels, + d_enc=enc_dim, + d_k=d_k, + pred_dropout=pred_dropout, + max_text_length=max_text_length, + pred_concat=pred_concat) + + def forward(self, feat, targets=None): + ''' + img_metas: [label, valid_ratio] + ''' + holistic_feat = self.encoder(feat, targets) # bsz c + + if self.training: + label = targets[0] # label + final_out = self.decoder( + feat, holistic_feat, label, img_metas=targets) + else: + final_out = self.decoder( + feat, + holistic_feat, + label=None, + img_metas=targets, + train_mode=False) + # (bsz, seq_len, num_classes) + + return final_out diff --git a/batch_running_task/pytorchocr/modeling/heads/rec_srn_head.py b/batch_running_task/pytorchocr/modeling/heads/rec_srn_head.py new file mode 100644 index 0000000..47ebb7c --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/rec_srn_head.py @@ -0,0 +1,271 @@ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +import numpy as np +from .self_attention import WrapEncoderForFeature +from .self_attention import WrapEncoder + +from collections import OrderedDict +gradient_clip = 10 + +# https://forums.fast.ai/t/lambda-layer/28507/5 +class Lambda(nn.Module): + "An easy way to create a pytorch layer for a simple `func`." 
+ def __init__(self, func): + "create a layer that simply calls `func` with `x`" + super().__init__() + self.func=func + + def forward(self, x): + return self.func(x) + +class PVAM(nn.Module): + def __init__(self, in_channels, char_num, max_text_length, num_heads, + num_encoder_tus, hidden_dims): + super(PVAM, self).__init__() + self.char_num = char_num + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_tus + self.hidden_dims = hidden_dims + # Transformer encoder + t = 256 + c = 512 + self.wrap_encoder_for_feature = WrapEncoderForFeature( + src_vocab_size=1, + max_length=t, + n_layer=self.num_encoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.0,#0.1, + attention_dropout=0.0,#0.1, + relu_dropout=0.0,#0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + # PVAM + self.flatten0 = Lambda(lambda x: torch.flatten(x, start_dim=0, end_dim=1)) + self.fc0 = torch.nn.Linear( + in_features=in_channels, + out_features=in_channels, ) + self.emb = torch.nn.Embedding( + num_embeddings=self.max_length, embedding_dim=in_channels) + self.flatten1 = Lambda(lambda x: torch.flatten(x, start_dim=0, end_dim=2)) + self.fc1 = torch.nn.Linear( + in_features=in_channels, out_features=1, bias=False) + + def forward(self, inputs, encoder_word_pos, gsrm_word_pos): + b, c, h, w = inputs.shape + conv_features = torch.reshape(inputs, shape=[-1, c, h * w]) + conv_features = conv_features.permute(0, 2, 1) + # transformer encoder + b, t, c = conv_features.shape + + enc_inputs = [conv_features, encoder_word_pos, None] + word_features = self.wrap_encoder_for_feature(enc_inputs) + + # pvam + b, t, c = word_features.shape + word_features = self.fc0(word_features) + word_features_ = torch.reshape(word_features, [-1, 1, t, c]) + word_features_ = word_features_.repeat([1, self.max_length, 1, 1]) + word_pos_feature = self.emb(gsrm_word_pos) + word_pos_feature_ = torch.reshape(word_pos_feature, + [-1, self.max_length, 1, c]) + word_pos_feature_ = word_pos_feature_.repeat([1, 1, t, 1]) + y = word_pos_feature_ + word_features_ + y = torch.tanh(y) + attention_weight = self.fc1(y) + attention_weight = torch.reshape( + attention_weight, shape=[-1, self.max_length, t]) + attention_weight = F.softmax(attention_weight, dim=-1) + pvam_features = torch.matmul(attention_weight, + word_features) #[b, max_length, c] + return pvam_features + + +class GSRM(nn.Module): + def __init__(self, in_channels, char_num, max_text_length, num_heads, + num_encoder_tus, num_decoder_tus, hidden_dims): + super(GSRM, self).__init__() + self.char_num = char_num + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_tus + self.num_decoder_TUs = num_decoder_tus + self.hidden_dims = hidden_dims + + self.fc0 = torch.nn.Linear( + in_features=in_channels, out_features=self.char_num) + self.wrap_encoder0 = WrapEncoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.0, + attention_dropout=0.0, + relu_dropout=0.0, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + self.wrap_encoder1 = WrapEncoder( + 
src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.0, + attention_dropout=0.0, + relu_dropout=0.0, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + self.mul = lambda x: torch.matmul(x, + self.wrap_encoder0.prepare_decoder.emb0.weight.t(), + ) + + def forward(self, inputs, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2): + # ===== GSRM Visual-to-semantic embedding block ===== + b, t, c = inputs.shape + pvam_features = torch.reshape(inputs, [-1, c]) + word_out = self.fc0(pvam_features) + word_ids = torch.argmax(F.softmax(word_out, dim=-1), dim=1) + word_ids = torch.reshape(word_ids, shape=[-1, t, 1]) + + #===== GSRM Semantic reasoning block ===== + """ + This module is achieved through bi-transformers, + ngram_feature1 is the froward one, ngram_fetaure2 is the backward one + """ + pad_idx = self.char_num + word1 = F.pad(word_ids.type(torch.float32), [0, 0, 1, 0, 0, 0], value=1.0 * pad_idx) + word1 = word1.type(torch.int64) + word1 = word1[:, :-1, :] + word2 = word_ids + + enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1] + enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2] + + gsrm_feature1 = self.wrap_encoder0(enc_inputs_1) + gsrm_feature2 = self.wrap_encoder1(enc_inputs_2) + + gsrm_feature2 = F.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0], + value=0., + ) + gsrm_feature2 = gsrm_feature2[:, 1:, ] + gsrm_features = gsrm_feature1 + gsrm_feature2 + + gsrm_out = self.mul(gsrm_features) + + b, t, c = gsrm_out.shape + gsrm_out = torch.reshape(gsrm_out, [-1, c]) + + return gsrm_features, word_out, gsrm_out + + +class VSFD(nn.Module): + def __init__(self, in_channels=512, pvam_ch=512, char_num=38): + super(VSFD, self).__init__() + self.char_num = char_num + self.fc0 = torch.nn.Linear( + in_features=in_channels * 2, out_features=pvam_ch) + self.fc1 = torch.nn.Linear( + in_features=pvam_ch, out_features=self.char_num) + + def forward(self, pvam_feature, gsrm_feature): + b, t, c1 = pvam_feature.shape + b, t, c2 = gsrm_feature.shape + combine_feature_ = torch.cat([pvam_feature, gsrm_feature], dim=2) + img_comb_feature_ = torch.reshape( + combine_feature_, shape=[-1, c1 + c2]) + img_comb_feature_map = self.fc0(img_comb_feature_) + img_comb_feature_map = torch.sigmoid(img_comb_feature_map) + img_comb_feature_map = torch.reshape( + img_comb_feature_map, shape=[-1, t, c1]) + combine_feature = img_comb_feature_map * pvam_feature + ( + 1.0 - img_comb_feature_map) * gsrm_feature + img_comb_feature = torch.reshape(combine_feature, shape=[-1, c1]) + + out = self.fc1(img_comb_feature) + return out + + +class SRNHead(nn.Module): + def __init__(self, in_channels, out_channels, max_text_length, num_heads, + num_encoder_TUs, num_decoder_TUs, hidden_dims, **kwargs): + super(SRNHead, self).__init__() + self.char_num = out_channels + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_TUs + self.num_decoder_TUs = num_decoder_TUs + self.hidden_dims = hidden_dims + + self.pvam = PVAM( + in_channels=in_channels, + char_num=self.char_num, + max_text_length=self.max_length, + num_heads=self.num_heads, + num_encoder_tus=self.num_encoder_TUs, + hidden_dims=self.hidden_dims) + + self.gsrm = GSRM( + in_channels=in_channels, + char_num=self.char_num, + max_text_length=self.max_length, + 
num_heads=self.num_heads, + num_encoder_tus=self.num_encoder_TUs, + num_decoder_tus=self.num_decoder_TUs, + hidden_dims=self.hidden_dims) + self.vsfd = VSFD(in_channels=in_channels, char_num=self.char_num) + + self.gsrm.wrap_encoder1.prepare_decoder.emb0 = self.gsrm.wrap_encoder0.prepare_decoder.emb0 + + def forward(self, inputs, others): + encoder_word_pos = others[0] + gsrm_word_pos = others[1].type(torch.long) + gsrm_slf_attn_bias1 = others[2] + gsrm_slf_attn_bias2 = others[3] + + pvam_feature = self.pvam(inputs, encoder_word_pos, gsrm_word_pos) + + gsrm_feature, word_out, gsrm_out = self.gsrm( + pvam_feature, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2) + + final_out = self.vsfd(pvam_feature, gsrm_feature) + if not self.training: + final_out = F.softmax(final_out, dim=1) + + _, decoded_out = torch.topk(final_out, k=1) + + predicts = OrderedDict([ + ('predict', final_out), + ('pvam_feature', pvam_feature), + ('decoded_out', decoded_out), + ('word_out', word_out), + ('gsrm_out', gsrm_out), + ]) + + return predicts diff --git a/batch_running_task/pytorchocr/modeling/heads/self_attention.py b/batch_running_task/pytorchocr/modeling/heads/self_attention.py new file mode 100644 index 0000000..3d37c60 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/self_attention.py @@ -0,0 +1,419 @@ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +import numpy as np +gradient_clip = 10 + + +class WrapEncoderForFeature(nn.Module): + def __init__(self, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=0): + super(WrapEncoderForFeature, self).__init__() + + self.prepare_encoder = PrepareEncoder( + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name="src_word_emb_table") + self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd) + + def forward(self, enc_inputs): + conv_features, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self.prepare_encoder(conv_features, src_pos) + enc_output = self.encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class WrapEncoder(nn.Module): + """ + embedder + encoder + """ + + def __init__(self, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=0): + super(WrapEncoder, self).__init__() + + self.prepare_decoder = PrepareDecoder( + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx) + self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd) + + def forward(self, enc_inputs): + src_word, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self.prepare_decoder(src_word, src_pos) + enc_output = self.encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class Encoder(nn.Module): + """ + encoder + """ + + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + 
d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(Encoder, self).__init__() + + self.encoder_layers = nn.ModuleList() + for i in range(n_layer): + encoderLayer = EncoderLayer(n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, + postprocess_cmd) + self.encoder_layers.add_module("layer_%d" % i, encoderLayer) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + for encoder_layer in self.encoder_layers: + enc_output = encoder_layer(enc_input, attn_bias) + enc_input = enc_output + enc_output = self.processer(enc_output) + + return enc_output + + +class EncoderLayer(nn.Module): + """ + EncoderLayer + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderLayer, self).__init__() + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class MultiHeadAttention(nn.Module): + """ + Multi-Head Attention + """ + + def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.d_model = d_model + self.dropout_rate = dropout_rate + self.q_fc = torch.nn.Linear( + in_features=d_model, out_features=d_key * n_head, bias=False) + self.k_fc = torch.nn.Linear( + in_features=d_model, out_features=d_key * n_head, bias=False) + self.v_fc = torch.nn.Linear( + in_features=d_model, out_features=d_value * n_head, bias=False) + self.proj_fc = torch.nn.Linear( + in_features=d_value * n_head, out_features=d_model, bias=False) + + def _prepare_qkv(self, queries, keys, values, cache=None): + if keys is None: # self-attention + keys, values = queries, queries + static_kv = False + else: # cross-attention + static_kv = True + + q = self.q_fc(queries) + q = torch.reshape(q, shape=[q.size(0), q.size(1), self.n_head, self.d_key]) + q = q.permute(0, 2, 1, 3) + + if cache is not None and static_kv and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k = cache["static_k"] + v = cache["static_v"] + else: + k = self.k_fc(keys) + v = self.v_fc(values) + k = torch.reshape(k, shape=[k.size(0), k.size(1), self.n_head, self.d_key]) + k = k.permute(0, 2, 1, 3) + v = torch.reshape(v, shape=[v.size(0), v.size(1), self.n_head, self.d_value]) + v = v.permute(0, 2, 1, 3) + + if cache is not None: + if static_kv and not "static_k" in cache: + # for encoder-decoder attention in inference and has 
not cached + cache["static_k"], cache["static_v"] = k, v + elif not static_kv: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = torch.cat([cache_k, k], dim=2) + v = torch.cat([cache_v, v], dim=2) + cache["k"], cache["v"] = k, v + + return q, k, v + + def forward(self, queries, keys, values, attn_bias, cache=None): + # compute q ,k ,v + keys = queries if keys is None else keys + values = keys if values is None else values + q, k, v = self._prepare_qkv(queries, keys, values, cache) + + # scale dot product attention + product = torch.matmul(q, k.transpose(2, 3)) + product = product * self.d_model**-0.5 + if attn_bias is not None: + product += attn_bias + weights = F.softmax(product, dim=-1) + if self.dropout_rate: + weights = F.dropout( + weights, p=self.dropout_rate) + out = torch.matmul(weights, v) + + # combine heads + out = out.permute(0, 2, 1, 3) + out = torch.reshape(out, shape=[out.size(0), out.size(1), out.shape[2] * out.shape[3]]) + + # project to output + out = self.proj_fc(out) + + return out + + +# https://forums.fast.ai/t/lambda-layer/28507/5 +class Lambda(nn.Module): + "An easy way to create a pytorch layer for a simple `func`." + def __init__(self, func): + "create a layer that simply calls `func` with `x`" + super().__init__() + self.func=func + + def forward(self, x): + return self.func(x) + +class LambdaXY(nn.Module): + "An easy way to create a pytorch layer for a simple `func`." + def __init__(self, func): + "create a layer that simply calls `func` with `x`" + super().__init__() + self.func=func + + def forward(self, x, y): + return self.func(x, y) + + +class PrePostProcessLayer(nn.Module): + """ + PrePostProcessLayer + """ + + def __init__(self, process_cmd, d_model, dropout_rate): + super(PrePostProcessLayer, self).__init__() + self.process_cmd = process_cmd + self.functors = nn.ModuleList() + cur_a_len = 0 + cur_n_len = 0 + cur_d_len = 0 + for cmd in self.process_cmd: + if cmd == "a": # add residual connection + self.functors.add_module('add_res_connect_{}'.format(cur_a_len), LambdaXY(lambda x, y: x + y if y is not None else x)) + cur_a_len += 1 + elif cmd == "n": # add layer normalization + layerNorm = torch.nn.LayerNorm(normalized_shape=d_model, + elementwise_affine=True, + eps=1e-5) + self.functors.add_module("layer_norm_%d" % cur_n_len, + layerNorm) + cur_n_len += 1 + + + elif cmd == "d": # add dropout + self.functors.add_module('add_drop_{}'.format(cur_d_len), + Lambda(lambda x: F.dropout( + x, p=dropout_rate) + if dropout_rate else x) + ) + cur_d_len += 1 + + def forward(self, x, residual=None): + for i, (cmd, functor) in enumerate(zip(self.process_cmd, self.functors)): + if cmd == "a": + x = functor(x, residual) + else: + x = functor(x) + + return x + + +class PrepareEncoder(nn.Module): + def __init__(self, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0, + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareEncoder, self).__init__() + self.src_emb_dim = src_emb_dim + self.src_max_len = src_max_len + self.emb = torch.nn.Embedding( + num_embeddings=self.src_max_len, + embedding_dim=self.src_emb_dim, + sparse=True, + ) + self.dropout_rate = dropout_rate + + def forward(self, src_word, src_pos): + src_word_emb = src_word.type(torch.float32) + src_word_emb = self.src_emb_dim**0.5 * src_word_emb + src_pos = torch.squeeze(src_pos, dim=-1) + src_pos_enc = self.emb(src_pos.type(torch.int64)) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + if 
self.dropout_rate: + out = F.dropout( + enc_input, p=self.dropout_rate) + else: + out = enc_input + return out + + +class PrepareDecoder(nn.Module): + def __init__(self, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0, + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareDecoder, self).__init__() + self.src_emb_dim = src_emb_dim + """ + self.emb0 = Embedding(num_embeddings=src_vocab_size, + embedding_dim=src_emb_dim) + """ + self.emb0 = torch.nn.Embedding( + num_embeddings=src_vocab_size, + embedding_dim=self.src_emb_dim, + padding_idx=bos_idx, + ) + self.emb1 = torch.nn.Embedding( + num_embeddings=src_max_len, + embedding_dim=self.src_emb_dim, + ) + self.dropout_rate = dropout_rate + + def forward(self, src_word, src_pos): + src_word = torch.squeeze(src_word.type(torch.int64), dim=-1) + src_word_emb = self.emb0(src_word) + src_word_emb = self.src_emb_dim**0.5 * src_word_emb + src_pos = torch.squeeze(src_pos, dim=-1) + src_pos_enc = self.emb1(src_pos) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + if self.dropout_rate: + out = F.dropout( + enc_input, p=self.dropout_rate) + else: + out = enc_input + return out + + +class FFN(nn.Module): + """ + Feed-Forward Network + """ + + def __init__(self, d_inner_hid, d_model, dropout_rate): + super(FFN, self).__init__() + self.dropout_rate = dropout_rate + self.fc1 = torch.nn.Linear( + in_features=d_model, out_features=d_inner_hid) + self.fc2 = torch.nn.Linear( + in_features=d_inner_hid, out_features=d_model) + + def forward(self, x): + hidden = self.fc1(x) + hidden = F.relu(hidden) + if self.dropout_rate: + hidden = F.dropout( + hidden, p=self.dropout_rate) + out = self.fc2(hidden) + return out diff --git a/batch_running_task/pytorchocr/modeling/heads/sr_rensnet_transformer.py b/batch_running_task/pytorchocr/modeling/heads/sr_rensnet_transformer.py new file mode 100644 index 0000000..92d93d9 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/sr_rensnet_transformer.py @@ -0,0 +1,419 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/loss/transformer_english_decomposition.py +""" +import copy +import math +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def subsequent_mask(size): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). 
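+    Used by Decoder.forward so that each text position cannot attend to later positions.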
+ """ + mask = torch.ones(1, size, size, dtype=torch.float32) + mask_inf = torch.triu( + torch.full( + size=[1, size, size], fill_value=-np.inf, dtype=torch.float32), + diagonal=1) + mask = mask + mask_inf + padding_mask = torch.equal(mask, torch.Tensor(1).type(mask.dtype)) + return padding_mask + + + +def clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) + + +def attention(query, key, value, mask=None, dropout=None, attention_map=None): + d_k = query.shape[-1] + scores = torch.matmul(query, + key.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + scores = scores.masked_fill(mask == 0, float('-inf')) + else: + pass + + p_attn = F.softmax(scores, dim=-1) + + if dropout is not None: + p_attn = dropout(p_attn) + return torch.matmul(p_attn, value), p_attn + + +class MultiHeadedAttention(nn.Module): + def __init__(self, h, d_model, dropout=0.1, compress_attention=False): + super(MultiHeadedAttention, self).__init__() + assert d_model % h == 0 + self.d_k = d_model // h + self.h = h + self.linears = clones(nn.Linear(d_model, d_model), 4) + self.attn = None + self.dropout = nn.Dropout(p=dropout) + self.compress_attention = compress_attention + self.compress_attention_linear = nn.Linear(h, 1) + + def forward(self, query, key, value, mask=None, attention_map=None): + if mask is not None: + mask = mask.unsqueeze(1) + nbatches = query.size(0) + + query, key, value = \ + [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + for l, x in zip(self.linears, (query, key, value))] + + x, attention_map = attention( + query, + key, + value, + mask=mask, + dropout=self.dropout, + attention_map=attention_map) + + x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) + + return self.linears[-1](x), attention_map + + +class ResNet(nn.Module): + def __init__(self, num_in, block, layers): + super(ResNet, self).__init__() + + self.conv1 = nn.Conv2d(num_in, 64, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2d(64) + self.relu1 = nn.ReLU(inplace=True) + self.pool = nn.MaxPool2d((2, 2), (2, 2)) + + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2d(128) + self.relu2 = nn.ReLU(inplace=True) + + self.layer1_pool = nn.MaxPool2d((2, 2), (2, 2)) + self.layer1 = self._make_layer(block, 128, 256, layers[0]) + self.layer1_conv = nn.Conv2d(256, 256, 3, 1, 1) + self.layer1_bn = nn.BatchNorm2d(256) + self.layer1_relu = nn.ReLU(inplace=True) + + self.layer2_pool = nn.MaxPool2d((2, 2), (2, 2)) + self.layer2 = self._make_layer(block, 256, 256, layers[1]) + self.layer2_conv = nn.Conv2d(256, 256, 3, 1, 1) + self.layer2_bn = nn.BatchNorm2d(256) + self.layer2_relu = nn.ReLU(inplace=True) + + self.layer3_pool = nn.MaxPool2d((2, 2), (2, 2)) + self.layer3 = self._make_layer(block, 256, 512, layers[2]) + self.layer3_conv = nn.Conv2d(512, 512, 3, 1, 1) + self.layer3_bn = nn.BatchNorm2d(512) + self.layer3_relu = nn.ReLU(inplace=True) + + self.layer4_pool = nn.MaxPool2d((2, 2), (2, 2)) + self.layer4 = self._make_layer(block, 512, 512, layers[3]) + self.layer4_conv2 = nn.Conv2d(512, 1024, 3, 1, 1) + self.layer4_conv2_bn = nn.BatchNorm2d(1024) + self.layer4_conv2_relu = nn.ReLU(inplace=True) + + def _make_layer(self, block, inplanes, planes, blocks): + + if inplanes != planes: + downsample = nn.Sequential( + nn.Conv2d(inplanes, planes, 3, 1, 1), + nn.BatchNorm2d( + planes), ) + else: + downsample = None + layers = [] + layers.append(block(inplanes, planes, downsample)) + for i in range(1, blocks): + 
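+            # remaining blocks keep the same channel width, so no downsample branch is needed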
layers.append(block(planes, planes, downsample=None)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.pool(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + + x = self.layer1_pool(x) + x = self.layer1(x) + x = self.layer1_conv(x) + x = self.layer1_bn(x) + x = self.layer1_relu(x) + + x = self.layer2(x) + x = self.layer2_conv(x) + x = self.layer2_bn(x) + x = self.layer2_relu(x) + + x = self.layer3(x) + x = self.layer3_conv(x) + x = self.layer3_bn(x) + x = self.layer3_relu(x) + + x = self.layer4(x) + x = self.layer4_conv2(x) + x = self.layer4_conv2_bn(x) + x = self.layer4_conv2_relu(x) + + return x + + +class Bottleneck(nn.Module): + def __init__(self, input_dim): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(input_dim, input_dim, 1) + self.bn1 = nn.BatchNorm2d(input_dim) + self.relu = nn.ReLU() + + self.conv2 = nn.Conv2d(input_dim, input_dim, 3, 1, 1) + self.bn2 = nn.BatchNorm2d(input_dim) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class PositionalEncoding(nn.Module): + "Implement the PE function." + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, dim) + position = torch.arange(0, max_len).unsqueeze(1).float() + div_term = torch.exp( + torch.arange(0, dim, 2).float() * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :x.size(1)] + return self.dropout(x) + + +class PositionwiseFeedForward(nn.Module): + "Implements FFN equation." + + def __init__(self, d_model, d_ff, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + + +class Generator(nn.Module): + "Define standard linear + softmax generation step." + + def __init__(self, d_model, vocab): + super(Generator, self).__init__() + self.proj = nn.Linear(d_model, vocab) + self.relu = nn.ReLU() + + def forward(self, x): + out = self.proj(x) + return out + + +class Embeddings(nn.Module): + def __init__(self, d_model, vocab): + super(Embeddings, self).__init__() + self.lut = nn.Embedding(vocab, d_model) + self.d_model = d_model + + def forward(self, x): + embed = self.lut(x) * math.sqrt(self.d_model) + return embed + + +class LayerNorm(nn.Module): + "Construct a layernorm module (See citation for details)." 
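+    # normalizes over the last dimension with learnable gain (a_2) and bias (b_2)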
+ + def __init__(self, features, eps=1e-6): + super(LayerNorm, self).__init__() + self.a_2 = nn.parameter.Parameter(torch.ones(features)) + self.b_2 = nn.parameter.Parameter(torch.zeros(features)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 + + +class Decoder(nn.Module): + def __init__(self): + super(Decoder, self).__init__() + + self.mask_multihead = MultiHeadedAttention( + h=16, d_model=1024, dropout=0.1) + self.mul_layernorm1 = LayerNorm(1024) + + self.multihead = MultiHeadedAttention(h=16, d_model=1024, dropout=0.1) + self.mul_layernorm2 = LayerNorm(1024) + + self.pff = PositionwiseFeedForward(1024, 2048) + self.mul_layernorm3 = LayerNorm(1024) + + def forward(self, text, conv_feature, attention_map=None): + text_max_length = text.shape[1] + mask = subsequent_mask(text_max_length) + result = text + result = self.mul_layernorm1(result + self.mask_multihead( + text, text, text, mask=mask)[0]) + b, c, h, w = conv_feature.shape + conv_feature = conv_feature.view(b, c, h * w).permute(0, 2, 1).contiguous() + word_image_align, attention_map = self.multihead( + result, + conv_feature, + conv_feature, + mask=None, + attention_map=attention_map) + result = self.mul_layernorm2(result + word_image_align) + result = self.mul_layernorm3(result + self.pff(result)) + + return result, attention_map + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, downsample): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + inplanes, planes, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample != None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + + return out + + +class Encoder(nn.Module): + def __init__(self): + super(Encoder, self).__init__() + self.cnn = ResNet(num_in=1, block=BasicBlock, layers=[1, 2, 5, 3]) + + def forward(self, input): + conv_result = self.cnn(input) + return conv_result + + +class Transformer(nn.Module): + def __init__(self, in_channels=1, alphabet='0123456789'): + super(Transformer, self).__init__() + self.alphabet = alphabet + word_n_class = self.get_alphabet_len() + self.embedding_word_with_upperword = Embeddings(512, word_n_class) + self.pe = PositionalEncoding(dim=512, dropout=0.1, max_len=5000) + + self.encoder = Encoder() + self.decoder = Decoder() + self.generator_word_with_upperword = Generator(1024, word_n_class) + + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def get_alphabet_len(self): + return len(self.alphabet) + + def forward(self, image, text_length, text_input, attention_map=None): + if image.shape[1] == 3: + R = image[:, 0:1, :, :] + G = image[:, 1:2, :, :] + B = image[:, 2:3, :, :] + image = 0.299 * R + 0.587 * G + 0.114 * B + + conv_feature = self.encoder(image) # batch, 1024, 8, 32 + max_length = max(text_length) + text_input = text_input[:, :max_length] + + text_embedding = self.embedding_word_with_upperword( + text_input) # batch, text_max_length, 512 + if torch.cuda.is_available(): + postion_embedding = self.pe( + torch.zeros(text_embedding.shape).cuda()).cuda() + else: + 
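+            # CPU path: same positional encoding without the .cuda() transfers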
postion_embedding = self.pe( + torch.zeros(text_embedding.shape)) # batch, text_max_length, 512 + text_input_with_pe = torch.cat([text_embedding, postion_embedding], 2) # batch, text_max_length, 1024 + batch, seq_len, _ = text_input_with_pe.shape + + text_input_with_pe, word_attention_map = self.decoder( + text_input_with_pe, conv_feature) + + word_decoder_result = self.generator_word_with_upperword( + text_input_with_pe) + + if self.training: + total_length = torch.sum(text_length).data + probs_res = torch.zeros([total_length, self.get_alphabet_len()]).type_as(word_decoder_result.data) + start = 0 + + for index, length in enumerate(text_length): + length = int(length.numpy()) + probs_res[start:start + length, :] = word_decoder_result[ + index, 0:0 + length, :] + + start = start + length + + return probs_res, word_attention_map, None + else: + return word_decoder_result diff --git a/batch_running_task/pytorchocr/modeling/heads/table_att_head.py b/batch_running_task/pytorchocr/modeling/heads/table_att_head.py new file mode 100644 index 0000000..a2a97ee --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/heads/table_att_head.py @@ -0,0 +1,207 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +class TableAttentionHead(nn.Module): + def __init__(self, in_channels, hidden_size, loc_type, in_max_len=488, **kwargs): + super(TableAttentionHead, self).__init__() + self.input_size = in_channels[-1] + self.hidden_size = hidden_size + self.elem_num = 30 + self.max_text_length = 100 + self.max_elem_length = kwargs.get('max_elem_length', 500) + self.max_cell_num = 500 + + self.structure_attention_cell = AttentionGRUCell( + self.input_size, hidden_size, self.elem_num, use_gru=False) + self.structure_generator = nn.Linear(hidden_size, self.elem_num) + self.loc_type = loc_type + self.in_max_len = in_max_len + + if self.loc_type == 1: + self.loc_generator = nn.Linear(hidden_size, 4) + else: + if self.in_max_len == 640: + self.loc_fea_trans = nn.Linear(400, self.max_elem_length + 1) + elif self.in_max_len == 800: + self.loc_fea_trans = nn.Linear(625, self.max_elem_length + 1) + else: + self.loc_fea_trans = nn.Linear(256, self.max_elem_length + 1) + self.loc_generator = nn.Linear(self.input_size + hidden_size, 4) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char.type(torch.int64), onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None): + # if and else branch are both needed when you want to assign a variable + # if you modify the var in just one branch, then the modification will not work. 
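+        # use the last (deepest) feature map from the input feature list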
+ fea = inputs[-1] + if len(fea.shape) == 3: + pass + else: + last_shape = int(np.prod(fea.shape[2:])) # gry added + fea = torch.reshape(fea, [fea.shape[0], fea.shape[1], last_shape]) + # fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + fea = fea.permute(0, 2, 1) + batch_size = fea.shape[0] + + hidden = torch.zeros((batch_size, self.hidden_size)) + output_hiddens = [] + if self.training and targets is not None: + raise NotImplementedError + else: + temp_elem = torch.zeros([batch_size], dtype=torch.int32) + structure_probs = None + loc_preds = None + elem_onehots = None + outputs = None + alpha = None + max_elem_length = torch.as_tensor(self.max_elem_length) + i = 0 + while i < max_elem_length + 1: + elem_onehots = self._char_to_onehot( + temp_elem, onehot_dim=self.elem_num) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots) + output_hiddens.append(torch.unsqueeze(outputs, dim=1)) + structure_probs_step = self.structure_generator(outputs) + temp_elem = structure_probs_step.argmax(dim=1, keepdim=False) + i += 1 + + output = torch.cat(output_hiddens, dim=1) + structure_probs = self.structure_generator(output) + structure_probs = F.softmax(structure_probs, dim=-1) + if self.loc_type == 1: + loc_preds = self.loc_generator(output) + loc_preds = F.sigmoid(loc_preds) + else: + loc_fea = fea.permute(0, 2, 1) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.permute(0, 2, 1) + loc_concat = torch.cat([output, loc_fea], dim=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + return {'structure_probs': structure_probs, 'loc_preds': loc_preds} + + +class AttentionGRUCell(nn.Module): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias=False) + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = torch.unsqueeze(self.h2h(prev_hidden), dim=1) + res = torch.add(batch_H_proj, prev_hidden_proj) + res = torch.tanh(res) + e = self.score(res) + alpha = F.softmax(e, dim=1) + alpha = alpha.permute(0, 2, 1) + context = torch.squeeze(torch.matmul(alpha, batch_H), dim=1) + concat_context = torch.cat([context, char_onehots.float()], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + return (cur_hidden, cur_hidden), alpha + + +class AttentionLSTM(nn.Module): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionLSTM, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = (torch.zeros((batch_size, self.hidden_size)), torch.zeros( + (batch_size, self.hidden_size))) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + # one-hot vectors for a i-th char + char_onehots = 
self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + + hidden = (hidden[1][0], hidden[1][1]) + output_hiddens.append(torch.unsqueeze(hidden[0], dim=1)) + output = torch.cat(output_hiddens, dim=1) + probs = self.generator(output) + + else: + targets = torch.zeros([batch_size], dtype=torch.int32) + probs = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(hidden[0]) + hidden = (hidden[1][0], hidden[1][1]) + if probs is None: + probs = torch.unsqueeze(probs_step, dim=1) + else: + probs = torch.cat( + [probs, torch.unsqueeze( + probs_step, dim=1)], dim=1) + + next_input = probs_step.argmax(dim=1) + + targets = next_input + + return probs + + +class AttentionLSTMCell(nn.Module): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = torch.unsqueeze(self.h2h(prev_hidden[0]), dim=1) + res = torch.add(batch_H_proj, prev_hidden_proj) + res = torch.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, dim=1) + alpha = alpha.permute(0, 2, 1) + context = torch.squeeze(torch.matmul(alpha, batch_H), dim=1) + concat_context = torch.cat([context, char_onehots.float()], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + + return (cur_hidden, cur_hidden), alpha \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/necks/__init__.py b/batch_running_task/pytorchocr/modeling/necks/__init__.py new file mode 100644 index 0000000..e525810 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
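+# build_neck() maps the config 'name' entry to one of the supported neck classes below.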
+ +__all__ = ['build_neck'] + +def build_neck(config): + from .db_fpn import DBFPN, RSEFPN, LKPAN + from .east_fpn import EASTFPN + from .sast_fpn import SASTFPN + from .rnn import SequenceEncoder + from .pg_fpn import PGFPN + from .fpn import FPN + from .fce_fpn import FCEFPN + from .table_fpn import TableFPN + support_dict = ['FPN', 'DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN', + 'RSEFPN', 'LKPAN', 'FCEFPN'] + + module_name = config.pop('name') + assert module_name in support_dict, Exception('neck only support {}'.format( + support_dict)) + module_class = eval(module_name)(**config) + return module_class \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/necks/db_fpn.py b/batch_running_task/pytorchocr/modeling/necks/db_fpn.py new file mode 100644 index 0000000..6d98ad0 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/db_fpn.py @@ -0,0 +1,414 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.backbones.det_mobilenet_v3 import SEModule +from pytorchocr.modeling.necks.intracl import IntraCLBlock + +def hard_swish(x, inplace=True): + return x * F.relu6(x + 3., inplace=inplace) / 6. + +class DSConv(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding, + stride=1, + groups=None, + if_act=True, + act="relu", + **kwargs): + super(DSConv, self).__init__() + if groups == None: + groups = in_channels + self.if_act = if_act + self.act = act + self.conv1 = nn.Conv2d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + + self.bn1 = nn.BatchNorm2d(in_channels) + + self.conv2 = nn.Conv2d( + in_channels=in_channels, + out_channels=int(in_channels * 4), + kernel_size=1, + stride=1, + bias=False) + + self.bn2 = nn.BatchNorm2d(int(in_channels * 4)) + + self.conv3 = nn.Conv2d( + in_channels=int(in_channels * 4), + out_channels=out_channels, + kernel_size=1, + stride=1, + bias=False) + self._c = [in_channels, out_channels] + if in_channels != out_channels: + self.conv_end = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + bias=False) + + def forward(self, inputs): + + x = self.conv1(inputs) + x = self.bn1(x) + + x = self.conv2(x) + x = self.bn2(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = hard_swish(x) + else: + print("The activation function({}) is selected incorrectly.". 
+ format(self.act)) + exit() + + x = self.conv3(x) + if self._c[0] != self._c[1]: + x = x + self.conv_end(inputs) + return x + + +class DBFPN(nn.Module): + def __init__(self, in_channels, out_channels, use_asf=False, **kwargs): + super(DBFPN, self).__init__() + self.out_channels = out_channels + self.use_asf = use_asf + + self.in2_conv = nn.Conv2d( + in_channels=in_channels[0], + out_channels=self.out_channels, + kernel_size=1, + bias=False) + self.in3_conv = nn.Conv2d( + in_channels=in_channels[1], + out_channels=self.out_channels, + kernel_size=1, + bias=False) + self.in4_conv = nn.Conv2d( + in_channels=in_channels[2], + out_channels=self.out_channels, + kernel_size=1, + bias=False) + self.in5_conv = nn.Conv2d( + in_channels=in_channels[3], + out_channels=self.out_channels, + kernel_size=1, + bias=False) + self.p5_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + bias=False) + self.p4_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + bias=False) + self.p3_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + bias=False) + self.p2_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + bias=False) + + if self.use_asf is True: + self.asf = ASFBlock(self.out_channels, self.out_channels // 4) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.in5_conv(c5) + in4 = self.in4_conv(c4) + in3 = self.in3_conv(c3) + in2 = self.in2_conv(c2) + + out4 = in4 + F.interpolate( + in5, scale_factor=2, mode="nearest", )#align_mode=1) # 1/16 + out3 = in3 + F.interpolate( + out4, scale_factor=2, mode="nearest", )#align_mode=1) # 1/8 + out2 = in2 + F.interpolate( + out3, scale_factor=2, mode="nearest", )#align_mode=1) # 1/4 + + p5 = self.p5_conv(in5) + p4 = self.p4_conv(out4) + p3 = self.p3_conv(out3) + p2 = self.p2_conv(out2) + p5 = F.interpolate(p5, scale_factor=8, mode="nearest", )#align_mode=1) + p4 = F.interpolate(p4, scale_factor=4, mode="nearest", )#align_mode=1) + p3 = F.interpolate(p3, scale_factor=2, mode="nearest", )#align_mode=1) + + fuse = torch.cat([p5, p4, p3, p2], dim=1) + + if self.use_asf is True: + fuse = self.asf(fuse, [p5, p4, p3, p2]) + + return fuse + + +class RSELayer(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, shortcut=True): + super(RSELayer, self).__init__() + self.out_channels = out_channels + self.in_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=self.out_channels, + kernel_size=kernel_size, + padding=int(kernel_size // 2), + bias=False) + self.se_block = SEModule(self.out_channels) + self.shortcut = shortcut + + def forward(self, ins): + x = self.in_conv(ins) + if self.shortcut: + out = x + self.se_block(x) + else: + out = self.se_block(x) + return out + + +class RSEFPN(nn.Module): + def __init__(self, in_channels, out_channels, shortcut=True, **kwargs): + super(RSEFPN, self).__init__() + self.out_channels = out_channels + self.ins_conv = nn.ModuleList() + self.inp_conv = nn.ModuleList() + self.intracl = False + if 'intracl' in kwargs.keys() and kwargs['intracl'] is True: + self.intracl = kwargs['intracl'] + self.incl1 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl2 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl3 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl4 = IntraCLBlock(self.out_channels // 
4, reduce_factor=2) + + for i in range(len(in_channels)): + self.ins_conv.append( + RSELayer( + in_channels[i], + out_channels, + kernel_size=1, + shortcut=shortcut)) + self.inp_conv.append( + RSELayer( + out_channels, + out_channels // 4, + kernel_size=3, + shortcut=shortcut)) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.ins_conv[3](c5) + in4 = self.ins_conv[2](c4) + in3 = self.ins_conv[1](c3) + in2 = self.ins_conv[0](c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest") # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest") # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest") # 1/4 + + p5 = self.inp_conv[3](in5) + p4 = self.inp_conv[2](out4) + p3 = self.inp_conv[1](out3) + p2 = self.inp_conv[0](out2) + + if self.intracl is True: + p5 = self.incl4(p5) + p4 = self.incl3(p4) + p3 = self.incl2(p3) + p2 = self.incl1(p2) + + p5 = F.upsample(p5, scale_factor=8, mode="nearest") + p4 = F.upsample(p4, scale_factor=4, mode="nearest") + p3 = F.upsample(p3, scale_factor=2, mode="nearest") + + fuse = torch.cat([p5, p4, p3, p2], dim=1) + return fuse + + +class LKPAN(nn.Module): + def __init__(self, in_channels, out_channels, mode='large', **kwargs): + super(LKPAN, self).__init__() + self.out_channels = out_channels + + self.ins_conv = nn.ModuleList() + self.inp_conv = nn.ModuleList() + # pan head + self.pan_head_conv = nn.ModuleList() + self.pan_lat_conv = nn.ModuleList() + + if mode.lower() == 'lite': + p_layer = DSConv + elif mode.lower() == 'large': + p_layer = nn.Conv2d + else: + raise ValueError( + "mode can only be one of ['lite', 'large'], but received {}". + format(mode)) + + for i in range(len(in_channels)): + self.ins_conv.append( + nn.Conv2d( + in_channels=in_channels[i], + out_channels=self.out_channels, + kernel_size=1, + bias=False)) + + self.inp_conv.append( + p_layer( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=9, + padding=4, + bias=False)) + + if i > 0: + self.pan_head_conv.append( + nn.Conv2d( + in_channels=self.out_channels // 4, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + stride=2, + bias=False)) + self.pan_lat_conv.append( + p_layer( + in_channels=self.out_channels // 4, + out_channels=self.out_channels // 4, + kernel_size=9, + padding=4, + bias=False)) + self.intracl = False + if 'intracl' in kwargs.keys() and kwargs['intracl'] is True: + self.intracl = kwargs['intracl'] + self.incl1 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl2 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl3 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + self.incl4 = IntraCLBlock(self.out_channels // 4, reduce_factor=2) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.ins_conv[3](c5) + in4 = self.ins_conv[2](c4) + in3 = self.ins_conv[1](c3) + in2 = self.ins_conv[0](c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest") # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest") # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest") # 1/4 + + f5 = self.inp_conv[3](in5) + f4 = self.inp_conv[2](out4) + f3 = self.inp_conv[1](out3) + f2 = self.inp_conv[0](out2) + + pan3 = f3 + self.pan_head_conv[0](f2) + pan4 = f4 + self.pan_head_conv[1](pan3) + pan5 = f5 + self.pan_head_conv[2](pan4) + + p2 = self.pan_lat_conv[0](f2) + p3 = self.pan_lat_conv[1](pan3) + p4 = self.pan_lat_conv[2](pan4) + p5 = self.pan_lat_conv[3](pan5) + + if self.intracl is True: + p5 = 
self.incl4(p5) + p4 = self.incl3(p4) + p3 = self.incl2(p3) + p2 = self.incl1(p2) + + p5 = F.upsample(p5, scale_factor=8, mode="nearest") + p4 = F.upsample(p4, scale_factor=4, mode="nearest") + p3 = F.upsample(p3, scale_factor=2, mode="nearest") + + fuse = torch.cat([p5, p4, p3, p2], dim=1) + return fuse + + +class ASFBlock(nn.Module): + """ + This code is refered from: + https://github.com/MhLiao/DB/blob/master/decoders/feature_attention.py + """ + + def __init__(self, in_channels, inter_channels, out_features_num=4): + """ + Adaptive Scale Fusion (ASF) block of DBNet++ + Args: + in_channels: the number of channels in the input data + inter_channels: the number of middle channels + out_features_num: the number of fused stages + """ + super(ASFBlock, self).__init__() + self.in_channels = in_channels + self.inter_channels = inter_channels + self.out_features_num = out_features_num + self.conv = nn.Conv2d(in_channels, inter_channels, 3, padding=1) + + self.spatial_scale = nn.Sequential( + #Nx1xHxW + nn.Conv2d( + in_channels=1, + out_channels=1, + kernel_size=3, + bias=False, + padding=1, + ), + nn.ReLU(), + nn.Conv2d( + in_channels=1, + out_channels=1, + kernel_size=1, + bias=False, + ), + nn.Sigmoid()) + + self.channel_scale = nn.Sequential( + nn.Conv2d( + in_channels=inter_channels, + out_channels=out_features_num, + kernel_size=1, + bias=False, + ), + nn.Sigmoid()) + + def forward(self, fuse_features, features_list): + fuse_features = self.conv(fuse_features) + spatial_x = torch.mean(fuse_features, dim=1, keepdim=True) + attention_scores = self.spatial_scale(spatial_x) + fuse_features + attention_scores = self.channel_scale(attention_scores) + assert len(features_list) == self.out_features_num + + out_list = [] + for i in range(self.out_features_num): + out_list.append(attention_scores[:, i:i + 1] * features_list[i]) + return torch.cat(out_list, dim=1) diff --git a/batch_running_task/pytorchocr/modeling/necks/east_fpn.py b/batch_running_task/pytorchocr/modeling/necks/east_fpn.py new file mode 100644 index 0000000..e5fdac4 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/east_fpn.py @@ -0,0 +1,184 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +# import paddle +# from paddle import nn +# import paddle.nn.functional as F +# from paddle import ParamAttr + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d( + out_channels,) + self.act = act + if act is not None: + self._act = Activation(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class DeConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + + self.deconv = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + 
kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + self.bn = nn.BatchNorm2d( + out_channels, + ) + self.act = act + if act is not None: + self._act = Activation(act) + + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class EASTFPN(nn.Module): + def __init__(self, in_channels, model_name, **kwargs): + super(EASTFPN, self).__init__() + self.model_name = model_name + if self.model_name == "large": + self.out_channels = 128 + else: + self.out_channels = 64 + self.in_channels = in_channels[::-1] + self.h1_conv = ConvBNLayer( + in_channels=self.out_channels+self.in_channels[1], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_1") + self.h2_conv = ConvBNLayer( + in_channels=self.out_channels+self.in_channels[2], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_2") + self.h3_conv = ConvBNLayer( + in_channels=self.out_channels+self.in_channels[3], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_3") + self.g0_deconv = DeConvBNLayer( + in_channels=self.in_channels[0], + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_0") + self.g1_deconv = DeConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_1") + self.g2_deconv = DeConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_2") + self.g3_conv = ConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_g_3") + + def forward(self, x): + f = x[::-1] + + h = f[0] + g = self.g0_deconv(h) + # h = paddle.concat([g, f[1]], axis=1) + h = torch.cat([g, f[1]], dim=1) + h = self.h1_conv(h) + g = self.g1_deconv(h) + # h = paddle.concat([g, f[2]], axis=1) + h = torch.cat([g, f[2]], dim=1) + h = self.h2_conv(h) + g = self.g2_deconv(h) + # h = paddle.concat([g, f[3]], axis=1) + h = torch.cat([g, f[3]], dim=1) + h = self.h3_conv(h) + g = self.g3_conv(h) + + return g \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/necks/fce_fpn.py b/batch_running_task/pytorchocr/modeling/necks/fce_fpn.py new file mode 100644 index 0000000..8111c9b --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/fce_fpn.py @@ -0,0 +1,323 @@ +""" +This code is refer from: +https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/ppdet/modeling/necks/fpn.py +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import xavier_normal_ +from torch.nn.init import xavier_uniform_ +# import paddle.nn as nn +# import paddle.nn.functional as F +# from paddle import ParamAttr +# from paddle.nn.initializer import XavierUniform +# from paddle.nn.initializer import Normal +# from paddle.regularizer import L2Decay + +__all__ = ['FCEFPN'] + + +class ConvNormLayer(nn.Module): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + norm_type='bn', + norm_decay=0., + norm_groups=32, + lr_scale=1., + freeze_norm=False, + initializer=None): + super(ConvNormLayer, self).__init__() + 
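+        # conv followed by BN / SyncBN / GroupNorm, chosen by norm_type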
assert norm_type in ['bn', 'sync_bn', 'gn'] + + bias_attr = False + + self.conv = nn.Conv2d( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias=bias_attr) + + norm_lr = 0. if freeze_norm else 1. + # param_attr = ParamAttr( + # learning_rate=norm_lr, + # regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + # bias_attr = ParamAttr( + # learning_rate=norm_lr, + # regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + if norm_type == 'bn': + self.norm = nn.BatchNorm2d( + ch_out, + ) + elif norm_type == 'sync_bn': + self.norm = nn.SyncBatchNorm( + ch_out, + ) + elif norm_type == 'gn': + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + affine=bias_attr) + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + return out + + +class FCEFPN(nn.Module): + """ + Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 + Args: + in_channels (list[int]): input channels of each level which can be + derived from the output shape of backbone by from_config + out_channels (list[int]): output channel of each level + spatial_scales (list[float]): the spatial scales between input feature + maps and original input image which can be derived from the output + shape of backbone by from_config + has_extra_convs (bool): whether to add extra conv to the last level. + default False + extra_stage (int): the number of extra stages added to the last level. + default 1 + use_c5 (bool): Whether to use c5 as the input of extra stage, + otherwise p5 is used. default True + norm_type (string|None): The normalization type in FPN module. If + norm_type is None, norm will not be used after conv and if + norm_type is string, bn, gn, sync_bn are available. default None + norm_decay (float): weight decay for normalization layer weights. + default 0. + freeze_norm (bool): whether to freeze normalization layer. + default False + relu_before_extra_convs (bool): whether to add relu before extra convs. + default False + + """ + + def __init__(self, + in_channels, + out_channels, + spatial_scales=[0.25, 0.125, 0.0625, 0.03125], + has_extra_convs=False, + extra_stage=1, + use_c5=True, + norm_type=None, + norm_decay=0., + freeze_norm=False, + relu_before_extra_convs=True): + super(FCEFPN, self).__init__() + self.out_channels = out_channels + for s in range(extra_stage): + spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
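+        # spatial_scales now includes one extra (halved) scale per added stage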
+ self.spatial_scales = spatial_scales + self.has_extra_convs = has_extra_convs + self.extra_stage = extra_stage + self.use_c5 = use_c5 + self.relu_before_extra_convs = relu_before_extra_convs + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + + self.lateral_convs = []#nn.ModuleList() + self.lateral_convs_module = nn.ModuleList() + self.fpn_convs = []#nn.ModuleList() + self.fpn_convs_module = nn.ModuleList() + fan = out_channels * 3 * 3 + + # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone + # 0 <= st_stage < ed_stage <= 3 + st_stage = 4 - len(in_channels) + ed_stage = st_stage + len(in_channels) - 1 + for i in range(st_stage, ed_stage + 1): + if i == 3: + lateral_name = 'fpn_inner_res5_sum' + else: + lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) + in_c = in_channels[i - st_stage] + if self.norm_type is not None: + # self.lateral_convs_module.add_module( + # lateral_name, + # ConvNormLayer( + # ch_in=in_c, + # ch_out=out_channels, + # filter_size=1, + # stride=1, + # norm_type=self.norm_type, + # norm_decay=self.norm_decay, + # freeze_norm=self.freeze_norm, + # initializer=None)) + lateral = ConvNormLayer( + ch_in=in_c, + ch_out=out_channels, + filter_size=1, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=None) + else: + # self.lateral_convs_module.add_module( + # lateral_name, + # nn.Conv2d( + # in_channels=in_c, + # out_channels=out_channels, + # kernel_size=1, + # ) + # ) + lateral = nn.Conv2d( + in_channels=in_c, + out_channels=out_channels, + kernel_size=1, + ) + self.lateral_convs_module.add_module(lateral_name, lateral) + self.lateral_convs.append(lateral) + + for i in range(st_stage, ed_stage + 1): + fpn_name = 'fpn_res{}_sum'.format(i + 2) + fpn_conv_module = nn.Sequential() + if self.norm_type is not None: + # fpn_conv_module.add_module( + # fpn_name, + # ConvNormLayer( + # ch_in=out_channels, + # ch_out=out_channels, + # filter_size=3, + # stride=1, + # norm_type=self.norm_type, + # norm_decay=self.norm_decay, + # freeze_norm=self.freeze_norm, + # initializer=None)) + fpn_conv = ConvNormLayer( + ch_in=out_channels, + ch_out=out_channels, + filter_size=3, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=None) + else: + # fpn_conv_module.add_module( + # fpn_name, + # nn.Conv2d( + # in_channels=out_channels, + # out_channels=out_channels, + # kernel_size=3, + # padding=1, + # ) + # ) + fpn_conv = nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + ) + self.fpn_convs_module.add_module(fpn_name, fpn_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + if self.has_extra_convs: + for i in range(self.extra_stage): + lvl = ed_stage + 1 + i + if i == 0 and self.use_c5: + in_c = in_channels[-1] + else: + in_c = out_channels + extra_fpn_name = 'fpn_{}'.format(lvl + 2) + extra_fpn_conv_module = nn.Sequential() + if self.norm_type is not None: + # extra_fpn_conv_module.add_module( + # extra_fpn_name, + # ConvNormLayer( + # ch_in=in_c, + # ch_out=out_channels, + # filter_size=3, + # stride=2, + # norm_type=self.norm_type, + # norm_decay=self.norm_decay, + # freeze_norm=self.freeze_norm, + # initializer=None)) + extra_fpn_conv = ConvNormLayer( + ch_in=in_c, + ch_out=out_channels, + filter_size=3, + stride=2, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + 
freeze_norm=self.freeze_norm, + initializer=None) + else: + # extra_fpn_conv_module.add_module( + # extra_fpn_name, + # nn.Conv2d( + # in_channels=in_c, + # out_channels=out_channels, + # kernel_size=3, + # stride=2, + # padding=1, + # ) + # ) + extra_fpn_conv = nn.Conv2d( + in_channels=in_c, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1, + ) + + self.fpn_convs_module.add_module(extra_fpn_name, extra_fpn_conv) + self.fpn_convs.append(extra_fpn_conv) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + def forward(self, body_feats): + laterals = [] + num_levels = len(body_feats) + + for i in range(num_levels): + laterals.append(self.lateral_convs[i](body_feats[i])) + + for i in range(1, num_levels): + lvl = num_levels - i + upsample = F.interpolate( + laterals[lvl], + scale_factor=2., + mode='nearest', ) + laterals[lvl - 1] += upsample + + fpn_output = [] + for lvl in range(num_levels): + fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) + + if self.extra_stage > 0: + # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) + if not self.has_extra_convs: + assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' + fpn_output.append(torch.max_pool2d(fpn_output[-1], 1, stride=2)) + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + else: + if self.use_c5: + extra_source = body_feats[-1] + else: + extra_source = fpn_output[-1] + fpn_output.append(self.fpn_convs[num_levels](extra_source)) + + for i in range(1, self.extra_stage): + if self.relu_before_extra_convs: + fpn_output.append(self.fpn_convs[num_levels + i](F.relu( + fpn_output[-1]))) + else: + fpn_output.append(self.fpn_convs[num_levels + i]( + fpn_output[-1])) + return fpn_output diff --git a/batch_running_task/pytorchocr/modeling/necks/fpn.py b/batch_running_task/pytorchocr/modeling/necks/fpn.py new file mode 100644 index 0000000..df1097f --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/fpn.py @@ -0,0 +1,109 @@ +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/neck/fpn.py +""" + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + + +class Conv_BN_ReLU(nn.Module): + def __init__(self, + in_planes, + out_planes, + kernel_size=1, + stride=1, + padding=0): + super(Conv_BN_ReLU, self).__init__() + self.conv = nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False) + self.bn = nn.BatchNorm2d(out_planes, momentum=0.1) + self.relu = nn.ReLU() + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class FPN(nn.Module): + def __init__(self, in_channels, out_channels): + super(FPN, self).__init__() + + # Top layer + self.toplayer_ = Conv_BN_ReLU( + in_channels[3], out_channels, kernel_size=1, stride=1, padding=0) + # Lateral layers + self.latlayer1_ = Conv_BN_ReLU( + in_channels[2], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer2_ = Conv_BN_ReLU( + in_channels[1], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer3_ = Conv_BN_ReLU( + in_channels[0], out_channels, kernel_size=1, 
stride=1, padding=0) + + # Smooth layers + self.smooth1_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth2_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth3_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.out_channels = out_channels * 4 + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def _upsample_add(self, x, y, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + y + + def forward(self, x): + f2, f3, f4, f5 = x + p5 = self.toplayer_(f5) + + f4 = self.latlayer1_(f4) + p4 = self._upsample_add(p5, f4, 2) + p4 = self.smooth1_(p4) + + f3 = self.latlayer2_(f3) + p3 = self._upsample_add(p4, f3, 2) + p3 = self.smooth2_(p3) + + f2 = self.latlayer3_(f2) + p2 = self._upsample_add(p3, f2, 2) + p2 = self.smooth3_(p2) + + p3 = self._upsample(p3, 2) + p4 = self._upsample(p4, 4) + p5 = self._upsample(p5, 8) + + fuse = torch.cat([p2, p3, p4, p5], dim=1) + return fuse diff --git a/batch_running_task/pytorchocr/modeling/necks/intracl.py b/batch_running_task/pytorchocr/modeling/necks/intracl.py new file mode 100644 index 0000000..414b723 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/intracl.py @@ -0,0 +1,114 @@ +from torch import nn + + +class IntraCLBlock(nn.Module): + def __init__(self, in_channels=96, reduce_factor=4): + super(IntraCLBlock, self).__init__() + self.channels = in_channels + self.rf = reduce_factor + self.conv1x1_reduce_channel = nn.Conv2d( + self.channels, + self.channels // self.rf, + kernel_size=1, + stride=1, + padding=0) + self.conv1x1_return_channel = nn.Conv2d( + self.channels // self.rf, + self.channels, + kernel_size=1, + stride=1, + padding=0) + + self.v_layer_7x1 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(7, 1), + stride=(1, 1), + padding=(3, 0)) + self.v_layer_5x1 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(5, 1), + stride=(1, 1), + padding=(2, 0)) + self.v_layer_3x1 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(3, 1), + stride=(1, 1), + padding=(1, 0)) + + self.q_layer_1x7 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(1, 7), + stride=(1, 1), + padding=(0, 3)) + self.q_layer_1x5 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(1, 5), + stride=(1, 1), + padding=(0, 2)) + self.q_layer_1x3 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(1, 3), + stride=(1, 1), + padding=(0, 1)) + + # base + self.c_layer_7x7 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(7, 7), + stride=(1, 1), + padding=(3, 3)) + self.c_layer_5x5 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(5, 5), + stride=(1, 1), + padding=(2, 2)) + self.c_layer_3x3 = nn.Conv2d( + self.channels // self.rf, + self.channels // self.rf, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1)) + + self.bn = nn.BatchNorm2d(self.channels) + self.relu = nn.ReLU() + + def forward(self, x): + x_new = self.conv1x1_reduce_channel(x) + + x_7_c = self.c_layer_7x7(x_new) + 
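+        # each scale combines a square conv with vertical (kx1) and horizontal (1xk) strip convs; the summed result feeds the next, smaller scale (7 -> 5 -> 3)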
x_7_v = self.v_layer_7x1(x_new) + x_7_q = self.q_layer_1x7(x_new) + x_7 = x_7_c + x_7_v + x_7_q + + x_5_c = self.c_layer_5x5(x_7) + x_5_v = self.v_layer_5x1(x_7) + x_5_q = self.q_layer_1x5(x_7) + x_5 = x_5_c + x_5_v + x_5_q + + x_3_c = self.c_layer_3x3(x_5) + x_3_v = self.v_layer_3x1(x_5) + x_3_q = self.q_layer_1x3(x_5) + x_3 = x_3_c + x_3_v + x_3_q + + x_relation = self.conv1x1_return_channel(x_3) + + x_relation = self.bn(x_relation) + x_relation = self.relu(x_relation) + + return x + x_relation + + +def build_intraclblock_list(num_block): + IntraCLBlock_list = nn.ModuleList() + for i in range(num_block): + IntraCLBlock_list.append(IntraCLBlock()) + + return IntraCLBlock_list \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/necks/pg_fpn.py b/batch_running_task/pytorchocr/modeling/necks/pg_fpn.py new file mode 100644 index 0000000..ec48d3c --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/pg_fpn.py @@ -0,0 +1,297 @@ + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm2d(out_channels) + self.act = act + if self.act is not None: + self._act = Activation(act_type=self.act, inplace=True) + + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class DeConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size=4, + stride=2, + padding=1, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + + self.if_act = if_act + self.act = act + self.deconv = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + self.act = act + if self.act is not None: + self._act = Activation(act_type=self.act, inplace=True) + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class PGFPN(nn.Module): + def __init__(self, in_channels, **kwargs): + super(PGFPN, self).__init__() + num_inputs = [2048, 2048, 1024, 512, 256] + num_outputs = [256, 256, 192, 192, 128] + self.out_channels = 128 + self.conv_bn_layer_1 = ConvBNLayer( + in_channels=3, + out_channels=32, + kernel_size=3, + stride=1, + act=None, + name='FPN_d1') + self.conv_bn_layer_2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + act=None, + name='FPN_d2') + self.conv_bn_layer_3 = ConvBNLayer( + in_channels=256, + out_channels=128, + kernel_size=3, + stride=1, + act=None, + name='FPN_d3') + self.conv_bn_layer_4 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=2, + 
act=None, + name='FPN_d4') + self.conv_bn_layer_5 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name='FPN_d5') + self.conv_bn_layer_6 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=3, + stride=2, + act=None, + name='FPN_d6') + self.conv_bn_layer_7 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=3, + stride=1, + act='relu', + name='FPN_d7') + self.conv_bn_layer_8 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=1, + stride=1, + act=None, + name='FPN_d8') + + self.conv_h0 = ConvBNLayer( + in_channels=num_inputs[0], + out_channels=num_outputs[0], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(0)) + self.conv_h1 = ConvBNLayer( + in_channels=num_inputs[1], + out_channels=num_outputs[1], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(1)) + self.conv_h2 = ConvBNLayer( + in_channels=num_inputs[2], + out_channels=num_outputs[2], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(2)) + self.conv_h3 = ConvBNLayer( + in_channels=num_inputs[3], + out_channels=num_outputs[3], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(3)) + self.conv_h4 = ConvBNLayer( + in_channels=num_inputs[4], + out_channels=num_outputs[4], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(4)) + + self.dconv0 = DeConvBNLayer( + in_channels=num_outputs[0], + out_channels=num_outputs[0 + 1], + name="dconv_{}".format(0)) + self.dconv1 = DeConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[1 + 1], + act=None, + name="dconv_{}".format(1)) + self.dconv2 = DeConvBNLayer( + in_channels=num_outputs[2], + out_channels=num_outputs[2 + 1], + act=None, + name="dconv_{}".format(2)) + self.dconv3 = DeConvBNLayer( + in_channels=num_outputs[3], + out_channels=num_outputs[3 + 1], + act=None, + name="dconv_{}".format(3)) + self.conv_g1 = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[1], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(1)) + self.conv_g2 = ConvBNLayer( + in_channels=num_outputs[2], + out_channels=num_outputs[2], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(2)) + self.conv_g3 = ConvBNLayer( + in_channels=num_outputs[3], + out_channels=num_outputs[3], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(3)) + self.conv_g4 = ConvBNLayer( + in_channels=num_outputs[4], + out_channels=num_outputs[4], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(4)) + self.convf = ConvBNLayer( + in_channels=num_outputs[4], + out_channels=num_outputs[4], + kernel_size=1, + stride=1, + act=None, + name="conv_f{}".format(4)) + + def forward(self, x): + c0, c1, c2, c3, c4, c5, c6 = x + # FPN_Down_Fusion + f = [c0, c1, c2] + g = [None, None, None] + h = [None, None, None] + h[0] = self.conv_bn_layer_1(f[0]) + h[1] = self.conv_bn_layer_2(f[1]) + h[2] = self.conv_bn_layer_3(f[2]) + + g[0] = self.conv_bn_layer_4(h[0]) + g[1] = torch.add(g[0], h[1]) + g[1] = F.relu(g[1]) + g[1] = self.conv_bn_layer_5(g[1]) + g[1] = self.conv_bn_layer_6(g[1]) + + g[2] = torch.add(g[1], h[2]) + g[2] = F.relu(g[2]) + g[2] = self.conv_bn_layer_7(g[2]) + f_down = self.conv_bn_layer_8(g[2]) + + # FPN UP Fusion + f1 = [c6, c5, c4, c3, c2] + g = [None, None, None, None, None] + h = [None, None, None, None, None] + h[0] = self.conv_h0(f1[0]) + h[1] = self.conv_h1(f1[1]) + h[2] = self.conv_h2(f1[2]) + h[3] = self.conv_h3(f1[3]) + h[4] = self.conv_h4(f1[4]) + + g[0] = 
self.dconv0(h[0]) + g[1] = torch.add(g[0], h[1]) + g[1] = F.relu(g[1]) + g[1] = self.conv_g1(g[1]) + g[1] = self.dconv1(g[1]) + + g[2] = torch.add(g[1], h[2]) + g[2] = F.relu(g[2]) + g[2] = self.conv_g2(g[2]) + g[2] = self.dconv2(g[2]) + + g[3] = torch.add(g[2], h[3]) + g[3] = F.relu(g[3]) + g[3] = self.conv_g3(g[3]) + g[3] = self.dconv3(g[3]) + + g[4] = torch.add(g[3], h[4]) + g[4] = F.relu(g[4]) + g[4] = self.conv_g4(g[4]) + f_up = self.convf(g[4]) + f_common = torch.add(f_down, f_up) + f_common = F.relu(f_common) + return f_common diff --git a/batch_running_task/pytorchocr/modeling/necks/rnn.py b/batch_running_task/pytorchocr/modeling/necks/rnn.py new file mode 100644 index 0000000..5a353cb --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/rnn.py @@ -0,0 +1,205 @@ +import os, sys +import torch +import torch.nn as nn +from pytorchocr.modeling.backbones.rec_svtrnet import Block, ConvBNLayer + +class Im2Seq(nn.Module): + def __init__(self, in_channels, **kwargs): + super().__init__() + self.out_channels = in_channels + + def forward(self, x): + B, C, H, W = x.shape + # assert H == 1 + x = x.squeeze(dim=2) + # x = x.transpose([0, 2, 1]) # paddle (NTC)(batch, width, channels) + x = x.permute(0,2,1) + return x + + +class EncoderWithRNN_(nn.Module): + def __init__(self, in_channels, hidden_size): + super(EncoderWithRNN_, self).__init__() + self.out_channels = hidden_size * 2 + self.rnn1 = nn.LSTM(in_channels, hidden_size, bidirectional=False, batch_first=True, num_layers=2) + self.rnn2 = nn.LSTM(in_channels, hidden_size, bidirectional=False, batch_first=True, num_layers=2) + + def forward(self, x): + self.rnn1.flatten_parameters() + self.rnn2.flatten_parameters() + out1, h1 = self.rnn1(x) + out2, h2 = self.rnn2(torch.flip(x, [1])) + return torch.cat([out1, torch.flip(out2, [1])], 2) + + +class EncoderWithRNN(nn.Module): + def __init__(self, in_channels, hidden_size): + super(EncoderWithRNN, self).__init__() + self.out_channels = hidden_size * 2 + self.lstm = nn.LSTM( + in_channels, hidden_size, num_layers=2, batch_first=True, bidirectional=True) # batch_first:=True + + def forward(self, x): + x, _ = self.lstm(x) + return x + + +class EncoderWithFC(nn.Module): + def __init__(self, in_channels, hidden_size): + super(EncoderWithFC, self).__init__() + self.out_channels = hidden_size + self.fc = nn.Linear( + in_channels, + hidden_size, + bias=True, + ) + + def forward(self, x): + x = self.fc(x) + return x + + +class EncoderWithSVTR(nn.Module): + def __init__( + self, + in_channels, + dims=64, # XS + depth=2, + hidden_dims=120, + use_guide=False, + num_heads=8, + qkv_bias=True, + mlp_ratio=2.0, + drop_rate=0.1, + kernel_size=[3,3], + attn_drop_rate=0.1, + drop_path=0., + qk_scale=None): + super(EncoderWithSVTR, self).__init__() + self.depth = depth + self.use_guide = use_guide + self.conv1 = ConvBNLayer( + in_channels, + in_channels // 8, + kernel_size=kernel_size, + padding=[kernel_size[0] // 2, kernel_size[1] // 2], + act='swish') + self.conv2 = ConvBNLayer( + in_channels // 8, hidden_dims, kernel_size=1, act='swish') + + self.svtr_block = nn.ModuleList([ + Block( + dim=hidden_dims, + num_heads=num_heads, + mixer='Global', + HW=None, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer='swish', + attn_drop=attn_drop_rate, + drop_path=drop_path, + norm_layer='nn.LayerNorm', + epsilon=1e-05, + prenorm=False) for i in range(depth) + ]) + self.norm = nn.LayerNorm(hidden_dims, eps=1e-6) + self.conv3 = ConvBNLayer( + hidden_dims, in_channels, 
kernel_size=1, act='swish') + # last conv-nxn, the input is concat of input tensor and conv3 output tensor + self.conv4 = ConvBNLayer( + 2 * in_channels, in_channels // 8, padding=1, act='swish') + + self.conv1x1 = ConvBNLayer( + in_channels // 8, dims, kernel_size=1, act='swish') + self.out_channels = dims + self.apply(self._init_weights) + + def _init_weights(self, m): + # weight initialization + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + def forward(self, x): + # for use guide + if self.use_guide: + z = x.clone() + z.stop_gradient = True + else: + z = x + # for short cut + h = z + # reduce dim + z = self.conv1(z) + z = self.conv2(z) + # SVTR global block + B, C, H, W = z.shape + z = z.flatten(2).permute(0, 2, 1) + + for blk in self.svtr_block: + z = blk(z) + + z = self.norm(z) + # last stage + z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2) + z = self.conv3(z) + z = torch.cat((h, z), dim=1) + z = self.conv1x1(self.conv4(z)) + + return z + + +class SequenceEncoder(nn.Module): + def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs): + super(SequenceEncoder, self).__init__() + self.encoder_reshape = Im2Seq(in_channels) + self.out_channels = self.encoder_reshape.out_channels + self.encoder_type = encoder_type + if encoder_type == 'reshape': + self.only_reshape = True + else: + support_encoder_dict = { + 'reshape': Im2Seq, + 'fc': EncoderWithFC, + 'rnn': EncoderWithRNN, + 'svtr': EncoderWithSVTR, + } + assert encoder_type in support_encoder_dict, '{} must in {}'.format( + encoder_type, support_encoder_dict.keys()) + + if encoder_type == "svtr": + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, **kwargs) + else: + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, hidden_size) + self.out_channels = self.encoder.out_channels + self.only_reshape = False + + def forward(self, x): + if self.encoder_type != 'svtr': + x = self.encoder_reshape(x) + if not self.only_reshape: + x = self.encoder(x) + return x + else: + x = self.encoder(x) + x = self.encoder_reshape(x) + return x \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/necks/sast_fpn.py b/batch_running_task/pytorchocr/modeling/necks/sast_fpn.py new file mode 100644 index 0000000..118467f --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/sast_fpn.py @@ -0,0 +1,305 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +# import paddle +# from paddle import nn +# import paddle.nn.functional as F +# from paddle import ParamAttr + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + # self.conv = 
nn.Conv2D( + # in_channels=in_channels, + # out_channels=out_channels, + # kernel_size=kernel_size, + # stride=stride, + # padding=(kernel_size - 1) // 2, + # groups=groups, + # weight_attr=ParamAttr(name=name + '_weights'), + # bias_attr=False) + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + + self.bn = nn.BatchNorm2d( + out_channels,) + self.act = act + if act is not None: + self._act = Activation(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class DeConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + + self.deconv = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False) + self.bn = nn.BatchNorm2d( + out_channels, + ) + self.act = act + if act is not None: + self._act = Activation(act) + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class FPN_Up_Fusion(nn.Module): + def __init__(self, in_channels): + super(FPN_Up_Fusion, self).__init__() + in_channels = in_channels[::-1] + out_channels = [256, 256, 192, 192, 128] + + self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 1, 1, act=None, name='fpn_up_h0') + self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 1, 1, act=None, name='fpn_up_h1') + self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 1, 1, act=None, name='fpn_up_h2') + self.h3_conv = ConvBNLayer(in_channels[3], out_channels[3], 1, 1, act=None, name='fpn_up_h3') + self.h4_conv = ConvBNLayer(in_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_h4') + + self.g0_conv = DeConvBNLayer(out_channels[0], out_channels[1], 4, 2, act=None, name='fpn_up_g0') + + self.g1_conv = nn.Sequential( + ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_up_g1_1'), + DeConvBNLayer(out_channels[1], out_channels[2], 4, 2, act=None, name='fpn_up_g1_2') + ) + self.g2_conv = nn.Sequential( + ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_up_g2_1'), + DeConvBNLayer(out_channels[2], out_channels[3], 4, 2, act=None, name='fpn_up_g2_2') + ) + self.g3_conv = nn.Sequential( + ConvBNLayer(out_channels[3], out_channels[3], 3, 1, act='relu', name='fpn_up_g3_1'), + DeConvBNLayer(out_channels[3], out_channels[4], 4, 2, act=None, name='fpn_up_g3_2') + ) + + self.g4_conv = nn.Sequential( + ConvBNLayer(out_channels[4], out_channels[4], 3, 1, act='relu', name='fpn_up_fusion_1'), + ConvBNLayer(out_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_fusion_2') + ) + + def _add_relu(self, x1, x2): + # x = paddle.add(x=x1, y=x2) + x = torch.add(x1, x2) + x = F.relu(x) + return x + + def forward(self, x): + f = x[2:][::-1] + h0 = self.h0_conv(f[0]) + h1 = self.h1_conv(f[1]) + h2 = self.h2_conv(f[2]) + h3 = self.h3_conv(f[3]) + h4 = self.h4_conv(f[4]) + + g0 = self.g0_conv(h0) + g1 = self._add_relu(g0, h1) + g1 = self.g1_conv(g1) + g2 = self.g2_conv(self._add_relu(g1, h2)) + g3 = self.g3_conv(self._add_relu(g2, h3)) + g4 = self.g4_conv(self._add_relu(g3, h4)) + + return g4 + + +class FPN_Down_Fusion(nn.Module): + def __init__(self, 
in_channels): + super(FPN_Down_Fusion, self).__init__() + out_channels = [32, 64, 128] + + self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 3, 1, act=None, name='fpn_down_h0') + self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 3, 1, act=None, name='fpn_down_h1') + self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 3, 1, act=None, name='fpn_down_h2') + + self.g0_conv = ConvBNLayer(out_channels[0], out_channels[1], 3, 2, act=None, name='fpn_down_g0') + + self.g1_conv = nn.Sequential( + ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_down_g1_1'), + ConvBNLayer(out_channels[1], out_channels[2], 3, 2, act=None, name='fpn_down_g1_2') + ) + + self.g2_conv = nn.Sequential( + ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_down_fusion_1'), + ConvBNLayer(out_channels[2], out_channels[2], 1, 1, act=None, name='fpn_down_fusion_2') + ) + + def forward(self, x): + f = x[:3] + h0 = self.h0_conv(f[0]) + h1 = self.h1_conv(f[1]) + h2 = self.h2_conv(f[2]) + g0 = self.g0_conv(h0) + # g1 = paddle.add(x=g0, y=h1) + g1 = torch.add(g0, h1) + g1 = F.relu(g1) + g1 = self.g1_conv(g1) + # g2 = paddle.add(x=g1, y=h2) + g2 = torch.add(g1, h2) + g2 = F.relu(g2) + g2 = self.g2_conv(g2) + return g2 + + +class Cross_Attention(nn.Module): + def __init__(self, in_channels): + super(Cross_Attention, self).__init__() + self.theta_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_theta') + self.phi_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_phi') + self.g_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_g') + + self.fh_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_weight') + self.fh_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_sc') + + self.fv_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_weight') + self.fv_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_sc') + + self.f_attn_conv = ConvBNLayer(in_channels * 2, in_channels, 1, 1, act='relu', name='f_attn') + + def _cal_fweight(self, f, shape): + f_theta, f_phi, f_g = f + # flatten + # f_theta = paddle.transpose(f_theta, [0, 2, 3, 1]) + f_theta = f_theta.permute(0, 2, 3, 1) + # f_theta = paddle.reshape(f_theta, [shape[0] * shape[1], shape[2], 128]) + f_theta = torch.reshape(f_theta, [shape[0] * shape[1], shape[2], 128]) + # f_phi = paddle.transpose(f_phi, [0, 2, 3, 1]) + f_phi = f_phi.permute(0, 2, 3, 1) + # f_phi = paddle.reshape(f_phi, [shape[0] * shape[1], shape[2], 128]) + f_phi = torch.reshape(f_phi, [shape[0] * shape[1], shape[2], 128]) + # f_g = paddle.transpose(f_g, [0, 2, 3, 1]) + f_g = f_g.permute(0, 2, 3, 1) + # f_g = paddle.reshape(f_g, [shape[0] * shape[1], shape[2], 128]) + f_g = torch.reshape(f_g, [shape[0] * shape[1], shape[2], 128]) + # correlation + # f_attn = paddle.matmul(f_theta, paddle.transpose(f_phi, [0, 2, 1])) + f_attn = torch.matmul(f_theta, f_phi.permute(0, 2, 1)) + # scale + f_attn = f_attn / (128 ** 0.5) + f_attn = F.softmax(f_attn, dim=-1) + # weighted sum + # f_weight = paddle.matmul(f_attn, f_g) + f_weight = torch.matmul(f_attn, f_g) + # f_weight = paddle.reshape( + # f_weight, [shape[0], shape[1], shape[2], 128]) + f_weight = torch.reshape( + f_weight, [shape[0], shape[1], shape[2], 128]) + return f_weight + + def forward(self, f_common): + # f_shape = paddle.shape(f_common) + f_shape = f_common.size() + # print('f_shape: ', f_shape) + + f_theta = self.theta_conv(f_common) + f_phi = 
self.phi_conv(f_common) + f_g = self.g_conv(f_common) + + ######## horizon ######## + fh_weight = self._cal_fweight([f_theta, f_phi, f_g], + [f_shape[0], f_shape[2], f_shape[3]]) + # fh_weight = paddle.transpose(fh_weight, [0, 3, 1, 2]) + fh_weight = fh_weight.permute(0, 3, 1, 2) + fh_weight = self.fh_weight_conv(fh_weight) + # short cut + fh_sc = self.fh_sc_conv(f_common) + f_h = F.relu(fh_weight + fh_sc) + + ######## vertical ######## + # fv_theta = paddle.transpose(f_theta, [0, 1, 3, 2]) + fv_theta = f_theta.permute(0, 1, 3, 2) + # fv_phi = paddle.transpose(f_phi, [0, 1, 3, 2]) + fv_phi = f_phi.permute(0, 1, 3, 2) + # fv_g = paddle.transpose(f_g, [0, 1, 3, 2]) + fv_g = f_g.permute(0, 1, 3, 2) + fv_weight = self._cal_fweight([fv_theta, fv_phi, fv_g], + [f_shape[0], f_shape[3], f_shape[2]]) + # fv_weight = paddle.transpose(fv_weight, [0, 3, 2, 1]) + fv_weight = fv_weight.permute(0, 3, 2, 1) + fv_weight = self.fv_weight_conv(fv_weight) + # short cut + fv_sc = self.fv_sc_conv(f_common) + f_v = F.relu(fv_weight + fv_sc) + + ######## merge ######## + # f_attn = paddle.concat([f_h, f_v], axis=1) + f_attn = torch.cat([f_h, f_v], dim=1) + f_attn = self.f_attn_conv(f_attn) + return f_attn + + +class SASTFPN(nn.Module): + def __init__(self, in_channels, with_cab=False, **kwargs): + super(SASTFPN, self).__init__() + self.in_channels = in_channels + self.with_cab = with_cab + self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels) + self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels) + self.out_channels = 128 + self.cross_attention = Cross_Attention(self.out_channels) + + def forward(self, x): + # down fpn + f_down = self.FPN_Down_Fusion(x) + + # up fpn + f_up = self.FPN_Up_Fusion(x) + + # fusion + # f_common = paddle.add(x=f_down, y=f_up) + f_common = torch.add(f_down, f_up) + f_common = F.relu(f_common) + + if self.with_cab: + # print('enhence f_common with CAB.') + f_common = self.cross_attention(f_common) + + return f_common \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/necks/table_fpn.py b/batch_running_task/pytorchocr/modeling/necks/table_fpn.py new file mode 100644 index 0000000..1d0ee69 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/necks/table_fpn.py @@ -0,0 +1,88 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation + + +class TableFPN(nn.Module): + def __init__(self, in_channels, out_channels, **kwargs): + super(TableFPN, self).__init__() + self.out_channels = 512 + + self.in2_conv = nn.Conv2d( + in_channels=in_channels[0], + out_channels=self.out_channels, + kernel_size=1, + bias=False) + self.in3_conv = nn.Conv2d( + in_channels=in_channels[1], + out_channels=self.out_channels, + kernel_size=1, + stride = 1, + bias=False) + self.in4_conv = nn.Conv2d( + in_channels=in_channels[2], + out_channels=self.out_channels, + kernel_size=1, + bias=False) + self.in5_conv = nn.Conv2d( + in_channels=in_channels[3], + out_channels=self.out_channels, + kernel_size=1, + bias=False) + self.p5_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + bias=False) + self.p4_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + bias=False) + self.p3_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + 
kernel_size=3, + padding=1, + bias=False) + self.p2_conv = nn.Conv2d( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + bias=False) + self.fuse_conv = nn.Conv2d( + in_channels=self.out_channels * 4, + out_channels=512, + kernel_size=3, + padding=1, + bias=False) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.in5_conv(c5) + in4 = self.in4_conv(c4) + in3 = self.in3_conv(c3) + in2 = self.in2_conv(c2) + + out4 = in4 + F.interpolate( + in5, size=in4.shape[2:4], mode="nearest", )#align_mode=1) # 1/16 + out3 = in3 + F.interpolate( + out4, size=in3.shape[2:4], mode="nearest", )#align_mode=1) # 1/8 + out2 = in2 + F.interpolate( + out3, size=in2.shape[2:4], mode="nearest", )#align_mode=1) # 1/4 + + p4 = F.interpolate(out4, size=in5.shape[2:4], mode="nearest", )#align_mode=1) + p3 = F.interpolate(out3, size=in5.shape[2:4], mode="nearest", )#align_mode=1) + p2 = F.interpolate(out2, size=in5.shape[2:4], mode="nearest", )#align_mode=1) + fuse = torch.cat([in5, p4, p3, p2], dim=1) + fuse_conv = self.fuse_conv(fuse) * 0.005 + return [c5 + fuse_conv] \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/transforms/__init__.py b/batch_running_task/pytorchocr/modeling/transforms/__init__.py new file mode 100644 index 0000000..f038399 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/transforms/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['build_transform'] + + +def build_transform(config): + from .tps import TPS + from .stn import STN_ON + from .tsrn import TSRN + from .tbsrn import TBSRN + + support_dict = ['TPS', 'STN_ON', 'TSRN', 'TBSRN'] + + module_name = config.pop('name') + assert module_name in support_dict, Exception( + 'transform only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/transforms/stn.py b/batch_running_task/pytorchocr/modeling/transforms/stn.py new file mode 100644 index 0000000..dbed2d1 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/transforms/stn.py @@ -0,0 +1,121 @@ +""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/stn_head.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import torch +from torch import nn +from torch.nn import functional as F +import numpy as np + +from .tps_spatial_transformer import TPSSpatialTransformer + + +def conv3x3_block(in_channels, out_channels, stride=1): + n = 3 * 3 * out_channels + w = math.sqrt(2. 
/ n) + conv_layer = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=True) + block = nn.Sequential(conv_layer, nn.BatchNorm2d(out_channels), nn.ReLU()) + return block + + +class STN(nn.Module): + def __init__(self, in_channels, num_ctrlpoints, activation='none'): + super(STN, self).__init__() + self.in_channels = in_channels + self.num_ctrlpoints = num_ctrlpoints + self.activation = activation + self.stn_convnet = nn.Sequential( + conv3x3_block(in_channels, 32), #32x64 + nn.MaxPool2d( + kernel_size=2, stride=2), + conv3x3_block(32, 64), #16x32 + nn.MaxPool2d( + kernel_size=2, stride=2), + conv3x3_block(64, 128), # 8*16 + nn.MaxPool2d( + kernel_size=2, stride=2), + conv3x3_block(128, 256), # 4*8 + nn.MaxPool2d( + kernel_size=2, stride=2), + conv3x3_block(256, 256), # 2*4, + nn.MaxPool2d( + kernel_size=2, stride=2), + conv3x3_block(256, 256)) # 1*2 + self.stn_fc1 = nn.Sequential( + nn.Linear( + 2 * 256, + 512, + bias=True), + nn.BatchNorm1d(512), + nn.ReLU(inplace=True)) + fc2_bias = self.init_stn() + self.stn_fc2 = nn.Linear( + 512, + num_ctrlpoints * 2, + bias=True) + + def init_stn(self): + margin = 0.01 + sampling_num_per_side = int(self.num_ctrlpoints / 2) + ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side) + ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin + ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + ctrl_points = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32) + if self.activation == 'none': + pass + elif self.activation == 'sigmoid': + ctrl_points = -np.log(1. / ctrl_points - 1.) + ctrl_points = torch.Tensor(ctrl_points) + # fc2_bias = ctrl_points.view(-1) + fc2_bias = torch.reshape( + ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]]) + return fc2_bias + + def forward(self, x): + x = self.stn_convnet(x) + batch_size, _, h, w = x.shape + # x = x.view(batch_size, -1) + x = torch.reshape(x, shape=(batch_size, -1)) + img_feat = self.stn_fc1(x) + x = self.stn_fc2(0.1 * img_feat) + if self.activation == 'sigmoid': + x = F.sigmoid(x) + # x = x.view(-1, self.num_ctrlpoints, 2) + x = torch.reshape(x, shape=[-1, self.num_ctrlpoints, 2]) + return img_feat, x + + +class STN_ON(nn.Module): + def __init__(self, in_channels, tps_inputsize, tps_outputsize, + num_control_points, tps_margins, stn_activation): + super(STN_ON, self).__init__() + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + self.stn_head = STN(in_channels=in_channels, + num_ctrlpoints=num_control_points, + activation=stn_activation) + self.tps_inputsize = tps_inputsize + self.out_channels = in_channels + + def forward(self, image): + stn_input = torch.nn.functional.interpolate( + image, self.tps_inputsize, mode="bilinear", align_corners=True) + stn_img_feat, ctrl_points = self.stn_head(stn_input) + x, _ = self.tps(image, ctrl_points) + return x diff --git a/batch_running_task/pytorchocr/modeling/transforms/tbsrn.py b/batch_running_task/pytorchocr/modeling/transforms/tbsrn.py new file mode 100644 index 0000000..8cbaa69 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/transforms/tbsrn.py @@ -0,0 +1,267 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/model/tbsrn.py +""" + +import math +import warnings +import numpy as np +import torch +from torch import nn +import string + +warnings.filterwarnings("ignore") + +from .tps_spatial_transformer import TPSSpatialTransformer +from .stn import STN as STNHead +from .tsrn import GruBlock, mish, UpsampleBLock +from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer, LayerNorm, \ + PositionwiseFeedForward, MultiHeadedAttention + + +def positionalencoding2d(d_model, height, width): + """ + :param d_model: dimension of the model + :param height: height of the positions + :param width: width of the positions + :return: d_model*height*width position matrix + """ + if d_model % 4 != 0: + raise ValueError("Cannot use sin/cos positional encoding with " + "odd dimension (got dim={:d})".format(d_model)) + pe = torch.zeros([d_model, height, width]) + # Each dimension use half of d_model + d_model = int(d_model / 2) + div_term = torch.exp(torch.arange(0., d_model, 2) * + -(math.log(10000.0) / d_model)) + pos_w = torch.arange(0., width, dtype=torch.float32).unsqueeze(1) + pos_h = torch.arange(0., height, dtype=torch.float32).unsqueeze(1) + + pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) + pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) + pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) + pe[d_model + 1::2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) + + return pe + + +class FeatureEnhancer(nn.Module): + + def __init__(self): + super(FeatureEnhancer, self).__init__() + + self.multihead = MultiHeadedAttention(h=4, d_model=128, dropout=0.1) + self.mul_layernorm1 = LayerNorm(features=128) + + self.pff = PositionwiseFeedForward(128, 128) + self.mul_layernorm3 = LayerNorm(features=128) + + self.linear = nn.Linear(128, 64) + + def forward(self, conv_feature): + ''' + text : (batch, seq_len, embedding_size) + global_info: (batch, embedding_size, 1, 1) + conv_feature: (batch, channel, H, W) + ''' + batch = conv_feature.shape[0] + if torch.cuda.is_available(): + position2d = positionalencoding2d(64, 16, 64).float().cuda().unsqueeze(0).reshape([1, 64, 1024]) + else: + position2d = positionalencoding2d(64, 16, 64).float().unsqueeze(0).reshape([1, 64, 1024]) + position2d = position2d.repeat(batch, 1, 1) + conv_feature = torch.cat([conv_feature, position2d], 1) # batch, 128(64+64), 32, 128 + result = conv_feature.permute(0, 2, 1).contiguous() + origin_result = result + result = self.mul_layernorm1(origin_result + self.multihead(result, result, result, mask=None)[0]) + origin_result = result + result = self.mul_layernorm3(origin_result + self.pff(result)) + result = self.linear(result) + return result.permute(0, 2, 1).contiguous() + + +def str_filt(str_, voc_type): + 
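+    # strip characters outside the selected vocabulary and return the result lower-cased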
alpha_dict = { + 'digit': string.digits, + 'lower': string.digits + string.ascii_lowercase, + 'upper': string.digits + string.ascii_letters, + 'all': string.digits + string.ascii_letters + string.punctuation + } + if voc_type == 'lower': + str_ = str_.lower() + for char in str_: + if char not in alpha_dict[voc_type]: + str_ = str_.replace(char, '') + str_ = str_.lower() + return str_ + + +class TBSRN(nn.Module): + def __init__(self, + in_channels=3, + scale_factor=2, + width=128, + height=32, + STN=True, + srb_nums=5, + mask=False, + hidden_units=32, + infer_mode=False): + super(TBSRN, self).__init__() + in_planes = 3 + if mask: + in_planes = 4 + assert math.log(scale_factor, 2) % 1 == 0 + upsample_block_num = int(math.log(scale_factor, 2)) + self.block1 = nn.Sequential( + nn.Conv2d(in_planes, 2 * hidden_units, kernel_size=9, padding=4), + nn.PReLU() + # nn.ReLU() + ) + self.srb_nums = srb_nums + for i in range(srb_nums): + setattr(self, 'block%d' % (i + 2), RecurrentResidualBlock(2 * hidden_units)) + + setattr(self, 'block%d' % (srb_nums + 2), + nn.Sequential( + nn.Conv2d(2 * hidden_units, 2 * hidden_units, kernel_size=3, padding=1), + nn.BatchNorm2d(2 * hidden_units) + )) + + # self.non_local = NonLocalBlock2D(64, 64) + block_ = [UpsampleBLock(2 * hidden_units, 2) for _ in range(upsample_block_num)] + block_.append(nn.Conv2d(2 * hidden_units, in_planes, kernel_size=9, padding=4)) + setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_)) + self.tps_inputsize = [height // scale_factor, width // scale_factor] + tps_outputsize = [height // scale_factor, width // scale_factor] + num_control_points = 20 + tps_margins = [0.05, 0.05] + self.stn = STN + self.out_channels = in_channels + if self.stn: + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + + self.stn_head = STNHead( + in_channels=in_planes, + num_ctrlpoints=num_control_points, + activation='none') + self.infer_mode = infer_mode + + self.english_alphabet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + self.english_dict = {} + for index in range(len(self.english_alphabet)): + self.english_dict[self.english_alphabet[index]] = index + transformer = Transformer(alphabet='-0123456789abcdefghijklmnopqrstuvwxyz') + self.transformer = transformer + for param in self.transformer.parameters(): + param.trainable = False + + def label_encoder(self, label): + batch = len(label) + + length = [len(i) for i in label] + length_tensor = torch.Tensor(length).type(torch.int64) + + max_length = max(length) + input_tensor = np.zeros((batch, max_length)) + for i in range(batch): + for j in range(length[i] - 1): + input_tensor[i][j + 1] = self.english_dict[label[i][j]] + + text_gt = [] + for i in label: + for j in i: + text_gt.append(self.english_dict[j]) + text_gt = torch.Tensor(text_gt).type(torch.int64) + + input_tensor = torch.Tensor(input_tensor).type(torch.int64) + return length_tensor, input_tensor, text_gt + + def forward(self, x): + output = {} + if self.infer_mode: + output["lr_img"] = x + y = x + else: + output["lr_img"] = x[0] + output["hr_img"] = x[1] + y = x[0] + if self.stn and self.training: + _, ctrl_points_x = self.stn_head(y) + y, _ = self.tps(y, ctrl_points_x) + block = {'1': self.block1(y)} + for i in range(self.srb_nums + 1): + block[str(i + 2)] = getattr(self, + 'block%d' % (i + 2))(block[str(i + 1)]) + + block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \ + ((block['1'] + 
block[str(self.srb_nums + 2)])) + + sr_img = torch.tanh(block[str(self.srb_nums + 3)]) + output["sr_img"] = sr_img + + if self.training: + hr_img = x[1] + + # add transformer + label = [str_filt(i, 'lower') + '-' for i in x[2]] + length_tensor, input_tensor, text_gt = self.label_encoder(label) + hr_pred, word_attention_map_gt, hr_correct_list = self.transformer(hr_img, length_tensor, + input_tensor) + sr_pred, word_attention_map_pred, sr_correct_list = self.transformer(sr_img, length_tensor, + input_tensor) + output["hr_img"] = hr_img + output["hr_pred"] = hr_pred + output["text_gt"] = text_gt + output["word_attention_map_gt"] = word_attention_map_gt + output["sr_pred"] = sr_pred + output["word_attention_map_pred"] = word_attention_map_pred + + return output + + +class RecurrentResidualBlock(nn.Module): + def __init__(self, channels): + super(RecurrentResidualBlock, self).__init__() + self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.bn1 = nn.BatchNorm2d(channels) + self.gru1 = GruBlock(channels, channels) + # self.prelu = nn.ReLU() + self.prelu = mish() + self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(channels) + self.gru2 = GruBlock(channels, channels) + self.feature_enhancer = FeatureEnhancer() + + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, x): + residual = self.conv1(x) + residual = self.bn1(residual) + residual = self.prelu(residual) + residual = self.conv2(residual) + residual = self.bn2(residual) + + size = residual.shape + residual = residual.reshape([size[0], size[1], -1]) + residual = self.feature_enhancer(residual) + residual = residual.reshape([size[0], size[1], size[2], size[3]]) + return x + residual \ No newline at end of file diff --git a/batch_running_task/pytorchocr/modeling/transforms/tps.py b/batch_running_task/pytorchocr/modeling/transforms/tps.py new file mode 100644 index 0000000..1fc0cc6 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/transforms/tps.py @@ -0,0 +1,301 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorchocr.modeling.common import Activation +# import paddle +# from paddle import nn, ParamAttr +# from paddle.nn import functional as F +import numpy as np + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False, + ) + bn_name = "bn_" + name + self.bn = nn.BatchNorm2d( + out_channels, ) + self.act = act + if act is not None: + self._act = Activation(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + x = self._act(x) + return x + + +class LocalizationNetwork(nn.Module): + def __init__(self, in_channels, num_fiducial, loc_lr, model_name): + super(LocalizationNetwork, self).__init__() + self.F = num_fiducial + F = num_fiducial + if model_name == "large": + num_filters_list = [64, 128, 256, 512] + fc_dim = 256 + else: + num_filters_list = [16, 32, 64, 128] + fc_dim = 64 + + # self.block_list = [] + self.block_list = nn.Sequential() + for fno in range(0, len(num_filters_list)): + 
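+            # each stage is a 3x3 conv-bn-relu followed by 2x2 max pooling, except the last stage which uses adaptive average pooling to 1x1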
num_filters = num_filters_list[fno] + name = "loc_conv%d" % fno + # conv = self.add_sublayer( + # name, + # ConvBNLayer( + # in_channels=in_channels, + # out_channels=num_filters, + # kernel_size=3, + # act='relu', + # name=name)) + conv = ConvBNLayer( + in_channels=in_channels, + out_channels=num_filters, + kernel_size=3, + act='relu', + name=name) + # self.block_list.append(conv) + self.block_list.add_module(name, conv) + if fno == len(num_filters_list) - 1: + pool = nn.AdaptiveAvgPool2d(1) + else: + # pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + in_channels = num_filters + # self.block_list.append(pool) + self.block_list.add_module('{}_pool'.format(name), pool) + name = "loc_fc1" + stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0) + self.fc1 = nn.Linear( + in_channels, + fc_dim, + bias=True, + ) + + + # Init fc2 in LocalizationNetwork + initial_bias = self.get_initial_fiducials() + initial_bias = initial_bias.reshape(-1) + name = "loc_fc2" + self.fc2 = nn.Linear( + fc_dim, + F * 2, + bias=True + ) + self.out_channels = F * 2 + + def forward(self, x): + """ + Estimating parameters of geometric transformation + Args: + image: input + Return: + batch_C_prime: the matrix of the geometric transformation + """ + B = x.shape[0] + i = 0 + for block in self.block_list: + x = block(x) + x = x.squeeze(dim=2).squeeze(dim=2) + x = self.fc1(x) + + x = F.relu(x) + x = self.fc2(x) + x = x.reshape(shape=[-1, self.F, 2]) + return x + + def get_initial_fiducials(self): + """ see RARE paper Fig. 6 (a) """ + F = self.F + ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2)) + ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2)) + ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2)) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0) + return initial_bias + + +class GridGenerator(nn.Module): + def __init__(self, in_channels, num_fiducial): + super(GridGenerator, self).__init__() + self.eps = 1e-6 + self.F = num_fiducial + + name = "ex_fc" + self.fc = nn.Linear( + in_channels, + 6, + bias=True + ) + + def forward(self, batch_C_prime, I_r_size): + """ + Generate the grid for the grid_sampler. 
+ Args: + batch_C_prime: the matrix of the geometric transformation + I_r_size: the shape of the input image + Return: + batch_P_prime: the grid for the grid_sampler + """ + C = self.build_C_paddle() + P = self.build_P_paddle(I_r_size) + + inv_delta_C_tensor = self.build_inv_delta_C_paddle(C).type(torch.float32) + P_hat_tensor = self.build_P_hat_paddle( + C, torch.as_tensor(P)).type(torch.float32) + + inv_delta_C_tensor.stop_gradient = True + P_hat_tensor.stop_gradient = True + + batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime) + + batch_C_ex_part_tensor.stop_gradient = True + + batch_C_prime_with_zeros = torch.cat( + [batch_C_prime, batch_C_ex_part_tensor], dim=1) + inv_delta_C_tensor = inv_delta_C_tensor.to(batch_C_prime_with_zeros.device) + batch_T = torch.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros) + P_hat_tensor = P_hat_tensor.to(batch_T.device) + batch_P_prime = torch.matmul(P_hat_tensor, batch_T) + return batch_P_prime + + def build_C_paddle(self): + """ Return coordinates of fiducial points in I_r; C """ + F = self.F + ctrl_pts_x = torch.linspace(-1.0, 1.0, int(F / 2), dtype=torch.float64) + ctrl_pts_y_top = -1 * torch.ones([int(F / 2)], dtype=torch.float64) + ctrl_pts_y_bottom = torch.ones([int(F / 2)], dtype=torch.float64) + ctrl_pts_top = torch.stack([ctrl_pts_x, ctrl_pts_y_top], dim=1) + ctrl_pts_bottom = torch.stack([ctrl_pts_x, ctrl_pts_y_bottom], dim=1) + C = torch.cat([ctrl_pts_top, ctrl_pts_bottom], dim=0) + return C # F x 2 + + def build_P_paddle(self, I_r_size): + I_r_height, I_r_width = I_r_size + I_r_grid_x = (torch.arange( + -I_r_width, I_r_width, 2, dtype=torch.float64) + 1.0 + ) / torch.as_tensor(np.array([I_r_width]).astype(np.float64)) + + I_r_grid_y = (torch.arange( + -I_r_height, I_r_height, 2, dtype=torch.float64) + 1.0 + ) / torch.as_tensor(np.array([I_r_height]).astype(np.float64)) + + # P: self.I_r_width x self.I_r_height x 2 + P = torch.stack(torch.meshgrid([I_r_grid_x, I_r_grid_y]), dim=2) + # P = paddle.transpose(P, perm=[1, 0, 2]) + P = P.permute(1, 0, 2) + # n (= self.I_r_width x self.I_r_height) x 2 + return P.reshape([-1, 2]) + + def build_inv_delta_C_paddle(self, C): + """ Return inv_delta_C which is needed to calculate T """ + F = self.F + hat_C = torch.zeros((F, F), dtype=torch.float64) # F x F + for i in range(0, F): + for j in range(i, F): + if i == j: + hat_C[i, j] = 1 + else: + r = torch.norm(C[i] - C[j]) + hat_C[i, j] = r + hat_C[j, i] = r + hat_C = (hat_C**2) * torch.log(hat_C) + delta_C = torch.cat( # F+3 x F+3 + [ + torch.cat( + [torch.ones( + (F, 1), dtype=torch.float64), C, hat_C], dim=1), # F x F+3 + torch.cat( + [ + torch.zeros( + (2, 3), dtype=torch.float64), C.permute(1,0) + ], + dim=1), # 2 x F+3 + torch.cat( + [ + torch.zeros( + (1, 3), dtype=torch.float64), torch.ones( + (1, F), dtype=torch.float64) + ], + dim=1) # 1 x F+3 + ], + dim=0) + inv_delta_C = torch.inverse(delta_C) + return inv_delta_C # F+3 x F+3 + + def build_P_hat_paddle(self, C, P): + F = self.F + eps = self.eps + n = P.shape[0] # n (= self.I_r_width x self.I_r_height) + # P_tile: n x 2 -> n x 1 x 2 -> n x F x 2 + # P_tile = paddle.tile(paddle.unsqueeze(P, axis=1), (1, F, 1)) + P_tile = torch.unsqueeze(P, dim=1).repeat(1, F, 1) + C_tile = torch.unsqueeze(C, dim=0) # 1 x F x 2 + P_diff = P_tile - C_tile # n x F x 2 + # rbf_norm: n x F + rbf_norm = torch.norm(P_diff, p=2, dim=2, keepdim=False) + + # rbf: n x F + # rbf = torch.mul( + # torch.square(rbf_norm), torch.log(rbf_norm + eps)) + rbf = torch.mul( + rbf_norm**2, torch.log(rbf_norm + eps)) + 
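+        # thin-plate-spline radial basis phi(r) = r^2 * log(r); eps avoids log(0) where a grid point coincides with a control point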
P_hat = torch.cat( + [torch.ones( + (n, 1), dtype=torch.float64), P, rbf], dim=1) + return P_hat # n x F+3 + + def get_expand_tensor(self, batch_C_prime): + B, H, C = batch_C_prime.shape + batch_C_prime = batch_C_prime.reshape([B, H * C]) + batch_C_ex_part_tensor = self.fc(batch_C_prime) + batch_C_ex_part_tensor = batch_C_ex_part_tensor.reshape([-1, 3, 2]) + return batch_C_ex_part_tensor + + +class TPS(nn.Module): + def __init__(self, in_channels, num_fiducial, loc_lr, model_name): + super(TPS, self).__init__() + self.loc_net = LocalizationNetwork(in_channels, num_fiducial, loc_lr, + model_name) + self.grid_generator = GridGenerator(self.loc_net.out_channels, + num_fiducial) + self.out_channels = in_channels + + def forward(self, image): + image.stop_gradient = False + batch_C_prime = self.loc_net(image) + batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:]) + batch_P_prime = batch_P_prime.reshape( + [-1, image.shape[2], image.shape[3], 2]) + if torch.__version__ < '1.3.0': + batch_I_r = F.grid_sample(image, grid=batch_P_prime) + else: + batch_I_r = F.grid_sample(image, grid=batch_P_prime, align_corners=True) + return batch_I_r diff --git a/batch_running_task/pytorchocr/modeling/transforms/tps_spatial_transformer.py b/batch_running_task/pytorchocr/modeling/transforms/tps_spatial_transformer.py new file mode 100644 index 0000000..2241307 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/transforms/tps_spatial_transformer.py @@ -0,0 +1,136 @@ +""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/tps_spatial_transformer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import torch +from torch import nn +from torch.nn import functional as F +import numpy as np +import itertools + + +def grid_sample(input, grid, canvas=None): + input.stop_gradient = False + output = F.grid_sample(input, grid, align_corners=True) if torch.__version__ >= '1.3.0' else F.grid_sample(input, grid) + if canvas is None: + return output + else: + # input_mask = paddle.ones(shape=input.shape) + input_mask = input.data.new(input.size()).fill_(1) + output_mask = F.grid_sample(input_mask, grid) + padded_output = output * output_mask + canvas * (1 - output_mask) + return padded_output + + +# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2 +def compute_partial_repr(input_points, control_points): + N = input_points.shape[0] + M = control_points.shape[0] + # pairwise_diff = input_points.view(N, 1, 2) - control_points.view(1, M, 2) + pairwise_diff = torch.reshape( + input_points, shape=[N, 1, 2]) - torch.reshape( + control_points, shape=[1, M, 2]) + # original implementation, very slow + # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance + pairwise_diff_square = pairwise_diff * pairwise_diff + pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, 1] + repr_matrix = 0.5 * pairwise_dist * torch.log(pairwise_dist) + # fix numerical error for 0 * log(0), substitute all nan with 0 + # mask = np.array(repr_matrix != repr_matrix) + # repr_matrix[mask] = 0 + mask = repr_matrix != repr_matrix + repr_matrix.masked_fill_(mask, 0) + return repr_matrix + + +# output_ctrl_pts are specified, according to our task. 
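+# control points are placed evenly along the top and bottom edges of the target image, inset by the given margins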
+def build_output_control_points(num_control_points, margins): + margin_x, margin_y = margins + num_ctrl_pts_per_side = num_control_points // 2 + ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side) + ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y + ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + output_ctrl_pts_arr = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0) + output_ctrl_pts = torch.Tensor(output_ctrl_pts_arr) + return output_ctrl_pts + + +class TPSSpatialTransformer(nn.Module): + def __init__(self, + output_image_size=None, + num_control_points=None, + margins=None): + super(TPSSpatialTransformer, self).__init__() + self.output_image_size = output_image_size + self.num_control_points = num_control_points + self.margins = margins + + self.target_height, self.target_width = output_image_size + target_control_points = build_output_control_points(num_control_points, + margins) + N = num_control_points + + # create padded kernel matrix + forward_kernel = torch.zeros(N + 3, N + 3) + target_control_partial_repr = compute_partial_repr(target_control_points, target_control_points) + forward_kernel[:N, :N].copy_(target_control_partial_repr) + forward_kernel[:N, -3].fill_(1) + forward_kernel[-3, :N].fill_(1) + forward_kernel[:N, -2:].copy_(target_control_points) + forward_kernel[-2:, :N].copy_(target_control_points.transpose(0, 1)) + # compute inverse matrix + inverse_kernel = torch.inverse(forward_kernel) + + # create target cordinate matrix + HW = self.target_height * self.target_width + target_coordinate = list( + itertools.product( + range(self.target_height), range(self.target_width))) + target_coordinate = torch.Tensor(target_coordinate) # HW x 2 + Y, X = target_coordinate.split(1, dim = 1) + Y = Y / (self.target_height - 1) + X = X / (self.target_width - 1) + target_coordinate = torch.cat([X, Y], dim = 1) # convert from (y, x) to (x, y) + target_coordinate_partial_repr = compute_partial_repr( + target_coordinate, target_control_points) + target_coordinate_repr = torch.cat( + [ + target_coordinate_partial_repr, + torch.ones(HW, 1), + target_coordinate + ], + dim=1) + + # register precomputed matrices + self.inverse_kernel = inverse_kernel + self.padding_matrix = torch.zeros(3, 2) + self.target_coordinate_repr = target_coordinate_repr + self.target_control_points = target_control_points + + def forward(self, input, source_control_points): + assert source_control_points.ndimension() == 3 + assert source_control_points.shape[1] == self.num_control_points + assert source_control_points.shape[2] == 2 + batch_size = source_control_points.size(0) + + Y = torch.cat([source_control_points, self.padding_matrix.expand(batch_size, 3, 2)], 1) + mapping_matrix = torch.matmul(self.inverse_kernel, Y) + source_coordinate = torch.matmul(self.target_coordinate_repr, mapping_matrix) + + # grid = source_coordinate.view(-1, self.target_height, self.target_width, 2) + grid = torch.reshape( + source_coordinate, + shape=[-1, self.target_height, self.target_width, 2]) + grid = torch.clamp(grid, 0, 1) # the source_control_points may be out of [0, 1]. 
+ # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1] + grid = 2.0 * grid - 1.0 + output_maps = grid_sample(input, grid, canvas=None) + return output_maps, source_coordinate diff --git a/batch_running_task/pytorchocr/modeling/transforms/tsrn.py b/batch_running_task/pytorchocr/modeling/transforms/tsrn.py new file mode 100644 index 0000000..f7ee9c3 --- /dev/null +++ b/batch_running_task/pytorchocr/modeling/transforms/tsrn.py @@ -0,0 +1,220 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/model/tsrn.py +""" + +import math +import torch +import torch.nn.functional as F +from torch import nn +from collections import OrderedDict +import sys +import numpy as np +import warnings +import math, copy +import cv2 + +warnings.filterwarnings("ignore") + +from .tps_spatial_transformer import TPSSpatialTransformer +from .stn import STN as STN_model +from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer + + +class TSRN(nn.Module): + def __init__(self, + in_channels, + scale_factor=2, + width=128, + height=32, + STN=False, + srb_nums=5, + mask=False, + hidden_units=32, + infer_mode=False, + **kwargs): + super(TSRN, self).__init__() + in_planes = 3 + if mask: + in_planes = 4 + assert math.log(scale_factor, 2) % 1 == 0 + upsample_block_num = int(math.log(scale_factor, 2)) + self.block1 = nn.Sequential( + nn.Conv2d( + in_planes, 2 * hidden_units, kernel_size=9, padding=4), + nn.PReLU()) + self.srb_nums = srb_nums + for i in range(srb_nums): + setattr(self, 'block%d' % (i + 2), + RecurrentResidualBlock(2 * hidden_units)) + + setattr( + self, + 'block%d' % (srb_nums + 2), + nn.Sequential( + nn.Conv2d( + 2 * hidden_units, + 2 * hidden_units, + kernel_size=3, + padding=1), + nn.BatchNorm2d(2 * hidden_units))) + + block_ = [ + UpsampleBLock(2 * hidden_units, 2) + for _ in range(upsample_block_num) + ] + block_.append( + nn.Conv2d(2 * hidden_units, in_planes, kernel_size=9, padding=4) + ) + setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_)) + self.tps_inputsize = [height // scale_factor, width // scale_factor] + tps_outputsize = [height // scale_factor, width // scale_factor] + num_control_points = 20 + tps_margins = [0.05, 0.05] + self.stn = STN + if self.stn: + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + + self.stn_head = STN_model( + in_channels=in_planes, + num_ctrlpoints=num_control_points, + activation='none') + self.out_channels = in_channels + + self.r34_transformer = Transformer() + for param in self.r34_transformer.parameters(): + param.trainable = False + self.infer_mode = infer_mode + + def forward(self, x): + output = {} + if self.infer_mode: + output["lr_img"] = x + y = x + else: + output["lr_img"] = x[0] + output["hr_img"] = x[1] + y = x[0] + if self.stn and self.training: + _, 
ctrl_points_x = self.stn_head(y) + y, _ = self.tps(y, ctrl_points_x) + block = {'1': self.block1(y)} + for i in range(self.srb_nums + 1): + block[str(i + 2)] = getattr(self, + 'block%d' % (i + 2))(block[str(i + 1)]) + + block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \ + ((block['1'] + block[str(self.srb_nums + 2)])) + + sr_img = torch.tanh(block[str(self.srb_nums + 3)]) + + output["sr_img"] = sr_img + + if self.training: + hr_img = x[1] + length = x[2] + input_tensor = x[3] + + # add transformer + sr_pred, word_attention_map_pred, _ = self.r34_transformer( + sr_img, length, input_tensor) + + hr_pred, word_attention_map_gt, _ = self.r34_transformer( + hr_img, length, input_tensor) + + output["hr_img"] = hr_img + output["hr_pred"] = hr_pred + output["word_attention_map_gt"] = word_attention_map_gt + output["sr_pred"] = sr_pred + output["word_attention_map_pred"] = word_attention_map_pred + + return output + + +class RecurrentResidualBlock(nn.Module): + def __init__(self, channels): + super(RecurrentResidualBlock, self).__init__() + self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.bn1 = nn.BatchNorm2d(channels) + self.gru1 = GruBlock(channels, channels) + self.prelu = mish() + self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(channels) + self.gru2 = GruBlock(channels, channels) + + def forward(self, x): + residual = self.conv1(x) + residual = self.bn1(residual) + residual = self.prelu(residual) + residual = self.conv2(residual) + residual = self.bn2(residual) + residual = self.gru1(residual.permute(0, 1, 3, 2).contiguous()).permute(0, 1, 3, 2).contiguous() + + return self.gru2(x + residual).contiguous() + + +class UpsampleBLock(nn.Module): + def __init__(self, in_channels, up_scale): + super(UpsampleBLock, self).__init__() + self.conv = nn.Conv2d( + in_channels, in_channels * up_scale**2, kernel_size=3, padding=1) + + self.pixel_shuffle = nn.PixelShuffle(up_scale) + self.prelu = mish() + + def forward(self, x): + x = self.conv(x) + x = self.pixel_shuffle(x) + x = self.prelu(x) + return x + + +class mish(nn.Module): + def __init__(self, ): + super(mish, self).__init__() + self.activated = True + + def forward(self, x): + if self.activated: + x = x * (torch.tanh(F.softplus(x))) + return x + + +class GruBlock(nn.Module): + def __init__(self, in_channels, out_channels): + super(GruBlock, self).__init__() + assert out_channels % 2 == 0 + self.conv1 = nn.Conv2d( + in_channels, out_channels, kernel_size=1, padding=0) + self.gru = nn.GRU(out_channels, + out_channels // 2, + bidirectional=True, + batch_first=True, + ) + + def forward(self, x): + # x: b, c, w, h + x = self.conv1(x) + x = x.permute(0, 2, 3, 1).contiguous() # b, w, h, c + batch_size, w, h, c = x.size() + x = x.view(batch_size * w, h, c) # b*w, h, c + x, _ = self.gru(x) + x = x.view(batch_size, w, h, c) + x = x.permute(0, 3, 1, 2).contiguous() + return x diff --git a/batch_running_task/pytorchocr/postprocess/__init__.py b/batch_running_task/pytorchocr/postprocess/__init__.py new file mode 100644 index 0000000..fc20314 --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/__init__.py @@ -0,0 +1,41 @@ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy + +__all__ = ['build_post_process'] + + +def build_post_process(config, global_config=None): + from .db_postprocess import DBPostProcess + from .east_postprocess 
import EASTPostProcess + from .sast_postprocess import SASTPostProcess + from .fce_postprocess import FCEPostProcess + from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, TableLabelDecode, \ + NRTRLabelDecode, SARLabelDecode, ViTSTRLabelDecode, RFLLabelDecode + from .cls_postprocess import ClsPostProcess + from .pg_postprocess import PGPostProcess + from .rec_postprocess import CANLabelDecode + + support_dict = [ + 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', + 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess', + 'TableLabelDecode', 'NRTRLabelDecode', 'SARLabelDecode', 'FCEPostProcess', + 'ViTSTRLabelDecode','CANLabelDecode', 'RFLLabelDecode' + ] + + if config['name'] == 'PSEPostProcess': + from .pse_postprocess import PSEPostProcess + support_dict.append('PSEPostProcess') + + config = copy.deepcopy(config) + module_name = config.pop('name') + if global_config is not None: + config.update(global_config) + assert module_name in support_dict, Exception( + 'post process only support {}, but got {}'.format(support_dict, module_name)) + module_class = eval(module_name)(**config) + return module_class \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/cls_postprocess.py b/batch_running_task/pytorchocr/postprocess/cls_postprocess.py new file mode 100644 index 0000000..c9c6aff --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/cls_postprocess.py @@ -0,0 +1,20 @@ +import torch + + +class ClsPostProcess(object): + """ Convert between text-label and text-index """ + + def __init__(self, label_list, **kwargs): + super(ClsPostProcess, self).__init__() + self.label_list = label_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, torch.Tensor): + preds = preds.cpu().numpy() + pred_idxs = preds.argmax(axis=1) + decode_out = [(self.label_list[idx], preds[i, idx]) + for i, idx in enumerate(pred_idxs)] + if label is None: + return decode_out + label = [(self.label_list[idx], 1.0) for idx in label] + return decode_out, label \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/db_postprocess.py b/batch_running_task/pytorchocr/postprocess/db_postprocess.py new file mode 100644 index 0000000..8765543 --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/db_postprocess.py @@ -0,0 +1,172 @@ +""" +This code is refered from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import torch +from shapely.geometry import Polygon +import pyclipper + + +class DBPostProcess(object): + """ + The post process for Differentiable Binarization (DB). 
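+
+    thresh binarizes the probability map, box_thresh filters low-score boxes,
+    and unclip_ratio controls how far each detected region is expanded.
+
+    Rough usage sketch (tensor shapes are assumptions based on how __call__
+    unpacks its inputs, not taken from the original file):
+
+        post = DBPostProcess(thresh=0.3, box_thresh=0.7, unclip_ratio=2.0)
+        # outs_dict['maps']: N x 1 x H x W probability map from the DB head
+        # shape_list[i] = (src_h, src_w, ratio_h, ratio_w) for image i
+        boxes_batch = post(outs_dict, shape_list)
+        quads = boxes_batch[0]['points']  # int16 boxes, 4 corner points each,
+                                          # already mapped back to source-image
+                                          # coordinates
+
+    In this repo the class is normally constructed through
+    build_post_process({'name': 'DBPostProcess', ...}) from
+    pytorchocr/postprocess/__init__.py.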
+ """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + score_mode="fast", + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.score_mode = score_mode + assert score_mode in [ + "slow", "fast" + ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) + + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score = self.box_score_fast(pred, points.reshape(-1, 2)) + else: + score = self.box_score_slow(pred, contour) + if self.box_thresh > score:continue + + box = self.unclip(points).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2:continue + box = np.array(box) + + box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype(np.int16)) + scores.append(score) + return np.array(boxes, dtype=np.int16), scores + + def unclip(self, box): + unclip_ratio = self.unclip_ratio + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def box_score_slow(self, bitmap, contour): + ''' + box_score_slow: use polyon mean score as the mean score + ''' + h, w = bitmap.shape[:2] + contour = 
contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + if isinstance(segmentation, torch.Tensor): + segmentation = segmentation.cpu().numpy() + + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate(np.array(segmentation[batch_index]).astype(np.uint8),self.dilation_kernel) + else: + mask = segmentation[batch_index] + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,src_w, src_h) + + boxes_batch.append({'points': boxes}) + return boxes_batch \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/east_postprocess.py b/batch_running_task/pytorchocr/postprocess/east_postprocess.py new file mode 100644 index 0000000..10bb916 --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/east_postprocess.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from .locality_aware_nms import nms_locality +import cv2 +# import paddle +import torch + +import os +import sys + + +class EASTPostProcess(object): + """ + The post process for EAST. + """ + + def __init__(self, + score_thresh=0.8, + cover_thresh=0.1, + nms_thresh=0.2, + **kwargs): + + self.score_thresh = score_thresh + self.cover_thresh = cover_thresh + self.nms_thresh = nms_thresh + + # c++ la-nms is faster, but only support python 3.5 + self.is_python35 = False + if sys.version_info.major == 3 and sys.version_info.minor == 5: + self.is_python35 = True + + def restore_rectangle_quad(self, origin, geometry): + """ + Restore rectangle from quadrangle. 
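+        origin is an (n, 2) array of pixel positions and geometry an (n, 8)
+        array of predicted offsets of the four quad corners at each position,
+        so the result is an (n, 4, 2) array of quadrangles.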
+ """ + # quad + origin_concat = np.concatenate( + (origin, origin, origin, origin), axis=1) # (n, 8) + pred_quads = origin_concat - geometry + pred_quads = pred_quads.reshape((-1, 4, 2)) # (n, 4, 2) + return pred_quads + + def detect(self, + score_map, + geo_map, + score_thresh=0.8, + cover_thresh=0.1, + nms_thresh=0.2): + """ + restore text boxes from score map and geo map + """ + score_map = score_map[0] + geo_map = np.swapaxes(geo_map, 1, 0) + geo_map = np.swapaxes(geo_map, 1, 2) + # filter the score map + xy_text = np.argwhere(score_map > score_thresh) + if len(xy_text) == 0: + return [] + # sort the text boxes via the y axis + xy_text = xy_text[np.argsort(xy_text[:, 0])] + # restore quad proposals + text_box_restored = self.restore_rectangle_quad( + xy_text[:, ::-1] * 4, geo_map[xy_text[:, 0], xy_text[:, 1], :]) + boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32) + boxes[:, :8] = text_box_restored.reshape((-1, 8)) + boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]] + if self.is_python35: + import lanms + boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh) + else: + boxes = nms_locality(boxes.astype(np.float64), nms_thresh) + if boxes.shape[0] == 0: + return [] + # Here we filter some low score boxes by the average score map, + # this is different from the orginal paper. + for i, box in enumerate(boxes): + mask = np.zeros_like(score_map, dtype=np.uint8) + cv2.fillPoly(mask, box[:8].reshape( + (-1, 4, 2)).astype(np.int32) // 4, 1) + boxes[i, 8] = cv2.mean(score_map, mask)[0] + boxes = boxes[boxes[:, 8] > cover_thresh] + return boxes + + def sort_poly(self, p): + """ + Sort polygons. + """ + min_axis = np.argmin(np.sum(p, axis=1)) + p = p[[min_axis, (min_axis + 1) % 4, \ + (min_axis + 2) % 4, (min_axis + 3) % 4]] + if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]): + return p + else: + return p[[0, 3, 2, 1]] + + def __call__(self, outs_dict, shape_list): + score_list = outs_dict['f_score'] + geo_list = outs_dict['f_geo'] + if isinstance(score_list, torch.Tensor): + score_list = score_list.cpu().numpy() + geo_list = geo_list.cpu().numpy() + img_num = len(shape_list) + dt_boxes_list = [] + for ino in range(img_num): + score = score_list[ino] + geo = geo_list[ino] + boxes = self.detect( + score_map=score, + geo_map=geo, + score_thresh=self.score_thresh, + cover_thresh=self.cover_thresh, + nms_thresh=self.nms_thresh) + boxes_norm = [] + if len(boxes) > 0: + h, w = score.shape[1:] + src_h, src_w, ratio_h, ratio_w = shape_list[ino] + boxes = boxes[:, :8].reshape((-1, 4, 2)) + boxes[:, :, 0] /= ratio_w + boxes[:, :, 1] /= ratio_h + for i_box, box in enumerate(boxes): + box = self.sort_poly(box.astype(np.int32)) + if np.linalg.norm(box[0] - box[1]) < 5 \ + or np.linalg.norm(box[3] - box[0]) < 5: + continue + boxes_norm.append(box) + dt_boxes_list.append({'points': np.array(boxes_norm)}) + return dt_boxes_list \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/fce_postprocess.py b/batch_running_task/pytorchocr/postprocess/fce_postprocess.py new file mode 100644 index 0000000..ae4633f --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/fce_postprocess.py @@ -0,0 +1,228 @@ +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/v0.3.0/mmocr/models/textdet/postprocess/wrapper.py +""" + +import cv2 +import torch +import numpy as np +from numpy.fft import ifft +from pytorchocr.utils.poly_nms import poly_nms, valid_boundary + + +def fill_hole(input_mask): + h, w = input_mask.shape + canvas = np.zeros((h + 2, w + 2), 
np.uint8) + canvas[1:h + 1, 1:w + 1] = input_mask.copy() + + mask = np.zeros((h + 4, w + 4), np.uint8) + + cv2.floodFill(canvas, mask, (0, 0), 1) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + + return ~canvas | input_mask + + +def fourier2poly(fourier_coeff, num_reconstr_points=50): + """ Inverse Fourier transform + Args: + fourier_coeff (ndarray): Fourier coefficients shaped (n, 2k+1), + with n and k being candidates number and Fourier degree + respectively. + num_reconstr_points (int): Number of reconstructed polygon points. + Returns: + Polygons (ndarray): The reconstructed polygons shaped (n, n') + """ + + a = np.zeros((len(fourier_coeff), num_reconstr_points), dtype='complex') + k = (len(fourier_coeff[0]) - 1) // 2 + + a[:, 0:k + 1] = fourier_coeff[:, k:] + a[:, -k:] = fourier_coeff[:, :k] + + poly_complex = ifft(a) * num_reconstr_points + polygon = np.zeros((len(fourier_coeff), num_reconstr_points, 2)) + polygon[:, :, 0] = poly_complex.real + polygon[:, :, 1] = poly_complex.imag + return polygon.astype('int32').reshape((len(fourier_coeff), -1)) + + +class FCEPostProcess(object): + """ + The post process for FCENet. + """ + + def __init__(self, + scales, + fourier_degree=5, + num_reconstr_points=50, + decoding_type='fcenet', + score_thr=0.3, + nms_thr=0.1, + alpha=1.0, + beta=1.0, + box_type='poly', + **kwargs): + + self.scales = scales + self.fourier_degree = fourier_degree + self.num_reconstr_points = num_reconstr_points + self.decoding_type = decoding_type + self.score_thr = score_thr + self.nms_thr = nms_thr + self.alpha = alpha + self.beta = beta + self.box_type = box_type + + def __call__(self, preds, shape_list): + score_maps = [] + for key, value in preds.items(): + if isinstance(value, torch.Tensor): + value = value.numpy() + cls_res = value[:, :4, :, :] + reg_res = value[:, 4:, :, :] + score_maps.append([cls_res, reg_res]) + + return self.get_boundary(score_maps, shape_list) + + def resize_boundary(self, boundaries, scale_factor): + """Rescale boundaries via scale_factor. + + Args: + boundaries (list[list[float]]): The boundary list. Each boundary + with size 2k+1 with k>=4. + scale_factor(ndarray): The scale factor of size (4,). + + Returns: + boundaries (list[list[float]]): The scaled boundaries. 
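+                Note: as implemented below, the method actually returns a
+                tuple of a float32 array holding the per-boundary point
+                arrays and the list of confidence scores stripped from the
+                end of each boundary.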
+ """ + boxes = [] + scores = [] + for b in boundaries: + sz = len(b) + valid_boundary(b, True) + scores.append(b[-1]) + b = (np.array(b[:sz - 1]) * + (np.tile(scale_factor[:2], int( + (sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist() + boxes.append(np.array(b).reshape([-1, 2])) + + return np.array(boxes, dtype=np.float32), scores + + def get_boundary(self, score_maps, shape_list): + assert len(score_maps) == len(self.scales) + boundaries = [] + for idx, score_map in enumerate(score_maps): + scale = self.scales[idx] + boundaries = boundaries + self._get_boundary_single(score_map, + scale) + + # nms + boundaries = poly_nms(boundaries, self.nms_thr) + boundaries, scores = self.resize_boundary( + boundaries, (1 / shape_list[0, 2:]).tolist()[::-1]) + + boxes_batch = [dict(points=boundaries, scores=scores)] + return boxes_batch + + def _get_boundary_single(self, score_map, scale): + assert len(score_map) == 2 + assert score_map[1].shape[1] == 4 * self.fourier_degree + 2 + + return self.fcenet_decode( + preds=score_map, + fourier_degree=self.fourier_degree, + num_reconstr_points=self.num_reconstr_points, + scale=scale, + alpha=self.alpha, + beta=self.beta, + box_type=self.box_type, + score_thr=self.score_thr, + nms_thr=self.nms_thr) + + def fcenet_decode(self, + preds, + fourier_degree, + num_reconstr_points, + scale, + alpha=1.0, + beta=2.0, + box_type='poly', + score_thr=0.3, + nms_thr=0.1): + """Decoding predictions of FCENet to instances. + + Args: + preds (list(Tensor)): The head output tensors. + fourier_degree (int): The maximum Fourier transform degree k. + num_reconstr_points (int): The points number of the polygon + reconstructed from predicted Fourier coefficients. + scale (int): The down-sample scale of the prediction. + alpha (float) : The parameter to calculate final scores. Score_{final} + = (Score_{text region} ^ alpha) + * (Score_{text center region}^ beta) + beta (float) : The parameter to calculate final score. + box_type (str): Boundary encoding type 'poly' or 'quad'. + score_thr (float) : The threshold used to filter out the final + candidates. + nms_thr (float) : The threshold of nms. + + Returns: + boundaries (list[list[float]]): The instance boundary and confidence + list. 
+ """ + assert isinstance(preds, list) + assert len(preds) == 2 + assert box_type in ['poly', 'quad'] + + cls_pred = preds[0][0] + tr_pred = cls_pred[0:2] + tcl_pred = cls_pred[2:] + + reg_pred = preds[1][0].transpose([1, 2, 0]) + x_pred = reg_pred[:, :, :2 * fourier_degree + 1] + y_pred = reg_pred[:, :, 2 * fourier_degree + 1:] + + score_pred = (tr_pred[1]**alpha) * (tcl_pred[1]**beta) + tr_pred_mask = (score_pred) > score_thr + tr_mask = fill_hole(tr_pred_mask) + + tr_contours, _ = cv2.findContours( + tr_mask.astype(np.uint8), cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE) # opencv4 + + mask = np.zeros_like(tr_mask) + boundaries = [] + for cont in tr_contours: + deal_map = mask.copy().astype(np.int8) + cv2.drawContours(deal_map, [cont], -1, 1, -1) + + score_map = score_pred * deal_map + score_mask = score_map > 0 + xy_text = np.argwhere(score_mask) + dxy = xy_text[:, 1] + xy_text[:, 0] * 1j + + x, y = x_pred[score_mask], y_pred[score_mask] + c = x + y * 1j + c[:, fourier_degree] = c[:, fourier_degree] + dxy + c *= scale + + polygons = fourier2poly(c, num_reconstr_points) + score = score_map[score_mask].reshape(-1, 1) + polygons = poly_nms(np.hstack((polygons, score)).tolist(), nms_thr) + + boundaries = boundaries + polygons + + boundaries = poly_nms(boundaries, nms_thr) + + if box_type == 'quad': + new_boundaries = [] + for boundary in boundaries: + poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32) + score = boundary[-1] + points = cv2.boxPoints(cv2.minAreaRect(poly)) + points = np.int0(points) + new_boundaries.append(points.reshape(-1).tolist() + [score]) + boundaries = new_boundaries + + return boundaries diff --git a/batch_running_task/pytorchocr/postprocess/locality_aware_nms.py b/batch_running_task/pytorchocr/postprocess/locality_aware_nms.py new file mode 100644 index 0000000..53280cc --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/locality_aware_nms.py @@ -0,0 +1,199 @@ +""" +Locality aware nms. +""" + +import numpy as np +from shapely.geometry import Polygon + + +def intersection(g, p): + """ + Intersection. + """ + g = Polygon(g[:8].reshape((4, 2))) + p = Polygon(p[:8].reshape((4, 2))) + g = g.buffer(0) + p = p.buffer(0) + if not g.is_valid or not p.is_valid: + return 0 + inter = Polygon(g).intersection(Polygon(p)).area + union = g.area + p.area - inter + if union == 0: + return 0 + else: + return inter / union + + +def intersection_iog(g, p): + """ + Intersection_iog. + """ + g = Polygon(g[:8].reshape((4, 2))) + p = Polygon(p[:8].reshape((4, 2))) + if not g.is_valid or not p.is_valid: + return 0 + inter = Polygon(g).intersection(Polygon(p)).area + #union = g.area + p.area - inter + union = p.area + if union == 0: + print("p_area is very small") + return 0 + else: + return inter / union + + +def weighted_merge(g, p): + """ + Weighted merge. + """ + g[:8] = (g[8] * g[:8] + p[8] * p[:8]) / (g[8] + p[8]) + g[8] = (g[8] + p[8]) + return g + + +def standard_nms(S, thres): + """ + Standard nms. + """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return S[keep] + + +def standard_nms_inds(S, thres): + """ + Standard nms, retun inds. 
+ """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return keep + + +def nms(S, thres): + """ + nms. + """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return keep + + +def soft_nms(boxes_in, Nt_thres=0.3, threshold=0.8, sigma=0.5, method=2): + """ + soft_nms + :para boxes_in, N x 9 (coords + score) + :para threshould, eliminate cases min score(0.001) + :para Nt_thres, iou_threshi + :para sigma, gaussian weght + :method, linear or gaussian + """ + boxes = boxes_in.copy() + N = boxes.shape[0] + if N is None or N < 1: + return np.array([]) + pos, maxpos = 0, 0 + weight = 0.0 + inds = np.arange(N) + tbox, sbox = boxes[0].copy(), boxes[0].copy() + for i in range(N): + maxscore = boxes[i, 8] + maxpos = i + tbox = boxes[i].copy() + ti = inds[i] + pos = i + 1 + #get max box + while pos < N: + if maxscore < boxes[pos, 8]: + maxscore = boxes[pos, 8] + maxpos = pos + pos = pos + 1 + #add max box as a detection + boxes[i, :] = boxes[maxpos, :] + inds[i] = inds[maxpos] + #swap + boxes[maxpos, :] = tbox + inds[maxpos] = ti + tbox = boxes[i].copy() + pos = i + 1 + #NMS iteration + while pos < N: + sbox = boxes[pos].copy() + ts_iou_val = intersection(tbox, sbox) + if ts_iou_val > 0: + if method == 1: + if ts_iou_val > Nt_thres: + weight = 1 - ts_iou_val + else: + weight = 1 + elif method == 2: + weight = np.exp(-1.0 * ts_iou_val**2 / sigma) + else: + if ts_iou_val > Nt_thres: + weight = 0 + else: + weight = 1 + boxes[pos, 8] = weight * boxes[pos, 8] + #if box score falls below thresold, discard the box by + #swaping last box update N + if boxes[pos, 8] < threshold: + boxes[pos, :] = boxes[N - 1, :] + inds[pos] = inds[N - 1] + N = N - 1 + pos = pos - 1 + pos = pos + 1 + + return boxes[:N] + + +def nms_locality(polys, thres=0.3): + """ + locality aware nms of EAST + :param polys: a N*9 numpy array. first 8 coordinates, then prob + :return: boxes after nms + """ + S = [] + p = None + for g in polys: + if p is not None and intersection(g, p) > thres: + p = weighted_merge(g, p) + else: + if p is not None: + S.append(p) + p = g + if p is not None: + S.append(p) + + if len(S) == 0: + return np.array([]) + return standard_nms(np.array(S), thres) + + +if __name__ == '__main__': + # 343,350,448,135,474,143,369,359 + print( + Polygon(np.array([[343, 350], [448, 135], [474, 143], [369, 359]])) + .area) \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/pg_postprocess.py b/batch_running_task/pytorchocr/postprocess/pg_postprocess.py new file mode 100644 index 0000000..81a8e08 --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/pg_postprocess.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) +from pytorchocr.utils.e2e_utils.pgnet_pp_utils import PGNet_PostProcess + + +class PGPostProcess(object): + """ + The post process for PGNet. + """ + + def __init__(self, character_dict_path, valid_set, score_thresh, mode, + **kwargs): + self.character_dict_path = character_dict_path + self.valid_set = valid_set + self.score_thresh = score_thresh + self.mode = mode + + # c++ la-nms is faster, but only support python 3.5 + self.is_python35 = False + if sys.version_info.major == 3 and sys.version_info.minor == 5: + self.is_python35 = True + + def __call__(self, outs_dict, shape_list): + post = PGNet_PostProcess(self.character_dict_path, self.valid_set, + self.score_thresh, outs_dict, shape_list) + if self.mode == 'fast': + data = post.pg_postprocess_fast() + else: + data = post.pg_postprocess_slow() + return data diff --git a/batch_running_task/pytorchocr/postprocess/pse_postprocess/__init__.py b/batch_running_task/pytorchocr/postprocess/pse_postprocess/__init__.py new file mode 100644 index 0000000..680473b --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/pse_postprocess/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pse_postprocess import PSEPostProcess \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/README.md b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/README.md new file mode 100644 index 0000000..6a19d5d --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/README.md @@ -0,0 +1,6 @@ +## 编译 +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/post_processing/pse +```python +python3 setup.py build_ext --inplace +``` diff --git a/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/__init__.py b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/__init__.py new file mode 100644 index 0000000..ce0142f --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/__init__.py @@ -0,0 +1,29 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os +import subprocess + +python_path = sys.executable + +ori_path = os.getcwd() +os.chdir('pytorchocr/postprocess/pse_postprocess/pse') +if subprocess.call( + '{} setup.py build_ext --inplace'.format(python_path), shell=True) != 0: + raise RuntimeError( + 'Cannot compile pse: {}, if your system is windows, you need to install all the default components of `desktop development using C++` in visual studio 2019+'. + format(os.path.dirname(os.path.realpath(__file__)))) +os.chdir(ori_path) + +from .pse import pse diff --git a/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/pse.pyx b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/pse.pyx new file mode 100644 index 0000000..b2be49e --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/pse.pyx @@ -0,0 +1,70 @@ + +import numpy as np +import cv2 +cimport numpy as np +cimport cython +cimport libcpp +cimport libcpp.pair +cimport libcpp.queue +from libcpp.pair cimport * +from libcpp.queue cimport * + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef np.ndarray[np.int32_t, ndim=2] _pse(np.ndarray[np.uint8_t, ndim=3] kernels, + np.ndarray[np.int32_t, ndim=2] label, + int kernel_num, + int label_num, + float min_area=0): + cdef np.ndarray[np.int32_t, ndim=2] pred + pred = np.zeros((label.shape[0], label.shape[1]), dtype=np.int32) + + for label_idx in range(1, label_num): + if np.sum(label == label_idx) < min_area: + label[label == label_idx] = 0 + + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] nxt_que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef np.int16_t* dx = [-1, 1, 0, 0] + cdef np.int16_t* dy = [0, 0, -1, 1] + cdef np.int16_t tmpx, tmpy + + points = np.array(np.where(label > 0)).transpose((1, 0)) + for point_idx in range(points.shape[0]): + tmpx, tmpy = points[point_idx, 0], points[point_idx, 1] + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = label[tmpx, tmpy] + + cdef libcpp.pair.pair[np.int16_t,np.int16_t] cur + cdef int cur_label + for kernel_idx in range(kernel_num - 1, -1, -1): + while not que.empty(): + cur = que.front() + que.pop() + cur_label = pred[cur.first, cur.second] + + is_edge = True + for j in range(4): + tmpx = cur.first + dx[j] + tmpy = cur.second + dy[j] + if tmpx < 0 or tmpx >= label.shape[0] or tmpy < 0 or tmpy >= label.shape[1]: + continue + if kernels[kernel_idx, tmpx, tmpy] == 0 or pred[tmpx, tmpy] > 0: + continue + + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = cur_label + is_edge = False + if is_edge: + nxt_que.push(cur) + + que, nxt_que = nxt_que, que + + return pred + +def pse(kernels, min_area): + kernel_num = kernels.shape[0] + label_num, label = cv2.connectedComponents(kernels[-1], connectivity=4) + return _pse(kernels[:-1], label, kernel_num, label_num, min_area) \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/setup.py b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/setup.py new file mode 100644 index 0000000..0374678 --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse/setup.py @@ -0,0 +1,14 @@ +from distutils.core import setup, Extension +from Cython.Build import cythonize +import numpy + 
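+# Build the extension in place (see pse/README.md):
+#     python3 setup.py build_ext --inplace
+# pse_postprocess/pse/__init__.py also runs this build via subprocess when it
+# is imported.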
+setup(ext_modules=cythonize(Extension( + 'pse', + sources=['pse.pyx'], + language='c++', + include_dirs=[numpy.get_include()], + library_dirs=[], + libraries=[], + extra_compile_args=['-O3'], + extra_link_args=[] +))) diff --git a/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse_postprocess.py b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse_postprocess.py new file mode 100644 index 0000000..531bbcf --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/pse_postprocess/pse_postprocess.py @@ -0,0 +1,105 @@ +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import torch +from torch.nn import functional as F + +from pytorchocr.postprocess.pse_postprocess.pse import pse + + +class PSEPostProcess(object): + """ + The post process for PSE. + """ + + def __init__(self, + thresh=0.5, + box_thresh=0.85, + min_area=16, + box_type='box', + scale=4, + **kwargs): + assert box_type in ['box', 'poly'], 'Only box and poly is supported' + self.thresh = thresh + self.box_thresh = box_thresh + self.min_area = min_area + self.box_type = box_type + self.scale = scale + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if not isinstance(pred, torch.Tensor): + pred = torch.as_tensor(pred) + pred = F.interpolate( + pred, scale_factor=4 // self.scale, mode='bilinear') + + score = F.sigmoid(pred[:, 0, :, :]) + + kernels = (pred > self.thresh).type(torch.float32) + text_mask = kernels[:, 0, :, :] + kernels[:, 0:, :, :] = kernels[:, 0:, :, :] * text_mask + + score = score.numpy() + kernels = kernels.numpy().astype(np.uint8) + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + boxes, scores = self.boxes_from_bitmap(score[batch_index], + kernels[batch_index], + shape_list[batch_index]) + + boxes_batch.append({'points': boxes, 'scores': scores}) + return boxes_batch + + def boxes_from_bitmap(self, score, kernels, shape): + label = pse(kernels, self.min_area) + return self.generate_box(score, label, shape) + + def generate_box(self, score, label, shape): + src_h, src_w, ratio_h, ratio_w = shape + label_num = np.max(label) + 1 + + boxes = [] + scores = [] + for i in range(1, label_num): + ind = label == i + points = np.array(np.where(ind)).transpose((1, 0))[:, ::-1] + + if points.shape[0] < self.min_area: + label[ind] = 0 + continue + + score_i = np.mean(score[ind]) + if score_i < self.box_thresh: + label[ind] = 0 + continue + + if self.box_type == 'box': + rect = cv2.minAreaRect(points) + bbox = cv2.boxPoints(rect) + elif self.box_type == 'poly': + box_height = np.max(points[:, 1]) + 10 + box_width = np.max(points[:, 0]) + 10 + + mask = np.zeros((box_height, box_width), np.uint8) + mask[points[:, 1], points[:, 0]] = 255 + + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + bbox = np.squeeze(contours[0], 1) + else: + raise NotImplementedError + + bbox[:, 0] = np.clip(np.round(bbox[:, 0] / ratio_w), 0, src_w) + bbox[:, 1] = np.clip(np.round(bbox[:, 1] / ratio_h), 0, src_h) + boxes.append(bbox) + scores.append(score_i) + return boxes, scores diff --git a/batch_running_task/pytorchocr/postprocess/rec_postprocess.py b/batch_running_task/pytorchocr/postprocess/rec_postprocess.py new file mode 100644 index 0000000..b163a5a --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/rec_postprocess.py @@ -0,0 +1,693 
@@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import string +#import paddle +# from paddle.nn import functional as F +import torch + + +class BaseRecLabelDecode(object): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + use_space_char=False): + + self.beg_str = "sos" + self.end_str = "eos" + + self.character_str = [] + if character_dict_path is None: + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + else: + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str.append(line) + if use_space_char: + self.character_str.append(" ") + dict_character = list(self.character_str) + + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + def add_special_char(self, dict_character): + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list))) + return result_list + + def get_ignored_tokens(self): + return [0] # for ctc blank + + +class CTCLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(CTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, torch.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class NRTRLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): + super(NRTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, 
*args, **kwargs): + + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + if isinstance(preds_id, torch.Tensor): + preds_id = preds_id.numpy() + if isinstance(preds_prob, torch.Tensor): + preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + else: + if isinstance(preds, torch.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + try: + char_idx = self.character[int(text_index[batch_idx][idx])] + except: + continue + if char_idx == '': # end + break + char_list.append(char_idx) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text.lower(), np.mean(conf_list).tolist())) + return result_list + +class ViTSTRLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ViTSTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, torch.Tensor): + preds = preds[:, 1:].numpy() + else: + preds = preds[:, 1:] + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['', ''] + dict_character + return dict_character + + +class AttnLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(AttnLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list))) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + if isinstance(preds, torch.Tensor): + preds = preds.cpu().numpy() + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class RFLLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(RFLLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + # if seq_outputs is not None: + if isinstance(preds, tuple) or isinstance(preds, list): + cnt_outputs, seq_outputs = preds + if isinstance(seq_outputs, torch.Tensor): + seq_outputs = seq_outputs.numpy() + preds_idx = seq_outputs.argmax(axis=2) + preds_prob = seq_outputs.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + else: + cnt_outputs = preds + if isinstance(cnt_outputs, torch.Tensor): + cnt_outputs = cnt_outputs.numpy() + cnt_length = [] + for lens in cnt_outputs: + length = round(np.sum(lens)) + cnt_length.append(length) + if label is None: + return cnt_length + label = self.decode(label, is_remove_duplicate=False) + length = [len(res[0]) for res in label] + return cnt_length, length + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SRNLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + **kwargs): + self.max_text_length = kwargs.get('max_text_length', 25) + super(SRNLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + pred = preds['predict'] + char_num = len(self.character_str) + 2 + if isinstance(pred, torch.Tensor): + pred = pred.numpy() + pred = np.reshape(pred, [-1, char_num]) + + preds_idx = np.argmax(pred, axis=1) + preds_prob = np.max(pred, axis=1) + + preds_idx = np.reshape(preds_idx, [-1, self.max_text_length]) + + preds_prob = np.reshape(preds_prob, [-1, self.max_text_length]) + + text = self.decode(preds_idx, preds_prob) + + if label is None: + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + return text + label = self.decode(label) + return text, label + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list))) + return result_list + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class TableLabelDecode(object): + """ """ + + def __init__(self, + character_dict_path, + **kwargs): + list_character, list_elem = self.load_char_elem_dict(character_dict_path) + list_character = self.add_special_char(list_character) + list_elem = self.add_special_char(list_elem) + self.dict_character = {} + self.dict_idx_character = {} + for i, char in enumerate(list_character): + self.dict_idx_character[i] = char + self.dict_character[char] = i + self.dict_elem = {} + self.dict_idx_elem = {} + for i, elem in enumerate(list_elem): + self.dict_idx_elem[i] = elem + self.dict_elem[elem] = i + + def load_char_elem_dict(self, character_dict_path): + list_character = [] + list_elem = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split("\t") + character_num = int(substr[0]) + elem_num = int(substr[1]) + for cno in range(1, 1 + character_num): + character = lines[cno].decode('utf-8').strip("\n").strip("\r\n") + list_character.append(character) + for eno in range(1 + character_num, 1 + character_num + elem_num): + elem = lines[eno].decode('utf-8').strip("\n").strip("\r\n") + list_elem.append(elem) + return list_character, list_elem + + def add_special_char(self, list_character): + self.beg_str = "sos" + self.end_str = "eos" + list_character = [self.beg_str] + list_character + [self.end_str] + return list_character + + def __call__(self, preds): + structure_probs = preds['structure_probs'] + loc_preds = preds['loc_preds'] + if isinstance(structure_probs,torch.Tensor): + structure_probs = structure_probs.numpy() + if isinstance(loc_preds,torch.Tensor): + loc_preds = loc_preds.numpy() + structure_idx = structure_probs.argmax(axis=2) + structure_probs = structure_probs.max(axis=2) + structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode(structure_idx, + structure_probs, 'elem') + res_html_code_list = [] + res_loc_list = [] + batch_num = len(structure_str) + for bno in range(batch_num): + res_loc = [] + for sno in range(len(structure_str[bno])): + text = structure_str[bno][sno] + if text in ['', ' 0 and tmp_elem_idx == end_idx: + break + if tmp_elem_idx in ignored_tokens: + continue + + char_list.append(current_dict[tmp_elem_idx]) + 
elem_pos_list.append(idx)
+                score_list.append(structure_probs[batch_idx, idx])
+                elem_idx_list.append(tmp_elem_idx)
+            result_list.append(char_list)
+            result_pos_list.append(elem_pos_list)
+            result_score_list.append(score_list)
+            result_elem_idx_list.append(elem_idx_list)
+        return result_list, result_pos_list, result_score_list, result_elem_idx_list
+
+    def get_ignored_tokens(self, char_or_elem):
+        beg_idx = self.get_beg_end_flag_idx("beg", char_or_elem)
+        end_idx = self.get_beg_end_flag_idx("end", char_or_elem)
+        return [beg_idx, end_idx]
+
+    def get_beg_end_flag_idx(self, beg_or_end, char_or_elem):
+        if char_or_elem == "char":
+            if beg_or_end == "beg":
+                idx = self.dict_character[self.beg_str]
+            elif beg_or_end == "end":
+                idx = self.dict_character[self.end_str]
+            else:
+                assert False, "Unsupport type %s in get_beg_end_flag_idx of char" \
+                    % beg_or_end
+        elif char_or_elem == "elem":
+            if beg_or_end == "beg":
+                idx = self.dict_elem[self.beg_str]
+            elif beg_or_end == "end":
+                idx = self.dict_elem[self.end_str]
+            else:
+                assert False, "Unsupport type %s in get_beg_end_flag_idx of elem" \
+                    % beg_or_end
+        else:
+            assert False, "Unsupport type %s in char_or_elem" \
+                % char_or_elem
+        return idx
+
+
+class SARLabelDecode(BaseRecLabelDecode):
+    """ Convert between text-label and text-index """
+
+    def __init__(self, character_dict_path=None, use_space_char=False,
+                 **kwargs):
+        super(SARLabelDecode, self).__init__(character_dict_path,
+                                             use_space_char)
+
+        self.rm_symbol = kwargs.get('rm_symbol', False)
+
+    def add_special_char(self, dict_character):
+        # SAR special tokens: combined begin/end, unknown and padding markers
+        beg_end_str = "<BOS/EOS>"
+        unknown_str = "<UKN>"
+        padding_str = "<PAD>"
+        dict_character = dict_character + [unknown_str]
+        self.unknown_idx = len(dict_character) - 1
+        dict_character = dict_character + [beg_end_str]
+        self.start_idx = len(dict_character) - 1
+        self.end_idx = len(dict_character) - 1
+        dict_character = dict_character + [padding_str]
+        self.padding_idx = len(dict_character) - 1
+        return dict_character
+
+    def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
+        """ convert text-index into text-label.
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, torch.Tensor): + preds = preds.cpu().numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class CANLabelDecode(BaseRecLabelDecode): + """ Convert between latex-symbol and symbol-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CANLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def decode(self, text_index, preds_prob=None): + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + seq_end = text_index[batch_idx].argmin(0) + idx_list = text_index[batch_idx][:seq_end].tolist() + symbol_list = [self.character[idx] for idx in idx_list] + probs = [] + if preds_prob is not None: + probs = preds_prob[batch_idx][:len(symbol_list)].tolist() + + result_list.append([' '.join(symbol_list), probs]) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + pred_prob, _, _, _ = preds + preds_idx = pred_prob.argmax(axis=2) + + text = self.decode(preds_idx) + if label is None: + return text + label = self.decode(label) + return text, label \ No newline at end of file diff --git a/batch_running_task/pytorchocr/postprocess/sast_postprocess.py b/batch_running_task/pytorchocr/postprocess/sast_postprocess.py new file mode 100644 index 0000000..26a03c8 --- /dev/null +++ b/batch_running_task/pytorchocr/postprocess/sast_postprocess.py @@ -0,0 +1,302 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) + +import numpy as np +from .locality_aware_nms import nms_locality +# import paddle +import torch +import cv2 +import time + + +class SASTPostProcess(object): + """ + The post process for SAST. + """ + + def __init__(self, + score_thresh=0.5, + nms_thresh=0.2, + sample_pts_num=2, + shrink_ratio_of_width=0.3, + expand_scale=1.0, + tcl_map_thresh=0.5, + **kwargs): + + self.score_thresh = score_thresh + self.nms_thresh = nms_thresh + self.sample_pts_num = sample_pts_num + self.shrink_ratio_of_width = shrink_ratio_of_width + self.expand_scale = expand_scale + self.tcl_map_thresh = tcl_map_thresh + + # c++ la-nms is faster, but only support python 3.5 + self.is_python35 = False + if sys.version_info.major == 3 and sys.version_info.minor == 5: + self.is_python35 = True + + def point_pair2poly(self, point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + # constract poly + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2) + + def shrink_quad_along_width(self, quad, begin_width_ratio=0., end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array([[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + def expand_poly_along_width(self, poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. + """ + point_num = poly.shape[0] + left_quad = np.array([poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = self.shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array([poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1]], dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = self.shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + def restore_quad(self, tcl_map, tcl_map_thresh, tvo_map): + """Restore quad.""" + xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh) + xy_text = xy_text[:, ::-1] # (n, 2) + + # Sort the text boxes via the y axis + xy_text = xy_text[np.argsort(xy_text[:, 1])] + + scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0] + scores = scores[:, np.newaxis] + + # Restore + point_num = int(tvo_map.shape[-1] / 2) + assert point_num == 4 + tvo_map = tvo_map[xy_text[:, 1], xy_text[:, 0], :] + xy_text_tile = np.tile(xy_text, (1, point_num)) # (n, point_num * 2) + quads = xy_text_tile - tvo_map + + return scores, quads, xy_text + + def quad_area(self, quad): + """ + compute area of a quad. 
+ """ + edge = [ + (quad[1][0] - quad[0][0]) * (quad[1][1] + quad[0][1]), + (quad[2][0] - quad[1][0]) * (quad[2][1] + quad[1][1]), + (quad[3][0] - quad[2][0]) * (quad[3][1] + quad[2][1]), + (quad[0][0] - quad[3][0]) * (quad[0][1] + quad[3][1]) + ] + return np.sum(edge) / 2. + + def nms(self, dets): + if self.is_python35: + import lanms + dets = lanms.merge_quadrangle_n9(dets, self.nms_thresh) + else: + dets = nms_locality(dets, self.nms_thresh) + return dets + + def cluster_by_quads_tco(self, tcl_map, tcl_map_thresh, quads, tco_map): + """ + Cluster pixels in tcl_map based on quads. + """ + instance_count = quads.shape[0] + 1 # contain background + instance_label_map = np.zeros(tcl_map.shape[:2], dtype=np.int32) + if instance_count == 1: + return instance_count, instance_label_map + + # predict text center + xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh) + n = xy_text.shape[0] + xy_text = xy_text[:, ::-1] # (n, 2) + tco = tco_map[xy_text[:, 1], xy_text[:, 0], :] # (n, 2) + pred_tc = xy_text - tco + + # get gt text center + m = quads.shape[0] + gt_tc = np.mean(quads, axis=1) # (m, 2) + + pred_tc_tile = np.tile(pred_tc[:, np.newaxis, :], (1, m, 1)) # (n, m, 2) + gt_tc_tile = np.tile(gt_tc[np.newaxis, :, :], (n, 1, 1)) # (n, m, 2) + dist_mat = np.linalg.norm(pred_tc_tile - gt_tc_tile, axis=2) # (n, m) + xy_text_assign = np.argmin(dist_mat, axis=1) + 1 # (n,) + + instance_label_map[xy_text[:, 1], xy_text[:, 0]] = xy_text_assign + return instance_count, instance_label_map + + def estimate_sample_pts_num(self, quad, xy_text): + """ + Estimate sample points number. + """ + eh = (np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - quad[2])) / 2.0 + ew = (np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[2] - quad[3])) / 2.0 + + dense_sample_pts_num = max(2, int(ew)) + dense_xy_center_line = xy_text[np.linspace(0, xy_text.shape[0] - 1, dense_sample_pts_num, + endpoint=True, dtype=np.float32).astype(np.int32)] + + dense_xy_center_line_diff = dense_xy_center_line[1:] - dense_xy_center_line[:-1] + estimate_arc_len = np.sum(np.linalg.norm(dense_xy_center_line_diff, axis=1)) + + sample_pts_num = max(2, int(estimate_arc_len / eh)) + return sample_pts_num + + def detect_sast(self, tcl_map, tvo_map, tbo_map, tco_map, ratio_w, ratio_h, src_w, src_h, + shrink_ratio_of_width=0.3, tcl_map_thresh=0.5, offset_expand=1.0, out_strid=4.0): + """ + first resize the tcl_map, tvo_map and tbo_map to the input_size, then restore the polys + """ + # restore quad + scores, quads, xy_text = self.restore_quad(tcl_map, tcl_map_thresh, tvo_map) + dets = np.hstack((quads, scores)).astype(np.float32, copy=False) + dets = self.nms(dets) + if dets.shape[0] == 0: + return [] + quads = dets[:, :-1].reshape(-1, 4, 2) + + # Compute quad area + quad_areas = [] + for quad in quads: + quad_areas.append(-self.quad_area(quad)) + + # instance segmentation + # instance_count, instance_label_map = cv2.connectedComponents(tcl_map.astype(np.uint8), connectivity=8) + instance_count, instance_label_map = self.cluster_by_quads_tco(tcl_map, tcl_map_thresh, quads, tco_map) + + # restore single poly with tcl instance. 
+ poly_list = [] + for instance_idx in range(1, instance_count): + xy_text = np.argwhere(instance_label_map == instance_idx)[:, ::-1] + quad = quads[instance_idx - 1] + q_area = quad_areas[instance_idx - 1] + if q_area < 5: + continue + + # + len1 = float(np.linalg.norm(quad[0] - quad[1])) + len2 = float(np.linalg.norm(quad[1] - quad[2])) + min_len = min(len1, len2) + if min_len < 3: + continue + + # filter small CC + if xy_text.shape[0] <= 0: + continue + + # filter low confidence instance + xy_text_scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0] + if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.1: + # if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.05: + continue + + # sort xy_text + left_center_pt = np.array([[(quad[0, 0] + quad[-1, 0]) / 2.0, + (quad[0, 1] + quad[-1, 1]) / 2.0]]) # (1, 2) + right_center_pt = np.array([[(quad[1, 0] + quad[2, 0]) / 2.0, + (quad[1, 1] + quad[2, 1]) / 2.0]]) # (1, 2) + proj_unit_vec = (right_center_pt - left_center_pt) / \ + (np.linalg.norm(right_center_pt - left_center_pt) + 1e-6) + proj_value = np.sum(xy_text * proj_unit_vec, axis=1) + xy_text = xy_text[np.argsort(proj_value)] + + # Sample pts in tcl map + if self.sample_pts_num == 0: + sample_pts_num = self.estimate_sample_pts_num(quad, xy_text) + else: + sample_pts_num = self.sample_pts_num + xy_center_line = xy_text[np.linspace(0, xy_text.shape[0] - 1, sample_pts_num, + endpoint=True, dtype=np.float32).astype(np.int32)] + + point_pair_list = [] + for x, y in xy_center_line: + # get corresponding offset + offset = tbo_map[y, x, :].reshape(2, 2) + if offset_expand != 1.0: + offset_length = np.linalg.norm(offset, axis=1, keepdims=True) + expand_length = np.clip(offset_length * (offset_expand - 1), a_min=0.5, a_max=3.0) + offset_detal = offset / offset_length * expand_length + offset = offset + offset_detal + # original point + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = (ori_yx + offset)[:, ::-1] * out_strid / np.array([ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + # ndarry: (x, 2), expand poly along width + detected_poly = self.point_pair2poly(point_pair_list) + detected_poly = self.expand_poly_along_width(detected_poly, shrink_ratio_of_width) + detected_poly[:, 0] = np.clip(detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip(detected_poly[:, 1], a_min=0, a_max=src_h) + poly_list.append(detected_poly) + + return poly_list + + def __call__(self, outs_dict, shape_list): + score_list = outs_dict['f_score'] + border_list = outs_dict['f_border'] + tvo_list = outs_dict['f_tvo'] + tco_list = outs_dict['f_tco'] + if isinstance(score_list, torch.Tensor): + score_list = score_list.cpu().numpy() + border_list = border_list.cpu().numpy() + tvo_list = tvo_list.cpu().numpy() + tco_list = tco_list.cpu().numpy() + + img_num = len(shape_list) + poly_lists = [] + for ino in range(img_num): + p_score = score_list[ino].transpose((1, 2, 0)) + p_border = border_list[ino].transpose((1, 2, 0)) + p_tvo = tvo_list[ino].transpose((1, 2, 0)) + p_tco = tco_list[ino].transpose((1, 2, 0)) + src_h, src_w, ratio_h, ratio_w = shape_list[ino] + + poly_list = self.detect_sast(p_score, p_tvo, p_border, p_tco, ratio_w, ratio_h, src_w, src_h, + shrink_ratio_of_width=self.shrink_ratio_of_width, + tcl_map_thresh=self.tcl_map_thresh, offset_expand=self.expand_scale) + poly_lists.append({'points': np.array(poly_list)}) + + return poly_lists + diff --git a/batch_running_task/pytorchocr/pytorchocr_utility.py 
b/batch_running_task/pytorchocr/pytorchocr_utility.py new file mode 100644 index 0000000..8a1fc31 --- /dev/null +++ b/batch_running_task/pytorchocr/pytorchocr_utility.py @@ -0,0 +1,555 @@ +import os, sys +import math +import numpy as np +import cv2 +from PIL import Image, ImageDraw, ImageFont +import argparse + +def init_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + # params for prediction engine + parser.add_argument("--use_gpu", type=str2bool, default=True) + # parser.add_argument("--ir_optim", type=str2bool, default=True) + # parser.add_argument("--use_tensorrt", type=str2bool, default=False) + # parser.add_argument("--use_fp16", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=500) + parser.add_argument("--warmup", type=str2bool, default=False) + + # params for text detector + parser.add_argument("--image_dir", type=str) + parser.add_argument("--det_algorithm", type=str, default='DB') + parser.add_argument("--det_model_path", type=str) + parser.add_argument("--det_limit_side_len", type=float, default=960) + parser.add_argument("--det_limit_type", type=str, default='max') + + # DB parmas + parser.add_argument("--det_db_thresh", type=float, default=0.3) + parser.add_argument("--det_db_box_thresh", type=float, default=0.6) + parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) + parser.add_argument("--max_batch_size", type=int, default=10) + parser.add_argument("--use_dilation", type=str2bool, default=False) + parser.add_argument("--det_db_score_mode", type=str, default="fast") + + # EAST parmas + parser.add_argument("--det_east_score_thresh", type=float, default=0.8) + parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) + parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) + + # SAST parmas + parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) + parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) + parser.add_argument("--det_sast_polygon", type=str2bool, default=False) + + # PSE parmas + parser.add_argument("--det_pse_thresh", type=float, default=0) + parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) + parser.add_argument("--det_pse_min_area", type=float, default=16) + parser.add_argument("--det_pse_box_type", type=str, default='box') + parser.add_argument("--det_pse_scale", type=int, default=1) + + # FCE parmas + parser.add_argument("--scales", type=list, default=[8, 16, 32]) + parser.add_argument("--alpha", type=float, default=1.0) + parser.add_argument("--beta", type=float, default=1.0) + parser.add_argument("--fourier_degree", type=int, default=5) + parser.add_argument("--det_fce_box_type", type=str, default='poly') + + # params for text recognizer + parser.add_argument("--rec_algorithm", type=str, default='CRNN') + parser.add_argument("--rec_model_path", type=str) + parser.add_argument("--rec_image_inverse", type=str2bool, default=True) + parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320") + parser.add_argument("--rec_char_type", type=str, default='ch') + parser.add_argument("--rec_batch_num", type=int, default=6) + parser.add_argument("--max_text_length", type=int, default=25) + + parser.add_argument("--use_space_char", type=str2bool, default=True) + parser.add_argument("--drop_score", type=float, default=0.5) + parser.add_argument("--limited_max_width", type=int, default=1280) + parser.add_argument("--limited_min_width", type=int, default=16) + + 
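+    # params for visualization font and recognition dictionary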
parser.add_argument( + "--vis_font_path", type=str, + default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf')) + parser.add_argument( + "--rec_char_dict_path", + type=str, + default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + 'pytorchocr/utils/ppocr_keys_v1.txt')) + + # params for text classifier + parser.add_argument("--use_angle_cls", type=str2bool, default=False) + parser.add_argument("--cls_model_path", type=str) + parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") + parser.add_argument("--label_list", type=list, default=['0', '180']) + parser.add_argument("--cls_batch_num", type=int, default=6) + parser.add_argument("--cls_thresh", type=float, default=0.9) + + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--use_pdserving", type=str2bool, default=False) + + # params for e2e + parser.add_argument("--e2e_algorithm", type=str, default='PGNet') + parser.add_argument("--e2e_model_path", type=str) + parser.add_argument("--e2e_limit_side_len", type=float, default=768) + parser.add_argument("--e2e_limit_type", type=str, default='max') + + # PGNet parmas + parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) + parser.add_argument( + "--e2e_char_dict_path", type=str, + default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + 'pytorchocr/utils/ic15_dict.txt')) + parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') + parser.add_argument("--e2e_pgnet_polygon", type=bool, default=True) + parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') + + # SR parmas + parser.add_argument("--sr_model_path", type=str) + parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") + parser.add_argument("--sr_batch_num", type=int, default=1) + + # params .yaml + parser.add_argument("--det_yaml_path", type=str, default=None) + parser.add_argument("--rec_yaml_path", type=str, default=None) + parser.add_argument("--cls_yaml_path", type=str, default=None) + parser.add_argument("--e2e_yaml_path", type=str, default=None) + parser.add_argument("--sr_yaml_path", type=str, default=None) + + # multi-process + parser.add_argument("--use_mp", type=str2bool, default=False) + parser.add_argument("--total_process_num", type=int, default=1) + parser.add_argument("--process_id", type=int, default=0) + + parser.add_argument("--benchmark", type=str2bool, default=False) + parser.add_argument("--save_log_path", type=str, default="./log_output/") + + parser.add_argument("--show_log", type=str2bool, default=True) + + return parser + +def parse_args(): + parser = init_args() + return parser.parse_args() + +def get_default_config(args): + return vars(args) + + +def read_network_config_from_yaml(yaml_path, char_num=None): + if not os.path.exists(yaml_path): + raise FileNotFoundError('{} is not existed.'.format(yaml_path)) + import yaml + with open(yaml_path, encoding='utf-8') as f: + res = yaml.safe_load(f) + if res.get('Architecture') is None: + raise ValueError('{} has no Architecture'.format(yaml_path)) + if res['Architecture']['Head']['name'] == 'MultiHead' and char_num is not None: + res['Architecture']['Head']['out_channels_list'] = { + 'CTCLabelDecode': char_num, + 'SARLabelDecode': char_num + 2, + 'NRTRLabelDecode': char_num + 3 + } + return res['Architecture'] + +def AnalysisConfig(weights_path, yaml_path=None, char_num=None): + if not 
os.path.exists(os.path.abspath(weights_path)): + raise FileNotFoundError('{} is not found.'.format(weights_path)) + + if yaml_path is not None: + return read_network_config_from_yaml(yaml_path, char_num=char_num) + + weights_basename = os.path.basename(weights_path) + weights_name = weights_basename.lower() + + # supported_weights = ['ch_ptocr_server_v2.0_det_infer.pth', + # 'ch_ptocr_server_v2.0_rec_infer.pth', + # 'ch_ptocr_mobile_v2.0_det_infer.pth', + # 'ch_ptocr_mobile_v2.0_rec_infer.pth', + # 'ch_ptocr_mobile_v2.0_cls_infer.pth', + # ] + # assert weights_name in supported_weights, \ + # "supported weights are {} but input weights is {}".format(supported_weights, weights_name) + + if weights_name == 'ch_ptocr_server_v2.0_det_infer.pth': + network_config = {'model_type':'det', + 'algorithm':'DB', + 'Transform':None, + 'Backbone':{'name':'ResNet_vd', 'layers':18, 'disable_se':True}, + 'Neck':{'name':'DBFPN', 'out_channels':256}, + 'Head':{'name':'DBHead', 'k':50}} + + elif weights_name == 'ch_ptocr_server_v2.0_rec_infer.pth': + network_config = {'model_type':'rec', + 'algorithm':'CRNN', + 'Transform':None, + 'Backbone':{'name':'ResNet', 'layers':34}, + 'Neck':{'name':'SequenceEncoder', 'hidden_size':256, 'encoder_type':'rnn'}, + 'Head':{'name':'CTCHead', 'fc_decay': 4e-05}} + + elif weights_name in ['ch_ptocr_mobile_v2.0_det_infer.pth']: + network_config = {'model_type': 'det', + 'algorithm': 'DB', + 'Transform': None, + 'Backbone': {'name': 'MobileNetV3', 'model_name': 'large', 'scale': 0.5, 'disable_se': True}, + 'Neck': {'name': 'DBFPN', 'out_channels': 96}, + 'Head': {'name': 'DBHead', 'k': 50}} + + elif weights_name =='ch_ptocr_mobile_v2.0_rec_infer.pth': + network_config = {'model_type':'rec', + 'algorithm':'CRNN', + 'Transform':None, + 'Backbone':{'model_name':'small', 'name':'MobileNetV3', 'scale':0.5, 'small_stride':[1,2,2,2]}, + 'Neck':{'name':'SequenceEncoder', 'hidden_size':48, 'encoder_type':'rnn'}, + 'Head':{'name':'CTCHead', 'fc_decay': 4e-05}} + + elif weights_name == 'ch_ptocr_mobile_v2.0_cls_infer.pth': + network_config = {'model_type':'cls', + 'algorithm':'CLS', + 'Transform':None, + 'Backbone':{'name':'MobileNetV3', 'model_name':'small', 'scale':0.35}, + 'Neck':None, + 'Head':{'name':'ClsHead', 'class_dim':2}} + + elif weights_name == 'ch_ptocr_v2_rec_infer.pth': + network_config = {'model_type': 'rec', + 'algorithm': 'CRNN', + 'Transform': None, + 'Backbone': {'name': 'MobileNetV1Enhance', 'scale': 0.5}, + 'Neck': {'name': 'SequenceEncoder', 'hidden_size': 64, 'encoder_type': 'rnn'}, + 'Head': {'name': 'CTCHead', 'mid_channels': 96, 'fc_decay': 2e-05}} + + elif weights_name == 'ch_ptocr_v2_det_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'DB', + 'Transform': None, + 'Backbone': {'name': 'MobileNetV3', 'model_name': 'large', 'scale': 0.5, 'disable_se': True}, + 'Neck': {'name': 'DBFPN', 'out_channels': 96}, + 'Head': {'name': 'DBHead', 'k': 50}} + + elif weights_name == 'ch_ptocr_v3_rec_infer.pth': + network_config = {'model_type':'rec', + 'algorithm':'CRNN', + 'Transform':None, + 'Backbone':{'name':'MobileNetV1Enhance', + 'scale':0.5, + 'last_conv_stride': [1, 2], + 'last_pool_type': 'avg'}, + 'Neck':{'name':'SequenceEncoder', + 'dims': 64, + 'depth': 2, + 'hidden_dims': 120, + 'use_guide': True, + 'encoder_type':'svtr'}, + 'Head':{'name':'CTCHead', 'fc_decay': 2e-05} + } + + elif weights_name == 'ch_ptocr_v3_det_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'DB', + 'Transform': None, + 'Backbone': {'name': 'MobileNetV3', 
'model_name': 'large', 'scale': 0.5, 'disable_se': True}, + 'Neck': {'name': 'RSEFPN', 'out_channels': 96, 'shortcut': True}, + 'Head': {'name': 'DBHead', 'k': 50}} + + elif weights_name == 'det_mv3_db_v2.0_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'DB', + 'Transform': None, + 'Backbone': {'name': 'MobileNetV3', 'model_name': 'large'}, + 'Neck': {'name': 'DBFPN', 'out_channels': 256}, + 'Head': {'name': 'DBHead', 'k': 50}} + + elif weights_name == 'det_r50_vd_db_v2.0_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'DB', + 'Transform': None, + 'Backbone': {'name': 'ResNet_vd', 'layers': 50}, + 'Neck': {'name': 'DBFPN', 'out_channels': 256}, + 'Head': {'name': 'DBHead', 'k': 50}} + + elif weights_name == 'det_mv3_east_v2.0_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'EAST', + 'Transform': None, + 'Backbone': {'name': 'MobileNetV3', 'model_name': 'large'}, + 'Neck': {'name': 'EASTFPN', 'model_name': 'small'}, + 'Head': {'name': 'EASTHead', 'model_name': 'small'}} + + elif weights_name == 'det_r50_vd_east_v2.0_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'EAST', + 'Transform': None, + 'Backbone': {'name': 'ResNet_vd', 'layers': 50}, + 'Neck': {'name': 'EASTFPN', 'model_name': 'large'}, + 'Head': {'name': 'EASTHead', 'model_name': 'large'}} + + elif weights_name == 'det_r50_vd_sast_icdar15_v2.0_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'SAST', + 'Transform': None, + 'Backbone': {'name': 'ResNet_SAST', 'layers': 50}, + 'Neck': {'name': 'SASTFPN', 'with_cab': True}, + 'Head': {'name': 'SASTHead'}} + + elif weights_name == 'det_r50_vd_sast_totaltext_v2.0_infer.pth': + network_config = {'model_type': 'det', + 'algorithm': 'SAST', + 'Transform': None, + 'Backbone': {'name': 'ResNet_SAST', 'layers': 50}, + 'Neck': {'name': 'SASTFPN', 'with_cab': True}, + 'Head': {'name': 'SASTHead'}} + + elif weights_name == 'en_server_pgneta_infer.pth': + network_config = {'model_type': 'e2e', + 'algorithm': 'PGNet', + 'Transform': None, + 'Backbone': {'name': 'ResNet', 'layers': 50}, + 'Neck': {'name': 'PGFPN'}, + 'Head': {'name': 'PGHead'}} + + elif weights_name == 'en_ptocr_mobile_v2.0_table_det_infer.pth': + network_config = {'model_type': 'det','algorithm': 'DB', + 'Transform': None, + 'Backbone': {'name': 'MobileNetV3', 'model_name': 'large', 'scale': 0.5, 'disable_se': False}, + 'Neck': {'name': 'DBFPN', 'out_channels': 96}, + 'Head': {'name': 'DBHead', 'k': 50}} + + elif weights_name == 'en_ptocr_mobile_v2.0_table_rec_infer.pth': + network_config = {'model_type': 'rec', + 'algorithm': 'CRNN', + 'Transform': None, + 'Backbone': {'model_name': 'large', 'name': 'MobileNetV3', }, + 'Neck': {'name': 'SequenceEncoder', 'hidden_size': 96, 'encoder_type': 'rnn'}, + 'Head': {'name': 'CTCHead', 'fc_decay': 4e-05}} + + elif 'om_' in weights_name and '_rec_' in weights_name: + network_config = {'model_type': 'rec', + 'algorithm': 'CRNN', + 'Transform': None, + 'Backbone': {'model_name': 'small', 'name': 'MobileNetV3', 'scale': 0.5, + 'small_stride': [1, 2, 2, 2]}, + 'Neck': {'name': 'SequenceEncoder', 'hidden_size': 48, 'encoder_type': 'om'}, + 'Head': {'name': 'CTCHead', 'fc_decay': 4e-05}} + + else: + network_config = {'model_type': 'rec', + 'algorithm': 'CRNN', + 'Transform': None, + 'Backbone': {'model_name': 'small', 'name': 'MobileNetV3', 'scale': 0.5, + 'small_stride': [1, 2, 2, 2]}, + 'Neck': {'name': 'SequenceEncoder', 'hidden_size': 48, 'encoder_type': 'rnn'}, + 'Head': {'name': 'CTCHead', 
'fc_decay': 4e-05}} + # raise NotImplementedError + + return network_config + + +def draw_e2e_res(dt_boxes, strs, img_path): + src_im = cv2.imread(img_path) + for box, str in zip(dt_boxes, strs): + box = box.astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + cv2.putText( + src_im, + str, + org=(int(box[0, 0, 0]), int(box[0, 0, 1])), + fontFace=cv2.FONT_HERSHEY_COMPLEX, + fontScale=0.7, + color=(0, 255, 0), + thickness=1) + return src_im + + +def draw_text_det_res(dt_boxes, img_path): + src_im = cv2.imread(img_path) + for box in dt_boxes: + box = np.array(box).astype(np.int32).reshape(-1, 2) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + return src_im + + +def resize_img(img, input_size=600): + """ + resize img and limit the longest side of the image to input_size + """ + img = np.array(img) + im_shape = img.shape + im_size_max = np.max(im_shape[0:2]) + im_scale = float(input_size) / float(im_size_max) + img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) + return img + + +def draw_ocr_box_txt(image, + boxes, + txts, + scores=None, + drop_score=0.5, + font_path="./doc/simfang.ttf"): + h, w = image.height, image.width + img_left = image.copy() + img_right = Image.new('RGB', (w, h), (255, 255, 255)) + + import random + + random.seed(0) + draw_left = ImageDraw.Draw(img_left) + draw_right = ImageDraw.Draw(img_right) + for idx, (box, txt) in enumerate(zip(boxes, txts)): + if scores is not None and scores[idx] < drop_score: + continue + color = (random.randint(0, 255), random.randint(0, 255), + random.randint(0, 255)) + draw_left.polygon(box, fill=color) + draw_right.polygon( + [ + box[0][0], box[0][1], box[1][0], box[1][1], box[2][0], + box[2][1], box[3][0], box[3][1] + ], + outline=color) + box_height = math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][ + 1])**2) + box_width = math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][ + 1])**2) + if box_height > 2 * box_width: + font_size = max(int(box_width * 0.9), 10) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + cur_y = box[0][1] + for c in txt: + char_size = font.getsize(c) + draw_right.text( + (box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font) + cur_y += char_size[1] + else: + font_size = max(int(box_height * 0.8), 10) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + draw_right.text( + [box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font) + img_left = Image.blend(image, img_left, 0.5) + img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) + img_show.paste(img_left, (0, 0, w, h)) + img_show.paste(img_right, (w, 0, w * 2, h)) + return np.array(img_show) + + +def str_count(s): + """ + Count the number of Chinese characters, + a single English character and a single number + equal to half the length of Chinese characters. 
+ args: + s(string): the input of string + return(int): + the number of Chinese characters + """ + import string + count_zh = count_pu = 0 + s_len = len(s) + en_dg_count = 0 + for c in s: + if c in string.ascii_letters or c.isdigit() or c.isspace(): + en_dg_count += 1 + elif c.isalpha(): + count_zh += 1 + else: + count_pu += 1 + return s_len - math.ceil(en_dg_count / 2) + + +def text_visual(texts, + scores, + img_h=400, + img_w=600, + threshold=0., + font_path="./doc/simfang.ttf"): + """ + create new blank img and draw txt on it + args: + texts(list): the text will be draw + scores(list|None): corresponding score of each txt + img_h(int): the height of blank img + img_w(int): the width of blank img + font_path: the path of font which is used to draw text + return(array): + """ + if scores is not None: + assert len(texts) == len( + scores), "The number of txts and corresponding scores must match" + + def create_blank_img(): + blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 + blank_img[:, img_w - 1:] = 0 + blank_img = Image.fromarray(blank_img).convert("RGB") + draw_txt = ImageDraw.Draw(blank_img) + return blank_img, draw_txt + + blank_img, draw_txt = create_blank_img() + + font_size = 20 + txt_color = (0, 0, 0) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + + gap = font_size + 5 + txt_img_list = [] + count, index = 1, 0 + for idx, txt in enumerate(texts): + index += 1 + if scores[idx] < threshold or math.isnan(scores[idx]): + index -= 1 + continue + first_line = True + while str_count(txt) >= img_w // font_size - 4: + tmp = txt + txt = tmp[:img_w // font_size - 4] + if first_line: + new_txt = str(index) + ': ' + txt + first_line = False + else: + new_txt = ' ' + txt + draw_txt.text((0, gap * count), new_txt, txt_color, font=font) + txt = tmp[img_w // font_size - 4:] + if count >= img_h // gap - 1: + txt_img_list.append(np.array(blank_img)) + blank_img, draw_txt = create_blank_img() + count = 0 + count += 1 + if first_line: + new_txt = str(index) + ': ' + txt + ' ' + '%.3f' % (scores[idx]) + else: + new_txt = " " + txt + " " + '%.3f' % (scores[idx]) + draw_txt.text((0, gap * count), new_txt, txt_color, font=font) + # whether add new blank img or not + if count >= img_h // gap - 1 and idx + 1 < len(texts): + txt_img_list.append(np.array(blank_img)) + blank_img, draw_txt = create_blank_img() + count = 0 + count += 1 + txt_img_list.append(np.array(blank_img)) + if len(txt_img_list) == 1: + blank_img = np.array(txt_img_list[0]) + else: + blank_img = np.concatenate(txt_img_list, axis=1) + return np.array(blank_img) + + +def base64_to_cv2(b64str): + import base64 + data = base64.b64decode(b64str.encode('utf8')) + data = np.fromstring(data, np.uint8) + data = cv2.imdecode(data, cv2.IMREAD_COLOR) + return data + + +def draw_boxes(image, boxes, scores=None, drop_score=0.5): + if scores is None: + scores = [1] * len(boxes) + for (box, score) in zip(boxes, scores): + if score < drop_score: + continue + box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) + image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) + return image \ No newline at end of file diff --git a/batch_running_task/pytorchocr/utils/EN_symbol_dict.txt b/batch_running_task/pytorchocr/utils/EN_symbol_dict.txt new file mode 100644 index 0000000..1aef43d --- /dev/null +++ b/batch_running_task/pytorchocr/utils/EN_symbol_dict.txt @@ -0,0 +1,94 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J 
+K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +{ +| +} +~ \ No newline at end of file diff --git a/batch_running_task/pytorchocr/utils/__init__.py b/batch_running_task/pytorchocr/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/batch_running_task/pytorchocr/utils/dict/ar_dict.txt b/batch_running_task/pytorchocr/utils/dict/ar_dict.txt new file mode 100644 index 0000000..fc63802 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ar_dict.txt @@ -0,0 +1,117 @@ +a +r +b +i +c +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +l +6 +3 +9 +. +j +p +ا +ل +م +ر +ج +و +ح +ي +ة +5 +8 +7 +أ +ب +ض +4 +ك +س +ه +ث +ن +ط +ع +ت +غ +خ +ف +ئ +ز +إ +د +ص +ظ +ذ +ش +ى +ق +ؤ +آ +ء +s +e +n +w +t +u +z +d +A +N +G +h +o +E +T +H +O +B +y +F +U +J +X +W +P +Z +M +k +q +Y +Q +D +f +K +x +' +% +- +# +@ +! +& +$ +, +: +é +? ++ +É +( + diff --git a/batch_running_task/pytorchocr/utils/dict/arabic_dict.txt b/batch_running_task/pytorchocr/utils/dict/arabic_dict.txt new file mode 100644 index 0000000..e97abf3 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/arabic_dict.txt @@ -0,0 +1,162 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ء +آ +أ +ؤ +إ +ئ +ا +ب +ة +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ف +ق +ك +ل +م +ن +ه +و +ى +ي +ً +ٌ +ٍ +َ +ُ +ِ +ّ +ْ +ٓ +ٔ +ٰ +ٱ +ٹ +پ +چ +ڈ +ڑ +ژ +ک +ڭ +گ +ں +ھ +ۀ +ہ +ۂ +ۃ +ۆ +ۇ +ۈ +ۋ +ی +ې +ے +ۓ +ە +١ +٢ +٣ +٤ +٥ +٦ +٧ +٨ +٩ diff --git a/batch_running_task/pytorchocr/utils/dict/be_dict.txt b/batch_running_task/pytorchocr/utils/dict/be_dict.txt new file mode 100644 index 0000000..f8458ba --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/be_dict.txt @@ -0,0 +1,145 @@ +b +e +_ +i +m +g +/ +2 +0 +I +L +S +V +R +C +1 +v +a +l +6 +9 +4 +3 +. +j +p +п +а +з +б +у +г +н +ц +ь +8 +м +л +і +о +ў +ы +7 +5 +М +х +с +р +ф +я +е +д +ж +ю +ч +й +к +Д +в +Б +т +І +ш +ё +э +К +Л +Н +А +Ж +Г +В +П +З +Е +О +Р +С +У +Ё +Й +Т +Ч +Э +Ц +Ю +Ш +Ф +Х +Я +Ь +Ы +Ў +s +c +n +w +M +o +t +T +E +A +B +u +h +y +k +r +H +d +Y +O +U +F +f +x +D +G +N +K +P +z +J +X +W +Z +Q +% +- +q +@ +' +! +# +& +, +: +$ +( +? +é ++ +É + diff --git a/batch_running_task/pytorchocr/utils/dict/bg_dict.txt b/batch_running_task/pytorchocr/utils/dict/bg_dict.txt new file mode 100644 index 0000000..84713c3 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/bg_dict.txt @@ -0,0 +1,140 @@ +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ь +ю +я + diff --git a/batch_running_task/pytorchocr/utils/dict/ch_tra_dict.txt b/batch_running_task/pytorchocr/utils/dict/ch_tra_dict.txt new file mode 100644 index 0000000..cc1aa47 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ch_tra_dict.txt @@ -0,0 +1,8421 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¥ +® +° +± +² +´ +· +» +É +Ë +Ó +× +Ü +à +á +ä +è +é +ì +í +ò +ó +÷ +ú +ü +ā +ē +ī +ō +ū +ǐ +ǒ +ɔ +ɡ +ʌ +ˋ +Λ +Ο +Φ +Ω +α +β +ε +θ +μ +π +З +И +Й +П +Я +г +— +‖ +‘ +’ +“ +” +• +… +‧ +′ +″ +※ +℃ +№ +™ +Ⅱ +Ⅲ +Ⅳ +← +↑ +→ +↓ +⇋ +∈ +∑ +√ +∞ +∣ +∧ +∩ +∫ +∶ +≈ +≠ +≤ +≥ +⊙ +⊥ +① +② +③ +④ +⑧ +⑴ +⑵ +⑶ +─ +│ +┅ +┌ +├ +█ +▎ +▏ +▕ +■ +□ +▪ +▲ +△ +▼ +◆ +◇ +○ +◎ +● +◥ +★ +☆ +❋ +❤ +  +、 +。 +〇 +〉 +《 +》 +「 +」 +『 +』 +【 +】 +〔 +〕 +〖 +〗 +の +サ +シ +ジ +マ +ㄱ +ㆍ +㎏ +㎡ +㐂 +㐱 +㙟 +㴪 +㸃 +䖝 +䝉 +䰾 +䲁 +一 +丁 +七 +丄 +丈 +三 +上 +下 +丌 +不 +与 +丏 +丐 +丑 +且 +丕 +世 +丘 +丙 +丞 +丟 +両 +並 +丨 +丫 +中 +丰 +串 +丶 +丸 +丹 +主 +丼 +丿 +乂 +乃 +久 +么 +之 +乍 +乎 +乏 +乒 +乓 +乖 +乗 +乘 +乙 +乚 +乜 +九 +乞 +也 +乩 +乭 +乳 +乸 +乹 +乾 +亀 +亂 +亅 +了 +予 +亊 +事 +二 +亍 +云 +互 +亓 +五 +井 +亘 +些 +亜 +亞 +亟 +亠 +亡 +亢 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +亰 +亳 +亶 +亹 +人 +亻 +什 +仁 +仂 +仃 +仄 +仇 +仉 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +仛 +仝 +仞 +仟 +仡 +代 +令 +以 +仨 +仫 +仮 +仰 +仲 +仳 +仵 +件 +仺 +任 +仼 +份 +仿 +企 +伃 +伈 +伉 +伊 +伋 +伍 +伎 +伏 +伐 +休 +伕 +伙 +伝 +伢 +伯 +估 +伱 +伴 +伶 +伷 +伸 +伺 +似 +伽 +伾 +佀 +佁 +佃 +但 +佇 +佈 +佉 +佋 +位 +低 +住 +佐 +佑 +体 +佔 +何 +佗 +佘 +余 +佚 +佛 +作 +佝 +佞 +佟 +你 +佣 +佤 +佧 +佩 +佬 +佯 +佰 +佳 +併 +佶 +佹 +佺 +佼 +佾 +使 +侁 +侃 +侄 +侅 +來 +侈 +侊 +例 +侍 +侏 +侑 +侖 +侗 +侘 +侚 +供 +依 +侞 +価 +侮 +侯 +侵 +侶 +侷 +侹 +便 +俁 +係 +促 +俄 +俅 +俊 +俋 +俌 +俍 +俎 +俏 +俐 +俑 +俗 +俘 +俚 +俛 +保 +俞 +俟 +俠 +信 +俬 +修 +俯 +俱 +俳 +俴 +俵 +俶 +俸 +俺 +俽 +俾 +倆 +倈 +倉 +個 +倌 +倍 +們 +倒 +倓 +倔 +倖 +倗 +倘 +候 +倚 +倜 +倞 +借 +倡 +倢 +倣 +値 +倦 +倧 +倩 +倪 +倫 +倬 +倭 +倮 +倻 +值 +偁 +偃 +假 +偈 +偉 +偊 +偌 +偍 +偎 +偏 +偓 +偕 +做 +停 +健 +偪 +偲 +側 +偵 +偶 +偷 +偸 +偽 +傀 +傃 +傅 +傈 +傉 +傍 +傑 +傒 +傕 +傖 +傘 +備 +傜 +傢 +傣 +催 +傭 +傲 +傳 +債 +傷 +傻 +傾 +僅 +僉 +僊 +働 +像 +僑 +僔 +僕 +僖 +僙 +僚 +僜 +僡 +僧 +僩 +僭 +僮 +僰 +僱 +僳 +僴 +僵 +價 +僻 +儀 +儁 +儂 +億 +儆 +儇 +儈 +儉 +儋 +儐 +儒 +儔 +儕 +儘 +儚 +儞 +償 +儡 +儥 +儦 +優 +儫 +儱 +儲 +儷 +儺 +儻 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +児 +兒 +兔 +兕 +兗 +兜 +入 +內 +全 +兩 +兪 +八 +公 +六 +兮 +共 +兵 +其 +具 +典 +兼 +兿 +冀 +冂 +円 +冇 +冉 +冊 +再 +冏 +冑 +冒 +冕 +冖 +冗 +冚 +冠 +冢 +冤 +冥 +冧 +冨 +冪 +冫 +冬 +冮 +冰 +冴 +冶 +冷 +冼 +冽 +凃 +凄 +准 +凈 +凋 +凌 +凍 +凖 +凜 +凝 +凞 +几 +凡 +処 +凪 +凬 +凰 +凱 +凳 +凵 +凶 +凸 +凹 +出 +函 +刀 +刁 +刂 +刃 +刄 +分 +切 +刈 +刊 +刎 +刑 +划 +列 +初 +判 +別 +刦 +刧 +刨 +利 +刪 +刮 +到 +制 +刷 +券 +刺 +刻 +刼 +剁 +剃 +則 +削 +剋 +剌 +前 +剎 +剏 +剔 +剖 +剛 +剝 +剡 +剣 +剩 +剪 +剮 +副 +割 +創 +剿 +劃 +劄 +劇 +劈 +劉 +劊 +劌 +劍 +劑 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劬 +劭 +劵 +効 +劼 +劾 +勁 +勃 +勅 +勇 +勉 +勐 +勑 +勒 +勔 +動 +勖 +勗 +勘 +務 +勛 +勝 +勞 +募 +勢 +勣 +勤 +勦 +勰 +勱 +勲 +勳 +勵 +勷 +勸 +勺 +勻 +勾 +勿 +匂 +匄 +包 +匆 +匈 +匋 +匍 +匏 +匐 +匕 +化 +北 +匙 +匚 +匝 +匠 +匡 +匣 +匪 +匯 +匱 +匸 +匹 +匾 +匿 +區 +十 +千 +卅 +升 +午 +卉 +半 +卋 +卍 +卐 +卑 +卒 +卓 +協 +南 +博 +卜 +卞 +卟 +占 +卡 +卣 +卦 +卧 +卩 +卬 +卮 +卯 +印 +危 +卲 +即 +卵 +卷 +卸 +卹 +卺 +卻 +卽 +卿 +厄 +厓 +厔 +厙 +厚 +厝 +原 +厥 +厭 +厰 +厲 +厴 +厶 +去 +參 +叄 +又 +叉 +及 +友 +反 +収 +叔 +叕 +取 +受 +叛 +叟 +叡 +叢 +口 +古 +句 +另 +叨 +叩 +只 +叫 +召 +叭 +叮 +可 +台 +叱 +史 +右 +叵 +司 +叻 +叼 +吁 +吃 +各 +吆 +合 +吉 +吊 +吋 +同 +名 +后 +吏 +吐 +向 +吒 +吔 +吖 +君 +吝 +吞 +吟 +吠 +吡 +吥 +否 +吧 +吩 +含 +吮 +吱 +吲 +吳 +吵 +吶 +吸 +吹 +吻 +吼 +吾 +呀 +呂 +呃 +呈 +呉 +告 +呋 +呎 +呢 +呤 +呦 +周 +呱 +味 +呵 +呷 +呸 +呼 +命 +呾 +咀 +咁 +咂 +咄 +咅 +咆 +咋 +和 +咎 +咑 +咒 +咔 +咕 +咖 +咗 +咘 +咚 +咟 +咤 +咥 +咧 +咨 +咩 +咪 +咫 +咬 +咭 +咯 +咱 +咲 +咳 +咸 +咻 +咼 +咽 +咾 +咿 +哀 +品 +哂 +哄 +哆 +哇 +哈 +哉 +哌 +哎 +哏 +哐 +哖 +哚 +哞 +員 +哥 +哦 +哨 +哩 +哪 +哭 +哮 +哱 +哲 +哺 +哼 +唃 +唄 +唆 +唇 +唉 +唏 +唐 +唑 +唔 +唘 +唧 +唫 +唬 +唭 +售 +唯 +唱 +唳 +唵 +唷 +唸 +唻 +唾 +啁 +啃 +啄 +商 +啉 +啊 +啍 +問 +啓 +啖 +啚 +啜 +啞 +啟 +啡 +啣 +啤 +啥 +啦 +啪 +啫 +啯 +啰 +啱 +啲 +啵 +啶 +啷 +啻 +啼 +啾 +喀 +喂 +喃 +善 +喆 +喇 +喈 +喉 +喊 +喋 +喏 +喔 +喘 +喙 +喚 +喜 +喝 +喢 +喦 +喧 +喪 +喫 +喬 +單 +喰 +喱 +喲 +喳 +喵 +喹 +喻 +喼 +嗄 +嗅 +嗆 +嗇 +嗊 +嗎 +嗑 +嗒 +嗓 +嗔 +嗖 +嗚 +嗜 +嗝 +嗞 +嗡 +嗢 +嗣 +嗦 +嗨 +嗩 +嗪 +嗮 +嗯 +嗲 +嗶 +嗹 +嗽 +嘀 +嘅 +嘆 +嘉 +嘌 +嘍 +嘎 +嘏 +嘔 +嘗 +嘚 +嘛 +嘜 +嘞 +嘟 +嘢 +嘣 +嘥 +嘧 +嘩 +嘬 +嘮 +嘯 +嘰 +嘲 +嘴 
+嘶 +嘸 +嘹 +嘻 +嘿 +噁 +噌 +噍 +噏 +噓 +噗 +噝 +噠 +噢 +噤 +噥 +噦 +器 +噩 +噪 +噬 +噯 +噰 +噲 +噴 +噶 +噸 +噹 +噻 +嚇 +嚈 +嚎 +嚏 +嚐 +嚒 +嚓 +嚕 +嚗 +嚙 +嚞 +嚟 +嚤 +嚦 +嚧 +嚨 +嚩 +嚮 +嚳 +嚴 +嚶 +嚷 +嚼 +嚿 +囀 +囂 +囃 +囉 +囊 +囍 +囑 +囒 +囓 +囗 +囚 +四 +囝 +回 +因 +囡 +団 +囤 +囧 +囪 +囮 +囯 +困 +囲 +図 +囶 +囷 +囹 +固 +囿 +圂 +圃 +圄 +圈 +圉 +國 +圍 +圏 +園 +圓 +圖 +圗 +團 +圜 +土 +圧 +在 +圩 +圪 +圭 +圯 +地 +圳 +圻 +圾 +址 +均 +坊 +坋 +坌 +坍 +坎 +坐 +坑 +坖 +坡 +坣 +坤 +坦 +坨 +坩 +坪 +坫 +坬 +坭 +坮 +坯 +坳 +坵 +坶 +坷 +坻 +垂 +垃 +垈 +型 +垍 +垓 +垕 +垚 +垛 +垞 +垟 +垠 +垢 +垣 +垮 +垯 +垰 +垵 +垸 +垻 +垿 +埃 +埅 +埇 +埈 +埋 +埌 +城 +埏 +埒 +埔 +埕 +埗 +埜 +域 +埠 +埡 +埤 +埧 +埨 +埪 +埭 +埮 +埴 +埵 +執 +培 +基 +埻 +埼 +堀 +堂 +堃 +堅 +堆 +堇 +堈 +堉 +堊 +堍 +堖 +堝 +堡 +堤 +堦 +堪 +堮 +堯 +堰 +報 +場 +堵 +堷 +堺 +塀 +塅 +塆 +塊 +塋 +塌 +塍 +塏 +塑 +塔 +塗 +塘 +塙 +塜 +塞 +塡 +塢 +塤 +塨 +塩 +填 +塬 +塭 +塰 +塱 +塲 +塵 +塹 +塽 +塾 +墀 +境 +墅 +墉 +墊 +墎 +墓 +増 +墘 +墜 +增 +墟 +墡 +墣 +墨 +墩 +墫 +墬 +墮 +墱 +墳 +墺 +墼 +墾 +壁 +壄 +壆 +壇 +壋 +壌 +壎 +壐 +壑 +壓 +壔 +壕 +壘 +壙 +壞 +壟 +壠 +壢 +壤 +壩 +士 +壬 +壯 +壱 +壴 +壹 +壺 +壽 +夀 +夆 +変 +夊 +夋 +夌 +夏 +夔 +夕 +外 +夙 +多 +夜 +夠 +夢 +夤 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +夯 +失 +夷 +夾 +奀 +奄 +奇 +奈 +奉 +奎 +奏 +奐 +契 +奓 +奔 +奕 +套 +奘 +奚 +奠 +奢 +奣 +奧 +奩 +奪 +奫 +奭 +奮 +女 +奴 +奶 +她 +好 +妀 +妁 +如 +妃 +妄 +妊 +妍 +妏 +妑 +妒 +妓 +妖 +妙 +妝 +妞 +妠 +妤 +妥 +妧 +妨 +妭 +妮 +妯 +妲 +妳 +妸 +妹 +妺 +妻 +妾 +姀 +姁 +姃 +姆 +姈 +姉 +姊 +始 +姌 +姍 +姐 +姑 +姒 +姓 +委 +姚 +姜 +姝 +姣 +姥 +姦 +姨 +姪 +姫 +姬 +姮 +姵 +姶 +姸 +姻 +姿 +威 +娃 +娉 +娋 +娌 +娍 +娎 +娑 +娖 +娘 +娛 +娜 +娟 +娠 +娣 +娥 +娩 +娫 +娳 +娶 +娸 +娼 +娽 +婀 +婁 +婆 +婉 +婊 +婑 +婕 +婚 +婢 +婦 +婧 +婪 +婭 +婯 +婷 +婺 +婻 +婼 +婿 +媃 +媄 +媊 +媐 +媒 +媓 +媖 +媗 +媚 +媛 +媜 +媞 +媧 +媭 +媯 +媲 +媳 +媺 +媼 +媽 +媾 +媿 +嫁 +嫂 +嫄 +嫈 +嫉 +嫌 +嫖 +嫘 +嫚 +嫡 +嫣 +嫦 +嫩 +嫪 +嫲 +嫳 +嫵 +嫺 +嫻 +嬅 +嬈 +嬉 +嬋 +嬌 +嬗 +嬛 +嬝 +嬡 +嬤 +嬨 +嬪 +嬬 +嬭 +嬰 +嬴 +嬸 +嬾 +嬿 +孀 +孃 +孆 +孋 +孌 +子 +孑 +孔 +孕 +孖 +字 +存 +孚 +孛 +孜 +孝 +孟 +孢 +季 +孤 +孩 +孫 +孬 +孮 +孰 +孳 +孵 +學 +孺 +孻 +孽 +孿 +宀 +它 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宓 +宕 +宗 +官 +宙 +定 +宛 +宜 +実 +客 +宣 +室 +宥 +宦 +宧 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寀 +寁 +寂 +寄 +寅 +密 +寇 +寈 +寊 +富 +寐 +寒 +寓 +寔 +寕 +寖 +寗 +寘 +寛 +寜 +寞 +察 +寡 +寢 +寤 +寥 +實 +寧 +寨 +審 +寫 +寬 +寮 +寯 +寰 +寳 +寵 +寶 +寸 +寺 +対 +封 +専 +尃 +射 +將 +專 +尉 +尊 +尋 +對 +導 +小 +尐 +少 +尓 +尕 +尖 +尗 +尙 +尚 +尢 +尤 +尨 +尪 +尬 +就 +尷 +尹 +尺 +尻 +尼 +尾 +尿 +局 +屁 +屄 +居 +屆 +屇 +屈 +屋 +屌 +屍 +屎 +屏 +屐 +屑 +屓 +展 +屚 +屜 +屠 +屢 +層 +履 +屬 +屭 +屯 +山 +屹 +屺 +屻 +岀 +岈 +岌 +岐 +岑 +岔 +岡 +岢 +岣 +岧 +岩 +岪 +岫 +岬 +岰 +岱 +岳 +岵 +岷 +岸 +岻 +峁 +峅 +峇 +峋 +峍 +峒 +峘 +峙 +峚 +峠 +峨 +峩 +峪 +峭 +峯 +峰 +峴 +島 +峻 +峼 +峽 +崁 +崆 +崇 +崈 +崋 +崍 +崎 +崐 +崑 +崒 +崔 +崖 +崗 +崘 +崙 +崚 +崛 +崞 +崟 +崠 +崢 +崤 +崧 +崩 +崬 +崮 +崱 +崴 +崵 +崶 +崽 +嵇 +嵊 +嵋 +嵌 +嵎 +嵐 +嵒 +嵕 +嵖 +嵗 +嵙 +嵛 +嵜 +嵨 +嵩 +嵬 +嵮 +嵯 +嵰 +嵴 +嵻 +嵿 +嶁 +嶂 +嶃 +嶄 +嶇 +嶋 +嶌 +嶍 +嶒 +嶔 +嶗 +嶝 +嶠 +嶢 +嶦 +嶧 +嶪 +嶬 +嶰 +嶲 +嶴 +嶷 +嶸 +嶺 +嶼 +嶽 +巂 +巄 +巆 +巋 +巌 +巍 +巎 +巑 +巒 +巔 +巖 +巘 +巛 +川 +州 +巡 +巢 +工 +左 +巧 +巨 +巫 +差 +巰 +己 +已 +巳 +巴 +巶 +巷 +巻 +巽 +巾 +巿 +市 +布 +帆 +希 +帑 +帔 +帕 +帖 +帘 +帙 +帚 +帛 +帝 +帡 +帢 +帥 +師 +席 +帯 +帰 +帳 +帶 +帷 +常 +帽 +幀 +幃 +幄 +幅 +幌 +幔 +幕 +幗 +幚 +幛 +幟 +幡 +幢 +幣 +幪 +幫 +干 +平 +年 +幵 +幷 +幸 +幹 +幺 +幻 +幼 +幽 +幾 +庀 +庁 +広 +庇 +床 +序 +底 +庖 +店 +庚 +府 +庠 +庢 +庥 +度 +座 +庫 +庭 +庲 +庵 +庶 +康 +庸 +庹 +庼 +庾 +廁 +廂 +廄 +廆 +廈 +廉 +廊 +廋 +廌 +廍 +廑 +廓 +廔 +廕 +廖 +廙 +廚 +廝 +廞 +廟 +廠 +廡 +廢 +廣 +廧 +廨 +廩 +廬 +廰 +廱 +廳 +延 +廷 +廸 +建 +廻 +廼 +廿 +弁 +弄 +弅 +弇 +弈 +弉 +弊 +弋 +弍 +式 +弐 +弒 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弢 +弦 +弧 +弨 +弩 +弭 +弱 +張 +強 +弸 +弼 +弾 +彀 +彄 +彅 +彆 +彈 +彊 +彌 +彎 +彐 +彔 +彖 +彗 +彘 +彙 +彜 +彞 +彠 +彡 +形 +彣 +彤 +彥 +彧 +彩 +彪 +彫 +彬 +彭 +彰 +影 +彳 +彷 +役 +彼 +彿 +往 +征 +徂 +待 +徇 +很 +徉 +徊 +律 +後 +徐 +徑 +徒 +得 +徘 +徙 +徜 +從 +徠 +御 +徧 +徨 +復 +循 +徫 +徬 +徭 +微 +徳 +徴 +徵 +德 +徸 +徹 +徽 +心 +忄 +必 +忉 +忌 +忍 +忐 +忑 +忒 +志 +忘 +忙 +応 +忝 +忞 +忠 +快 +忬 +忯 +忱 +忳 +念 +忻 +忽 +忿 +怍 +怎 +怒 +怕 +怖 +怙 +怛 +思 +怠 +怡 +急 +怦 +性 +怨 +怪 +怯 +怵 +恁 +恂 +恃 +恆 +恊 +恍 +恐 +恕 +恙 +恢 +恣 +恤 +恥 +恨 +恩 +恪 +恬 +恭 +息 +恰 +恵 +恿 +悄 +悅 +悆 +悉 +悌 +悍 +悔 +悖 +悚 +悛 +悝 +悞 +悟 +悠 +患 +悧 +您 +悪 +悰 +悲 +悳 +悵 +悶 +悸 +悼 +情 +惆 +惇 +惑 +惔 +惕 +惘 +惚 +惜 +惟 +惠 +惡 +惣 +惦 +惰 +惱 +惲 +想 +惶 +惹 +惺 +愁 +愃 +愆 +愈 +愉 +愍 +意 +愐 +愒 +愔 +愕 +愚 +愛 +愜 +感 +愣 +愧 +愨 +愫 +愭 +愴 
+愷 +愼 +愾 +愿 +慄 +慈 +態 +慌 +慎 +慕 +慘 +慚 +慜 +慟 +慢 +慣 +慥 +慧 +慨 +慮 +慰 +慳 +慵 +慶 +慷 +慾 +憂 +憊 +憋 +憍 +憎 +憐 +憑 +憓 +憕 +憙 +憚 +憤 +憧 +憨 +憩 +憫 +憬 +憲 +憶 +憺 +憻 +憾 +懂 +懃 +懇 +懈 +應 +懋 +懌 +懍 +懐 +懣 +懦 +懮 +懲 +懵 +懶 +懷 +懸 +懺 +懼 +懽 +懾 +懿 +戀 +戇 +戈 +戊 +戌 +戍 +戎 +成 +我 +戒 +戔 +戕 +或 +戙 +戚 +戛 +戟 +戡 +戢 +戥 +戦 +戩 +截 +戮 +戰 +戱 +戲 +戳 +戴 +戶 +戸 +戻 +戽 +戾 +房 +所 +扁 +扆 +扇 +扈 +扉 +手 +扌 +才 +扎 +扒 +打 +扔 +托 +扙 +扛 +扞 +扣 +扥 +扦 +扭 +扮 +扯 +扳 +扶 +批 +扼 +找 +承 +技 +抃 +抄 +抇 +抉 +把 +抑 +抒 +抓 +投 +抖 +抗 +折 +抦 +披 +抬 +抱 +抵 +抹 +抻 +押 +抽 +抿 +拂 +拆 +拇 +拈 +拉 +拋 +拌 +拍 +拎 +拏 +拐 +拒 +拓 +拔 +拖 +拗 +拘 +拙 +拚 +招 +拜 +拝 +拡 +括 +拭 +拮 +拯 +拱 +拳 +拴 +拷 +拺 +拼 +拽 +拾 +拿 +持 +指 +按 +挎 +挑 +挖 +挙 +挨 +挪 +挫 +振 +挲 +挵 +挹 +挺 +挻 +挾 +捂 +捆 +捉 +捌 +捍 +捎 +捏 +捐 +捒 +捕 +捜 +捦 +捧 +捨 +捩 +捫 +捭 +捱 +捲 +捶 +捷 +捺 +捻 +掀 +掂 +掃 +掄 +掇 +授 +掉 +掌 +掏 +掐 +排 +掖 +掘 +掙 +掛 +掞 +掟 +掠 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掰 +掾 +揀 +揄 +揆 +揉 +揍 +描 +提 +插 +揔 +揖 +揚 +換 +握 +揪 +揭 +揮 +援 +揸 +揺 +損 +搏 +搐 +搓 +搔 +搖 +搗 +搜 +搞 +搠 +搢 +搪 +搬 +搭 +搳 +搴 +搵 +搶 +搽 +搾 +摂 +摒 +摔 +摘 +摜 +摞 +摟 +摠 +摧 +摩 +摭 +摯 +摳 +摴 +摵 +摶 +摸 +摹 +摺 +摻 +摽 +撃 +撇 +撈 +撐 +撒 +撓 +撕 +撖 +撙 +撚 +撞 +撣 +撤 +撥 +撩 +撫 +撬 +播 +撮 +撰 +撲 +撳 +撻 +撼 +撾 +撿 +擀 +擁 +擂 +擅 +擇 +擊 +擋 +操 +擎 +擒 +擔 +擘 +據 +擠 +擢 +擥 +擦 +擬 +擯 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攀 +攏 +攔 +攖 +攘 +攜 +攝 +攞 +攢 +攣 +攤 +攪 +攫 +攬 +支 +攴 +攵 +收 +攷 +攸 +改 +攻 +攽 +放 +政 +故 +效 +敍 +敎 +敏 +救 +敔 +敕 +敖 +敗 +敘 +教 +敝 +敞 +敟 +敢 +散 +敦 +敫 +敬 +敭 +敲 +整 +敵 +敷 +數 +敻 +敾 +斂 +斃 +文 +斌 +斎 +斐 +斑 +斕 +斖 +斗 +料 +斛 +斜 +斝 +斟 +斡 +斤 +斥 +斧 +斬 +斯 +新 +斷 +方 +於 +施 +斿 +旁 +旂 +旃 +旄 +旅 +旉 +旋 +旌 +旎 +族 +旖 +旗 +旙 +旛 +旡 +既 +日 +旦 +旨 +早 +旬 +旭 +旱 +旲 +旳 +旺 +旻 +旼 +旽 +旾 +旿 +昀 +昂 +昃 +昆 +昇 +昉 +昊 +昌 +昍 +明 +昏 +昐 +易 +昔 +昕 +昚 +昛 +昜 +昝 +昞 +星 +映 +昡 +昣 +昤 +春 +昧 +昨 +昪 +昫 +昭 +是 +昰 +昱 +昴 +昵 +昶 +昺 +晁 +時 +晃 +晈 +晉 +晊 +晏 +晗 +晙 +晚 +晛 +晝 +晞 +晟 +晤 +晦 +晧 +晨 +晩 +晪 +晫 +晭 +普 +景 +晰 +晳 +晴 +晶 +晷 +晸 +智 +晾 +暃 +暄 +暅 +暇 +暈 +暉 +暊 +暌 +暎 +暏 +暐 +暑 +暕 +暖 +暗 +暘 +暝 +暟 +暠 +暢 +暦 +暨 +暫 +暮 +暱 +暲 +暴 +暸 +暹 +暻 +暾 +曄 +曅 +曆 +曇 +曉 +曌 +曔 +曖 +曙 +曜 +曝 +曠 +曦 +曧 +曨 +曩 +曬 +曮 +曰 +曲 +曳 +更 +曶 +曷 +書 +曹 +曺 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朊 +朋 +服 +朏 +朐 +朓 +朔 +朕 +朖 +朗 +望 +朝 +期 +朦 +朧 +木 +未 +末 +本 +札 +朱 +朴 +朵 +朶 +朽 +朿 +杁 +杉 +杋 +杌 +李 +杏 +材 +村 +杓 +杖 +杙 +杜 +杞 +束 +杠 +杣 +杤 +杧 +杬 +杭 +杯 +東 +杲 +杳 +杴 +杵 +杷 +杻 +杼 +松 +板 +极 +枇 +枉 +枋 +枏 +析 +枕 +枖 +林 +枚 +枛 +果 +枝 +枠 +枡 +枯 +枰 +枱 +枲 +枳 +架 +枷 +枸 +枹 +枼 +柁 +柃 +柄 +柉 +柊 +柎 +柏 +某 +柑 +柒 +染 +柔 +柘 +柚 +柜 +柝 +柞 +柟 +查 +柩 +柬 +柯 +柰 +柱 +柳 +柴 +柵 +柶 +柷 +査 +柾 +柿 +栃 +栄 +栐 +栒 +栓 +栜 +栝 +栞 +校 +栢 +栨 +栩 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桀 +桁 +桂 +桃 +桄 +桅 +框 +案 +桉 +桌 +桎 +桐 +桑 +桓 +桔 +桕 +桖 +桙 +桜 +桝 +桫 +桱 +桲 +桴 +桶 +桷 +桼 +桿 +梀 +梁 +梂 +梃 +梅 +梆 +梉 +梏 +梓 +梔 +梗 +梘 +條 +梟 +梠 +梢 +梣 +梧 +梨 +梫 +梭 +梯 +械 +梱 +梳 +梵 +梶 +梽 +棄 +棆 +棉 +棋 +棍 +棐 +棒 +棓 +棕 +棖 +棗 +棘 +棚 +棛 +棟 +棠 +棡 +棣 +棧 +棨 +棩 +棪 +棫 +森 +棱 +棲 +棵 +棶 +棹 +棺 +棻 +棼 +棽 +椅 +椆 +椇 +椋 +植 +椎 +椏 +椒 +椙 +椥 +椪 +椰 +椲 +椴 +椵 +椹 +椽 +椿 +楂 +楊 +楓 +楔 +楗 +楙 +楚 +楝 +楞 +楠 +楡 +楢 +楣 +楤 +楦 +楧 +楨 +楫 +業 +楮 +楯 +楳 +極 +楷 +楸 +楹 +楽 +楿 +概 +榆 +榊 +榍 +榎 +榑 +榔 +榕 +榖 +榗 +榘 +榛 +榜 +榞 +榢 +榣 +榤 +榦 +榧 +榨 +榫 +榭 +榮 +榲 +榴 +榷 +榻 +榿 +槀 +槁 +槃 +槊 +構 +槌 +槍 +槎 +槐 +槓 +槔 +槗 +様 +槙 +槤 +槩 +槭 +槰 +槱 +槲 +槳 +槺 +槻 +槼 +槽 +槿 +樀 +樁 +樂 +樅 +樆 +樊 +樋 +樑 +樓 +樗 +樘 +標 +樞 +樟 +模 +樣 +樨 +権 +樫 +樵 +樸 +樹 +樺 +樻 +樽 +樾 +橄 +橇 +橈 +橋 +橐 +橒 +橓 +橘 +橙 +橚 +機 +橡 +橢 +橪 +橫 +橿 +檀 +檄 +檇 +檉 +檊 +檎 +檐 +檔 +檗 +檜 +檞 +檠 +檡 +檢 +檣 +檦 +檨 +檫 +檬 +檯 +檳 +檵 +檸 +檻 +檽 +櫂 +櫃 +櫆 +櫈 +櫓 +櫚 +櫛 +櫞 +櫟 +櫥 +櫨 +櫪 +櫱 +櫸 +櫻 +櫾 +櫿 +欄 +欉 +權 +欏 +欒 +欖 +欞 +欠 +次 +欣 +欥 +欲 +欸 +欹 +欺 +欽 +款 +歆 +歇 +歉 +歊 +歌 +歎 +歐 +歓 +歙 +歛 +歡 +止 +正 +此 +步 +武 +歧 +歩 +歪 +歲 +歳 +歴 +歷 +歸 +歹 +死 +歿 +殂 +殃 +殄 +殆 +殉 +殊 +殑 +殖 +殘 +殛 +殞 +殟 +殤 +殭 +殮 +殯 +殲 +殳 +段 +殷 +殺 +殻 +殼 +殿 +毀 +毅 +毆 +毉 +毋 +毌 +母 +毎 +每 +毐 +毒 +毓 +比 +毖 +毗 +毘 +毛 +毫 +毬 +毯 +毴 +毸 +毽 +毿 +氂 +氈 +氍 +氏 +氐 +民 +氓 +氖 +気 +氘 +氙 +氚 +氛 +氟 +氣 +氦 +氧 +氨 +氪 +氫 +氬 +氮 +氯 +氰 +水 +氵 +氷 +永 +氹 +氻 +氽 +氾 +汀 +汁 +求 +汊 +汎 +汐 +汕 +汗 +汛 +汜 +汝 +汞 +江 +池 +污 +汧 +汨 +汩 +汪 +汭 +汰 +汲 +汴 +汶 +決 +汽 +汾 +沁 +沂 +沃 +沄 +沅 
+龝 +龠 +龢 +郎 +凉 +﹑ +﹗ +﹝ +﹞ +﹢ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +K +L +M +N +O +P +R +S +T +U +V +W +Y +Z +[ +] +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +r +s +t +u +z +{ +| +} +~ +¥ +𣇉 + diff --git a/batch_running_task/pytorchocr/utils/dict/chinese_cht_dict.txt b/batch_running_task/pytorchocr/utils/dict/chinese_cht_dict.txt new file mode 100644 index 0000000..cc1aa47 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/chinese_cht_dict.txt @@ -0,0 +1,8421 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¥ +® +° +± +² +´ +· +» +É +Ë +Ó +× +Ü +à +á +ä +è +é +ì +í +ò +ó +÷ +ú +ü +ā +ē +ī +ō +ū +ǐ +ǒ +ɔ +ɡ +ʌ +ˋ +Λ +Ο +Φ +Ω +α +β +ε +θ +μ +π +З +И +Й +П +Я +г +— +‖ +‘ +’ +“ +” +• +… +‧ +′ +″ +※ +℃ +№ +™ +Ⅱ +Ⅲ +Ⅳ +← +↑ +→ +↓ +⇋ +∈ +∑ +√ +∞ +∣ +∧ +∩ +∫ +∶ +≈ +≠ +≤ +≥ +⊙ +⊥ +① +② +③ +④ +⑧ +⑴ +⑵ +⑶ +─ +│ +┅ +┌ +├ +█ +▎ +▏ +▕ +■ +□ +▪ +▲ +△ +▼ +◆ +◇ +○ +◎ +● +◥ +★ +☆ +❋ +❤ +  +、 +。 +〇 +〉 +《 +》 +「 +」 +『 +』 +【 +】 +〔 +〕 +〖 +〗 +の +サ +シ +ジ +マ +ㄱ +ㆍ +㎏ +㎡ +㐂 +㐱 +㙟 +㴪 +㸃 +䖝 +䝉 +䰾 +䲁 +一 +丁 +七 +丄 +丈 +三 +上 +下 +丌 +不 +与 +丏 +丐 +丑 +且 +丕 +世 +丘 +丙 +丞 +丟 +両 +並 +丨 +丫 +中 +丰 +串 +丶 +丸 +丹 +主 +丼 +丿 +乂 +乃 +久 +么 +之 +乍 +乎 +乏 +乒 +乓 +乖 +乗 +乘 +乙 +乚 +乜 +九 +乞 +也 +乩 +乭 +乳 +乸 +乹 +乾 +亀 +亂 +亅 +了 +予 +亊 +事 +二 +亍 +云 +互 +亓 +五 +井 +亘 +些 +亜 +亞 +亟 +亠 +亡 +亢 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +亰 +亳 +亶 +亹 +人 +亻 +什 +仁 +仂 +仃 +仄 +仇 +仉 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +仛 +仝 +仞 +仟 +仡 +代 +令 +以 +仨 +仫 +仮 +仰 +仲 +仳 +仵 +件 +仺 +任 +仼 +份 +仿 +企 +伃 +伈 +伉 +伊 +伋 +伍 +伎 +伏 +伐 +休 +伕 +伙 +伝 +伢 +伯 +估 +伱 +伴 +伶 +伷 +伸 +伺 +似 +伽 +伾 +佀 +佁 +佃 +但 +佇 +佈 +佉 +佋 +位 +低 +住 +佐 +佑 +体 +佔 +何 +佗 +佘 +余 +佚 +佛 +作 +佝 +佞 +佟 +你 +佣 +佤 +佧 +佩 +佬 +佯 +佰 +佳 +併 +佶 +佹 +佺 +佼 +佾 +使 +侁 +侃 +侄 +侅 +來 +侈 +侊 +例 +侍 +侏 +侑 +侖 +侗 +侘 +侚 +供 +依 +侞 +価 +侮 +侯 +侵 +侶 +侷 +侹 +便 +俁 +係 +促 +俄 +俅 +俊 +俋 +俌 +俍 +俎 +俏 +俐 +俑 +俗 +俘 +俚 +俛 +保 +俞 +俟 +俠 +信 +俬 +修 +俯 +俱 +俳 +俴 +俵 +俶 +俸 +俺 +俽 +俾 +倆 +倈 +倉 +個 +倌 +倍 +們 +倒 +倓 +倔 +倖 +倗 +倘 +候 +倚 +倜 +倞 +借 +倡 +倢 +倣 +値 +倦 +倧 +倩 +倪 +倫 +倬 +倭 +倮 +倻 +值 +偁 +偃 +假 +偈 +偉 +偊 +偌 +偍 +偎 +偏 +偓 +偕 +做 +停 +健 +偪 +偲 +側 +偵 +偶 +偷 +偸 +偽 +傀 +傃 +傅 +傈 +傉 +傍 +傑 +傒 +傕 +傖 +傘 +備 +傜 +傢 +傣 +催 +傭 +傲 +傳 +債 +傷 +傻 +傾 +僅 +僉 +僊 +働 +像 +僑 +僔 +僕 +僖 +僙 +僚 +僜 +僡 +僧 +僩 +僭 +僮 +僰 +僱 +僳 +僴 +僵 +價 +僻 +儀 +儁 +儂 +億 +儆 +儇 +儈 +儉 +儋 +儐 +儒 +儔 +儕 +儘 +儚 +儞 +償 +儡 +儥 +儦 +優 +儫 +儱 +儲 +儷 +儺 +儻 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +児 +兒 +兔 +兕 +兗 +兜 +入 +內 +全 +兩 +兪 +八 +公 +六 +兮 +共 +兵 +其 +具 +典 +兼 +兿 +冀 +冂 +円 +冇 +冉 +冊 +再 +冏 +冑 +冒 +冕 +冖 +冗 +冚 +冠 +冢 +冤 +冥 +冧 +冨 +冪 +冫 +冬 +冮 +冰 +冴 +冶 +冷 +冼 +冽 +凃 +凄 +准 +凈 +凋 +凌 +凍 +凖 +凜 +凝 +凞 +几 +凡 +処 +凪 +凬 +凰 +凱 +凳 +凵 +凶 +凸 +凹 +出 +函 +刀 +刁 +刂 +刃 +刄 +分 +切 +刈 +刊 +刎 +刑 +划 +列 +初 +判 +別 +刦 +刧 +刨 +利 +刪 +刮 +到 +制 +刷 +券 +刺 +刻 +刼 +剁 +剃 +則 +削 +剋 +剌 +前 +剎 +剏 +剔 +剖 +剛 +剝 +剡 +剣 +剩 +剪 +剮 +副 +割 +創 +剿 +劃 +劄 +劇 +劈 +劉 +劊 +劌 +劍 +劑 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劬 +劭 +劵 +効 +劼 +劾 +勁 +勃 +勅 +勇 +勉 +勐 +勑 +勒 +勔 +動 +勖 +勗 +勘 +務 +勛 +勝 +勞 +募 +勢 +勣 +勤 +勦 +勰 +勱 +勲 +勳 +勵 +勷 +勸 +勺 +勻 +勾 +勿 +匂 +匄 +包 +匆 +匈 +匋 +匍 +匏 +匐 +匕 +化 +北 +匙 +匚 +匝 +匠 +匡 +匣 +匪 +匯 +匱 +匸 +匹 +匾 +匿 +區 +十 +千 +卅 +升 +午 +卉 +半 +卋 +卍 +卐 +卑 +卒 +卓 +協 +南 +博 +卜 +卞 +卟 +占 +卡 +卣 +卦 +卧 +卩 +卬 +卮 +卯 +印 +危 +卲 +即 +卵 +卷 +卸 +卹 +卺 +卻 +卽 +卿 +厄 +厓 +厔 +厙 +厚 +厝 +原 +厥 +厭 +厰 +厲 +厴 +厶 +去 +參 +叄 +又 +叉 +及 +友 +反 +収 +叔 +叕 +取 +受 +叛 +叟 +叡 +叢 +口 +古 +句 +另 +叨 +叩 +只 +叫 +召 +叭 +叮 +可 +台 +叱 +史 +右 +叵 +司 +叻 +叼 +吁 +吃 +各 +吆 +合 +吉 +吊 +吋 +同 +名 +后 +吏 +吐 +向 +吒 +吔 +吖 +君 +吝 +吞 +吟 +吠 +吡 +吥 +否 +吧 +吩 +含 +吮 +吱 +吲 +吳 +吵 +吶 +吸 +吹 +吻 +吼 +吾 +呀 +呂 +呃 
+呈 +呉 +告 +呋 +呎 +呢 +呤 +呦 +周 +呱 +味 +呵 +呷 +呸 +呼 +命 +呾 +咀 +咁 +咂 +咄 +咅 +咆 +咋 +和 +咎 +咑 +咒 +咔 +咕 +咖 +咗 +咘 +咚 +咟 +咤 +咥 +咧 +咨 +咩 +咪 +咫 +咬 +咭 +咯 +咱 +咲 +咳 +咸 +咻 +咼 +咽 +咾 +咿 +哀 +品 +哂 +哄 +哆 +哇 +哈 +哉 +哌 +哎 +哏 +哐 +哖 +哚 +哞 +員 +哥 +哦 +哨 +哩 +哪 +哭 +哮 +哱 +哲 +哺 +哼 +唃 +唄 +唆 +唇 +唉 +唏 +唐 +唑 +唔 +唘 +唧 +唫 +唬 +唭 +售 +唯 +唱 +唳 +唵 +唷 +唸 +唻 +唾 +啁 +啃 +啄 +商 +啉 +啊 +啍 +問 +啓 +啖 +啚 +啜 +啞 +啟 +啡 +啣 +啤 +啥 +啦 +啪 +啫 +啯 +啰 +啱 +啲 +啵 +啶 +啷 +啻 +啼 +啾 +喀 +喂 +喃 +善 +喆 +喇 +喈 +喉 +喊 +喋 +喏 +喔 +喘 +喙 +喚 +喜 +喝 +喢 +喦 +喧 +喪 +喫 +喬 +單 +喰 +喱 +喲 +喳 +喵 +喹 +喻 +喼 +嗄 +嗅 +嗆 +嗇 +嗊 +嗎 +嗑 +嗒 +嗓 +嗔 +嗖 +嗚 +嗜 +嗝 +嗞 +嗡 +嗢 +嗣 +嗦 +嗨 +嗩 +嗪 +嗮 +嗯 +嗲 +嗶 +嗹 +嗽 +嘀 +嘅 +嘆 +嘉 +嘌 +嘍 +嘎 +嘏 +嘔 +嘗 +嘚 +嘛 +嘜 +嘞 +嘟 +嘢 +嘣 +嘥 +嘧 +嘩 +嘬 +嘮 +嘯 +嘰 +嘲 +嘴 +嘶 +嘸 +嘹 +嘻 +嘿 +噁 +噌 +噍 +噏 +噓 +噗 +噝 +噠 +噢 +噤 +噥 +噦 +器 +噩 +噪 +噬 +噯 +噰 +噲 +噴 +噶 +噸 +噹 +噻 +嚇 +嚈 +嚎 +嚏 +嚐 +嚒 +嚓 +嚕 +嚗 +嚙 +嚞 +嚟 +嚤 +嚦 +嚧 +嚨 +嚩 +嚮 +嚳 +嚴 +嚶 +嚷 +嚼 +嚿 +囀 +囂 +囃 +囉 +囊 +囍 +囑 +囒 +囓 +囗 +囚 +四 +囝 +回 +因 +囡 +団 +囤 +囧 +囪 +囮 +囯 +困 +囲 +図 +囶 +囷 +囹 +固 +囿 +圂 +圃 +圄 +圈 +圉 +國 +圍 +圏 +園 +圓 +圖 +圗 +團 +圜 +土 +圧 +在 +圩 +圪 +圭 +圯 +地 +圳 +圻 +圾 +址 +均 +坊 +坋 +坌 +坍 +坎 +坐 +坑 +坖 +坡 +坣 +坤 +坦 +坨 +坩 +坪 +坫 +坬 +坭 +坮 +坯 +坳 +坵 +坶 +坷 +坻 +垂 +垃 +垈 +型 +垍 +垓 +垕 +垚 +垛 +垞 +垟 +垠 +垢 +垣 +垮 +垯 +垰 +垵 +垸 +垻 +垿 +埃 +埅 +埇 +埈 +埋 +埌 +城 +埏 +埒 +埔 +埕 +埗 +埜 +域 +埠 +埡 +埤 +埧 +埨 +埪 +埭 +埮 +埴 +埵 +執 +培 +基 +埻 +埼 +堀 +堂 +堃 +堅 +堆 +堇 +堈 +堉 +堊 +堍 +堖 +堝 +堡 +堤 +堦 +堪 +堮 +堯 +堰 +報 +場 +堵 +堷 +堺 +塀 +塅 +塆 +塊 +塋 +塌 +塍 +塏 +塑 +塔 +塗 +塘 +塙 +塜 +塞 +塡 +塢 +塤 +塨 +塩 +填 +塬 +塭 +塰 +塱 +塲 +塵 +塹 +塽 +塾 +墀 +境 +墅 +墉 +墊 +墎 +墓 +増 +墘 +墜 +增 +墟 +墡 +墣 +墨 +墩 +墫 +墬 +墮 +墱 +墳 +墺 +墼 +墾 +壁 +壄 +壆 +壇 +壋 +壌 +壎 +壐 +壑 +壓 +壔 +壕 +壘 +壙 +壞 +壟 +壠 +壢 +壤 +壩 +士 +壬 +壯 +壱 +壴 +壹 +壺 +壽 +夀 +夆 +変 +夊 +夋 +夌 +夏 +夔 +夕 +外 +夙 +多 +夜 +夠 +夢 +夤 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +夯 +失 +夷 +夾 +奀 +奄 +奇 +奈 +奉 +奎 +奏 +奐 +契 +奓 +奔 +奕 +套 +奘 +奚 +奠 +奢 +奣 +奧 +奩 +奪 +奫 +奭 +奮 +女 +奴 +奶 +她 +好 +妀 +妁 +如 +妃 +妄 +妊 +妍 +妏 +妑 +妒 +妓 +妖 +妙 +妝 +妞 +妠 +妤 +妥 +妧 +妨 +妭 +妮 +妯 +妲 +妳 +妸 +妹 +妺 +妻 +妾 +姀 +姁 +姃 +姆 +姈 +姉 +姊 +始 +姌 +姍 +姐 +姑 +姒 +姓 +委 +姚 +姜 +姝 +姣 +姥 +姦 +姨 +姪 +姫 +姬 +姮 +姵 +姶 +姸 +姻 +姿 +威 +娃 +娉 +娋 +娌 +娍 +娎 +娑 +娖 +娘 +娛 +娜 +娟 +娠 +娣 +娥 +娩 +娫 +娳 +娶 +娸 +娼 +娽 +婀 +婁 +婆 +婉 +婊 +婑 +婕 +婚 +婢 +婦 +婧 +婪 +婭 +婯 +婷 +婺 +婻 +婼 +婿 +媃 +媄 +媊 +媐 +媒 +媓 +媖 +媗 +媚 +媛 +媜 +媞 +媧 +媭 +媯 +媲 +媳 +媺 +媼 +媽 +媾 +媿 +嫁 +嫂 +嫄 +嫈 +嫉 +嫌 +嫖 +嫘 +嫚 +嫡 +嫣 +嫦 +嫩 +嫪 +嫲 +嫳 +嫵 +嫺 +嫻 +嬅 +嬈 +嬉 +嬋 +嬌 +嬗 +嬛 +嬝 +嬡 +嬤 +嬨 +嬪 +嬬 +嬭 +嬰 +嬴 +嬸 +嬾 +嬿 +孀 +孃 +孆 +孋 +孌 +子 +孑 +孔 +孕 +孖 +字 +存 +孚 +孛 +孜 +孝 +孟 +孢 +季 +孤 +孩 +孫 +孬 +孮 +孰 +孳 +孵 +學 +孺 +孻 +孽 +孿 +宀 +它 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宓 +宕 +宗 +官 +宙 +定 +宛 +宜 +実 +客 +宣 +室 +宥 +宦 +宧 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寀 +寁 +寂 +寄 +寅 +密 +寇 +寈 +寊 +富 +寐 +寒 +寓 +寔 +寕 +寖 +寗 +寘 +寛 +寜 +寞 +察 +寡 +寢 +寤 +寥 +實 +寧 +寨 +審 +寫 +寬 +寮 +寯 +寰 +寳 +寵 +寶 +寸 +寺 +対 +封 +専 +尃 +射 +將 +專 +尉 +尊 +尋 +對 +導 +小 +尐 +少 +尓 +尕 +尖 +尗 +尙 +尚 +尢 +尤 +尨 +尪 +尬 +就 +尷 +尹 +尺 +尻 +尼 +尾 +尿 +局 +屁 +屄 +居 +屆 +屇 +屈 +屋 +屌 +屍 +屎 +屏 +屐 +屑 +屓 +展 +屚 +屜 +屠 +屢 +層 +履 +屬 +屭 +屯 +山 +屹 +屺 +屻 +岀 +岈 +岌 +岐 +岑 +岔 +岡 +岢 +岣 +岧 +岩 +岪 +岫 +岬 +岰 +岱 +岳 +岵 +岷 +岸 +岻 +峁 +峅 +峇 +峋 +峍 +峒 +峘 +峙 +峚 +峠 +峨 +峩 +峪 +峭 +峯 +峰 +峴 +島 +峻 +峼 +峽 +崁 +崆 +崇 +崈 +崋 +崍 +崎 +崐 +崑 +崒 +崔 +崖 +崗 +崘 +崙 +崚 +崛 +崞 +崟 +崠 +崢 +崤 +崧 +崩 +崬 +崮 +崱 +崴 +崵 +崶 +崽 +嵇 +嵊 +嵋 +嵌 +嵎 +嵐 +嵒 +嵕 +嵖 +嵗 +嵙 +嵛 +嵜 +嵨 +嵩 +嵬 +嵮 +嵯 +嵰 +嵴 +嵻 +嵿 +嶁 +嶂 +嶃 +嶄 +嶇 +嶋 +嶌 +嶍 +嶒 +嶔 +嶗 +嶝 +嶠 +嶢 +嶦 +嶧 +嶪 +嶬 +嶰 +嶲 +嶴 +嶷 +嶸 +嶺 +嶼 +嶽 +巂 +巄 +巆 +巋 +巌 +巍 +巎 +巑 +巒 +巔 +巖 +巘 +巛 +川 +州 +巡 +巢 +工 +左 +巧 +巨 +巫 +差 +巰 +己 +已 +巳 +巴 +巶 +巷 +巻 +巽 +巾 +巿 +市 +布 +帆 +希 +帑 +帔 +帕 +帖 +帘 +帙 +帚 +帛 +帝 +帡 +帢 +帥 +師 +席 +帯 +帰 +帳 +帶 +帷 +常 +帽 +幀 +幃 +幄 +幅 +幌 +幔 +幕 +幗 +幚 +幛 +幟 +幡 +幢 +幣 +幪 +幫 +干 +平 +年 +幵 +幷 +幸 +幹 +幺 +幻 +幼 +幽 +幾 +庀 +庁 +広 +庇 +床 +序 +底 +庖 +店 +庚 +府 +庠 +庢 +庥 +度 +座 +庫 +庭 +庲 +庵 +庶 +康 +庸 +庹 +庼 +庾 +廁 +廂 +廄 +廆 +廈 +廉 +廊 +廋 +廌 +廍 +廑 +廓 +廔 +廕 +廖 +廙 +廚 +廝 +廞 +廟 +廠 +廡 +廢 +廣 +廧 +廨 +廩 +廬 +廰 +廱 +廳 +延 +廷 +廸 +建 +廻 +廼 +廿 +弁 +弄 +弅 +弇 +弈 +弉 +弊 +弋 +弍 +式 +弐 +弒 
+弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弢 +弦 +弧 +弨 +弩 +弭 +弱 +張 +強 +弸 +弼 +弾 +彀 +彄 +彅 +彆 +彈 +彊 +彌 +彎 +彐 +彔 +彖 +彗 +彘 +彙 +彜 +彞 +彠 +彡 +形 +彣 +彤 +彥 +彧 +彩 +彪 +彫 +彬 +彭 +彰 +影 +彳 +彷 +役 +彼 +彿 +往 +征 +徂 +待 +徇 +很 +徉 +徊 +律 +後 +徐 +徑 +徒 +得 +徘 +徙 +徜 +從 +徠 +御 +徧 +徨 +復 +循 +徫 +徬 +徭 +微 +徳 +徴 +徵 +德 +徸 +徹 +徽 +心 +忄 +必 +忉 +忌 +忍 +忐 +忑 +忒 +志 +忘 +忙 +応 +忝 +忞 +忠 +快 +忬 +忯 +忱 +忳 +念 +忻 +忽 +忿 +怍 +怎 +怒 +怕 +怖 +怙 +怛 +思 +怠 +怡 +急 +怦 +性 +怨 +怪 +怯 +怵 +恁 +恂 +恃 +恆 +恊 +恍 +恐 +恕 +恙 +恢 +恣 +恤 +恥 +恨 +恩 +恪 +恬 +恭 +息 +恰 +恵 +恿 +悄 +悅 +悆 +悉 +悌 +悍 +悔 +悖 +悚 +悛 +悝 +悞 +悟 +悠 +患 +悧 +您 +悪 +悰 +悲 +悳 +悵 +悶 +悸 +悼 +情 +惆 +惇 +惑 +惔 +惕 +惘 +惚 +惜 +惟 +惠 +惡 +惣 +惦 +惰 +惱 +惲 +想 +惶 +惹 +惺 +愁 +愃 +愆 +愈 +愉 +愍 +意 +愐 +愒 +愔 +愕 +愚 +愛 +愜 +感 +愣 +愧 +愨 +愫 +愭 +愴 +愷 +愼 +愾 +愿 +慄 +慈 +態 +慌 +慎 +慕 +慘 +慚 +慜 +慟 +慢 +慣 +慥 +慧 +慨 +慮 +慰 +慳 +慵 +慶 +慷 +慾 +憂 +憊 +憋 +憍 +憎 +憐 +憑 +憓 +憕 +憙 +憚 +憤 +憧 +憨 +憩 +憫 +憬 +憲 +憶 +憺 +憻 +憾 +懂 +懃 +懇 +懈 +應 +懋 +懌 +懍 +懐 +懣 +懦 +懮 +懲 +懵 +懶 +懷 +懸 +懺 +懼 +懽 +懾 +懿 +戀 +戇 +戈 +戊 +戌 +戍 +戎 +成 +我 +戒 +戔 +戕 +或 +戙 +戚 +戛 +戟 +戡 +戢 +戥 +戦 +戩 +截 +戮 +戰 +戱 +戲 +戳 +戴 +戶 +戸 +戻 +戽 +戾 +房 +所 +扁 +扆 +扇 +扈 +扉 +手 +扌 +才 +扎 +扒 +打 +扔 +托 +扙 +扛 +扞 +扣 +扥 +扦 +扭 +扮 +扯 +扳 +扶 +批 +扼 +找 +承 +技 +抃 +抄 +抇 +抉 +把 +抑 +抒 +抓 +投 +抖 +抗 +折 +抦 +披 +抬 +抱 +抵 +抹 +抻 +押 +抽 +抿 +拂 +拆 +拇 +拈 +拉 +拋 +拌 +拍 +拎 +拏 +拐 +拒 +拓 +拔 +拖 +拗 +拘 +拙 +拚 +招 +拜 +拝 +拡 +括 +拭 +拮 +拯 +拱 +拳 +拴 +拷 +拺 +拼 +拽 +拾 +拿 +持 +指 +按 +挎 +挑 +挖 +挙 +挨 +挪 +挫 +振 +挲 +挵 +挹 +挺 +挻 +挾 +捂 +捆 +捉 +捌 +捍 +捎 +捏 +捐 +捒 +捕 +捜 +捦 +捧 +捨 +捩 +捫 +捭 +捱 +捲 +捶 +捷 +捺 +捻 +掀 +掂 +掃 +掄 +掇 +授 +掉 +掌 +掏 +掐 +排 +掖 +掘 +掙 +掛 +掞 +掟 +掠 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掰 +掾 +揀 +揄 +揆 +揉 +揍 +描 +提 +插 +揔 +揖 +揚 +換 +握 +揪 +揭 +揮 +援 +揸 +揺 +損 +搏 +搐 +搓 +搔 +搖 +搗 +搜 +搞 +搠 +搢 +搪 +搬 +搭 +搳 +搴 +搵 +搶 +搽 +搾 +摂 +摒 +摔 +摘 +摜 +摞 +摟 +摠 +摧 +摩 +摭 +摯 +摳 +摴 +摵 +摶 +摸 +摹 +摺 +摻 +摽 +撃 +撇 +撈 +撐 +撒 +撓 +撕 +撖 +撙 +撚 +撞 +撣 +撤 +撥 +撩 +撫 +撬 +播 +撮 +撰 +撲 +撳 +撻 +撼 +撾 +撿 +擀 +擁 +擂 +擅 +擇 +擊 +擋 +操 +擎 +擒 +擔 +擘 +據 +擠 +擢 +擥 +擦 +擬 +擯 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攀 +攏 +攔 +攖 +攘 +攜 +攝 +攞 +攢 +攣 +攤 +攪 +攫 +攬 +支 +攴 +攵 +收 +攷 +攸 +改 +攻 +攽 +放 +政 +故 +效 +敍 +敎 +敏 +救 +敔 +敕 +敖 +敗 +敘 +教 +敝 +敞 +敟 +敢 +散 +敦 +敫 +敬 +敭 +敲 +整 +敵 +敷 +數 +敻 +敾 +斂 +斃 +文 +斌 +斎 +斐 +斑 +斕 +斖 +斗 +料 +斛 +斜 +斝 +斟 +斡 +斤 +斥 +斧 +斬 +斯 +新 +斷 +方 +於 +施 +斿 +旁 +旂 +旃 +旄 +旅 +旉 +旋 +旌 +旎 +族 +旖 +旗 +旙 +旛 +旡 +既 +日 +旦 +旨 +早 +旬 +旭 +旱 +旲 +旳 +旺 +旻 +旼 +旽 +旾 +旿 +昀 +昂 +昃 +昆 +昇 +昉 +昊 +昌 +昍 +明 +昏 +昐 +易 +昔 +昕 +昚 +昛 +昜 +昝 +昞 +星 +映 +昡 +昣 +昤 +春 +昧 +昨 +昪 +昫 +昭 +是 +昰 +昱 +昴 +昵 +昶 +昺 +晁 +時 +晃 +晈 +晉 +晊 +晏 +晗 +晙 +晚 +晛 +晝 +晞 +晟 +晤 +晦 +晧 +晨 +晩 +晪 +晫 +晭 +普 +景 +晰 +晳 +晴 +晶 +晷 +晸 +智 +晾 +暃 +暄 +暅 +暇 +暈 +暉 +暊 +暌 +暎 +暏 +暐 +暑 +暕 +暖 +暗 +暘 +暝 +暟 +暠 +暢 +暦 +暨 +暫 +暮 +暱 +暲 +暴 +暸 +暹 +暻 +暾 +曄 +曅 +曆 +曇 +曉 +曌 +曔 +曖 +曙 +曜 +曝 +曠 +曦 +曧 +曨 +曩 +曬 +曮 +曰 +曲 +曳 +更 +曶 +曷 +書 +曹 +曺 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朊 +朋 +服 +朏 +朐 +朓 +朔 +朕 +朖 +朗 +望 +朝 +期 +朦 +朧 +木 +未 +末 +本 +札 +朱 +朴 +朵 +朶 +朽 +朿 +杁 +杉 +杋 +杌 +李 +杏 +材 +村 +杓 +杖 +杙 +杜 +杞 +束 +杠 +杣 +杤 +杧 +杬 +杭 +杯 +東 +杲 +杳 +杴 +杵 +杷 +杻 +杼 +松 +板 +极 +枇 +枉 +枋 +枏 +析 +枕 +枖 +林 +枚 +枛 +果 +枝 +枠 +枡 +枯 +枰 +枱 +枲 +枳 +架 +枷 +枸 +枹 +枼 +柁 +柃 +柄 +柉 +柊 +柎 +柏 +某 +柑 +柒 +染 +柔 +柘 +柚 +柜 +柝 +柞 +柟 +查 +柩 +柬 +柯 +柰 +柱 +柳 +柴 +柵 +柶 +柷 +査 +柾 +柿 +栃 +栄 +栐 +栒 +栓 +栜 +栝 +栞 +校 +栢 +栨 +栩 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桀 +桁 +桂 +桃 +桄 +桅 +框 +案 +桉 +桌 +桎 +桐 +桑 +桓 +桔 +桕 +桖 +桙 +桜 +桝 +桫 +桱 +桲 +桴 +桶 +桷 +桼 +桿 +梀 +梁 +梂 +梃 +梅 +梆 +梉 +梏 +梓 +梔 +梗 +梘 +條 +梟 +梠 +梢 +梣 +梧 +梨 +梫 +梭 +梯 +械 +梱 +梳 +梵 +梶 +梽 +棄 +棆 +棉 +棋 +棍 +棐 +棒 +棓 +棕 +棖 +棗 +棘 +棚 +棛 +棟 +棠 +棡 +棣 +棧 +棨 +棩 +棪 +棫 +森 +棱 +棲 +棵 +棶 +棹 +棺 +棻 +棼 +棽 +椅 +椆 +椇 +椋 +植 +椎 +椏 +椒 +椙 +椥 +椪 +椰 +椲 +椴 +椵 +椹 +椽 +椿 +楂 +楊 +楓 +楔 +楗 +楙 +楚 +楝 +楞 +楠 +楡 +楢 +楣 +楤 +楦 +楧 +楨 +楫 +業 +楮 +楯 +楳 +極 +楷 +楸 +楹 +楽 +楿 +概 +榆 +榊 +榍 +榎 +榑 +榔 +榕 +榖 +榗 +榘 +榛 +榜 +榞 +榢 +榣 +榤 +榦 +榧 +榨 +榫 +榭 +榮 +榲 +榴 +榷 +榻 +榿 +槀 +槁 +槃 +槊 +構 +槌 +槍 +槎 +槐 +槓 +槔 +槗 +様 +槙 +槤 +槩 +槭 +槰 +槱 +槲 +槳 +槺 +槻 +槼 +槽 +槿 +樀 +樁 +樂 +樅 +樆 +樊 +樋 +樑 +樓 +樗 +樘 +標 +樞 +樟 +模 +樣 +樨 +権 +樫 
+樵 +樸 +樹 +樺 +樻 +樽 +樾 +橄 +橇 +橈 +橋 +橐 +橒 +橓 +橘 +橙 +橚 +機 +橡 +橢 +橪 +橫 +橿 +檀 +檄 +檇 +檉 +檊 +檎 +檐 +檔 +檗 +檜 +檞 +檠 +檡 +檢 +檣 +檦 +檨 +檫 +檬 +檯 +檳 +檵 +檸 +檻 +檽 +櫂 +櫃 +櫆 +櫈 +櫓 +櫚 +櫛 +櫞 +櫟 +櫥 +櫨 +櫪 +櫱 +櫸 +櫻 +櫾 +櫿 +欄 +欉 +權 +欏 +欒 +欖 +欞 +欠 +次 +欣 +欥 +欲 +欸 +欹 +欺 +欽 +款 +歆 +歇 +歉 +歊 +歌 +歎 +歐 +歓 +歙 +歛 +歡 +止 +正 +此 +步 +武 +歧 +歩 +歪 +歲 +歳 +歴 +歷 +歸 +歹 +死 +歿 +殂 +殃 +殄 +殆 +殉 +殊 +殑 +殖 +殘 +殛 +殞 +殟 +殤 +殭 +殮 +殯 +殲 +殳 +段 +殷 +殺 +殻 +殼 +殿 +毀 +毅 +毆 +毉 +毋 +毌 +母 +毎 +每 +毐 +毒 +毓 +比 +毖 +毗 +毘 +毛 +毫 +毬 +毯 +毴 +毸 +毽 +毿 +氂 +氈 +氍 +氏 +氐 +民 +氓 +氖 +気 +氘 +氙 +氚 +氛 +氟 +氣 +氦 +氧 +氨 +氪 +氫 +氬 +氮 +氯 +氰 +水 +氵 +氷 +永 +氹 +氻 +氽 +氾 +汀 +汁 +求 +汊 +汎 +汐 +汕 +汗 +汛 +汜 +汝 +汞 +江 +池 +污 +汧 +汨 +汩 +汪 +汭 +汰 +汲 +汴 +汶 +決 +汽 +汾 +沁 +沂 +沃 +沄 +沅 +沆 +沇 +沈 +沉 +沌 +沍 +沏 +沐 +沒 +沓 +沔 +沖 +沘 +沙 +沚 +沛 +沜 +沢 +沨 +沫 +沭 +沮 +沯 +沱 +河 +沸 +油 +沺 +治 +沼 +沽 +沾 +沿 +況 +泂 +泄 +泆 +泇 +泉 +泊 +泌 +泐 +泓 +泔 +法 +泖 +泗 +泚 +泛 +泠 +泡 +波 +泣 +泥 +泩 +泫 +泮 +泯 +泰 +泱 +泳 +泵 +洄 +洋 +洌 +洎 +洗 +洙 +洛 +洞 +洢 +洣 +洤 +津 +洨 +洩 +洪 +洮 +洱 +洲 +洳 +洵 +洸 +洹 +洺 +活 +洽 +派 +流 +浄 +浙 +浚 +浛 +浜 +浞 +浟 +浠 +浡 +浣 +浤 +浥 +浦 +浩 +浪 +浮 +浯 +浴 +浵 +海 +浸 +浹 +涅 +涇 +消 +涉 +涌 +涎 +涑 +涓 +涔 +涕 +涙 +涪 +涫 +涮 +涯 +液 +涵 +涸 +涼 +涿 +淄 +淅 +淆 +淇 +淋 +淌 +淍 +淎 +淏 +淑 +淓 +淖 +淘 +淙 +淚 +淛 +淝 +淞 +淠 +淡 +淤 +淥 +淦 +淨 +淩 +淪 +淫 +淬 +淮 +淯 +淰 +深 +淳 +淵 +淶 +混 +淸 +淹 +淺 +添 +淼 +淽 +渃 +清 +済 +渉 +渋 +渕 +渙 +渚 +減 +渝 +渟 +渠 +渡 +渣 +渤 +渥 +渦 +渫 +測 +渭 +港 +渲 +渴 +游 +渺 +渼 +渽 +渾 +湃 +湄 +湉 +湊 +湍 +湓 +湔 +湖 +湘 +湛 +湜 +湞 +湟 +湣 +湥 +湧 +湫 +湮 +湯 +湳 +湴 +湼 +満 +溁 +溇 +溈 +溉 +溋 +溎 +溏 +源 +準 +溙 +溜 +溝 +溟 +溢 +溥 +溦 +溧 +溪 +溫 +溯 +溱 +溲 +溴 +溵 +溶 +溺 +溼 +滀 +滁 +滂 +滄 +滅 +滇 +滈 +滉 +滋 +滌 +滎 +滏 +滑 +滓 +滔 +滕 +滘 +滙 +滝 +滬 +滯 +滲 +滴 +滷 +滸 +滹 +滻 +滽 +滾 +滿 +漁 +漂 +漆 +漇 +漈 +漎 +漏 +漓 +演 +漕 +漚 +漠 +漢 +漣 +漩 +漪 +漫 +漬 +漯 +漱 +漲 +漳 +漴 +漵 +漷 +漸 +漼 +漾 +漿 +潁 +潑 +潔 +潘 +潛 +潞 +潟 +潢 +潤 +潭 +潮 +潯 +潰 +潲 +潺 +潼 +潽 +潾 +潿 +澀 +澁 +澂 +澄 +澆 +澇 +澈 +澉 +澋 +澌 +澍 +澎 +澔 +澗 +澠 +澡 +澣 +澤 +澥 +澧 +澪 +澮 +澯 +澱 +澳 +澶 +澹 +澻 +激 +濁 +濂 +濃 +濉 +濊 +濋 +濕 +濘 +濙 +濛 +濞 +濟 +濠 +濡 +濤 +濫 +濬 +濮 +濯 +濰 +濱 +濲 +濶 +濺 +濼 +濾 +瀁 +瀅 +瀆 +瀉 +瀍 +瀏 +瀑 +瀔 +瀕 +瀘 +瀚 +瀛 +瀝 +瀞 +瀟 +瀠 +瀣 +瀦 +瀧 +瀨 +瀬 +瀰 +瀲 +瀴 +瀶 +瀹 +瀾 +灃 +灊 +灌 +灑 +灘 +灝 +灞 +灡 +灣 +灤 +灧 +火 +灰 +灴 +灸 +灼 +災 +炁 +炅 +炆 +炊 +炎 +炒 +炔 +炕 +炘 +炙 +炟 +炣 +炤 +炫 +炬 +炭 +炮 +炯 +炱 +炲 +炳 +炷 +炸 +為 +炻 +烈 +烉 +烊 +烋 +烏 +烒 +烔 +烘 +烙 +烜 +烝 +烤 +烯 +烱 +烴 +烷 +烹 +烺 +烽 +焃 +焄 +焉 +焊 +焌 +焓 +焗 +焙 +焚 +焜 +焞 +無 +焦 +焯 +焰 +焱 +焴 +然 +焻 +焼 +焿 +煇 +煉 +煊 +煌 +煎 +煐 +煒 +煔 +煕 +煖 +煙 +煚 +煜 +煞 +煠 +煤 +煥 +煦 +照 +煨 +煩 +煬 +煮 +煲 +煳 +煵 +煶 +煸 +煽 +熄 +熅 +熇 +熈 +熊 +熏 +熒 +熔 +熖 +熗 +熘 +熙 +熜 +熟 +熠 +熤 +熥 +熨 +熬 +熯 +熱 +熲 +熳 +熵 +熹 +熺 +熼 +熾 +熿 +燁 +燃 +燄 +燈 +燉 +燊 +燎 +燏 +燐 +燒 +燔 +燕 +燘 +燙 +燚 +燜 +燝 +營 +燥 +燦 +燧 +燫 +燬 +燭 +燮 +燴 +燹 +燻 +燼 +燾 +燿 +爀 +爆 +爌 +爍 +爐 +爔 +爚 +爛 +爝 +爨 +爪 +爬 +爭 +爯 +爰 +爲 +爵 +父 +爸 +爹 +爺 +爻 +爽 +爾 +爿 +牁 +牂 +牆 +片 +版 +牌 +牒 +牕 +牖 +牘 +牙 +牛 +牝 +牟 +牠 +牡 +牢 +牧 +物 +牯 +牲 +特 +牻 +牼 +牽 +犀 +犁 +犂 +犇 +犍 +犎 +犖 +犛 +犢 +犧 +犨 +犬 +犯 +犰 +犴 +犽 +狀 +狂 +狄 +狍 +狎 +狐 +狒 +狓 +狗 +狙 +狛 +狟 +狠 +狡 +狦 +狨 +狩 +狳 +狶 +狷 +狸 +狹 +狻 +狼 +猁 +猄 +猇 +猊 +猗 +猙 +猛 +猜 +猝 +猞 +猢 +猥 +猨 +猩 +猳 +猴 +猶 +猷 +猺 +猻 +猾 +猿 +獁 +獃 +獄 +獅 +獇 +獎 +獏 +獐 +獒 +獠 +獢 +獣 +獨 +獬 +獮 +獯 +獰 +獲 +獴 +獵 +獷 +獸 +獺 +獻 +獼 +獾 +玀 +玄 +玆 +率 +玉 +王 +玎 +玏 +玓 +玕 +玖 +玗 +玘 +玙 +玟 +玠 +玡 +玢 +玥 +玧 +玨 +玩 +玫 +玭 +玲 +玳 +玶 +玷 +玹 +玻 +玾 +珀 +珂 +珅 +珈 +珉 +珊 +珌 +珍 +珎 +珏 +珖 +珙 +珝 +珞 +珠 +珡 +珣 +珤 +珥 +珦 +珧 +珩 +珪 +班 +珮 +珵 +珹 +珺 +珽 +現 +琁 +球 +琄 +琅 +理 +琇 +琉 +琊 +琍 +琎 +琚 +琛 +琡 +琢 +琤 +琥 +琦 +琨 +琪 +琬 +琮 +琯 +琰 +琱 +琳 +琴 +琵 +琶 +琹 +琺 +琿 +瑀 +瑁 +瑂 +瑄 +瑅 +瑆 +瑈 +瑊 +瑋 +瑑 +瑒 +瑕 +瑗 +瑙 +瑚 +瑛 +瑜 +瑝 +瑞 +瑟 +瑠 +瑢 +瑣 +瑤 +瑥 +瑧 +瑨 +瑩 +瑪 +瑭 +瑯 +瑰 +瑱 +瑳 +瑴 +瑺 +瑾 +璀 +璁 +璃 +璄 +璆 +璇 +璈 +璉 +璋 +璌 +璐 +璕 +璘 +璙 +璚 +璜 +璞 +璟 +璠 +璡 +璣 +璥 +璦 +璧 +璨 +璩 +璪 +璫 +璬 +璮 +環 +璱 +璵 +璸 +璹 +璽 +璿 +瓈 +瓊 +瓌 +瓏 +瓑 +瓔 +瓖 +瓘 +瓚 +瓛 +瓜 +瓞 +瓠 +瓢 +瓣 +瓤 +瓦 +瓮 +瓴 +瓶 +瓷 +瓿 +甂 +甄 +甌 +甍 +甑 +甕 +甘 +甙 +甚 +甜 +生 +甡 +產 +産 +甥 +甦 +用 +甩 +甪 +甫 +甬 +甯 +田 +由 +甲 +申 +男 +甸 +甹 +町 +甾 +畀 +畇 +畈 +畊 +畋 +界 +畎 +畏 +畐 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +畦 +畧 +番 +畫 +畬 +畯 +異 +畲 
+畳 +畵 +當 +畷 +畸 +畹 +畿 +疃 +疆 +疇 +疊 +疋 +疌 +疍 +疏 +疑 +疒 +疕 +疙 +疚 +疝 +疣 +疤 +疥 +疫 +疲 +疳 +疵 +疸 +疹 +疼 +疽 +疾 +痂 +病 +症 +痊 +痍 +痔 +痕 +痘 +痙 +痛 +痞 +痟 +痠 +痢 +痣 +痤 +痧 +痩 +痰 +痱 +痲 +痴 +痹 +痺 +痿 +瘀 +瘁 +瘊 +瘋 +瘍 +瘓 +瘙 +瘜 +瘞 +瘟 +瘠 +瘡 +瘢 +瘤 +瘦 +瘧 +瘩 +瘰 +瘴 +瘺 +癀 +療 +癆 +癇 +癌 +癒 +癖 +癘 +癜 +癟 +癡 +癢 +癤 +癥 +癩 +癬 +癭 +癮 +癯 +癰 +癱 +癲 +癸 +発 +登 +發 +白 +百 +皂 +的 +皆 +皇 +皈 +皋 +皎 +皐 +皓 +皖 +皙 +皚 +皛 +皝 +皞 +皮 +皰 +皴 +皷 +皸 +皺 +皿 +盂 +盃 +盅 +盆 +盈 +益 +盋 +盌 +盎 +盒 +盔 +盛 +盜 +盞 +盟 +盡 +監 +盤 +盥 +盦 +盧 +盨 +盩 +盪 +盫 +目 +盯 +盱 +盲 +直 +盷 +相 +盹 +盺 +盼 +盾 +眀 +省 +眉 +看 +県 +眙 +眛 +眜 +眞 +真 +眠 +眥 +眨 +眩 +眭 +眯 +眵 +眶 +眷 +眸 +眺 +眼 +眾 +着 +睇 +睛 +睜 +睞 +睡 +睢 +督 +睥 +睦 +睨 +睪 +睫 +睭 +睹 +睺 +睽 +睾 +睿 +瞄 +瞅 +瞋 +瞌 +瞎 +瞑 +瞓 +瞞 +瞢 +瞥 +瞧 +瞪 +瞫 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞽 +瞿 +矇 +矍 +矗 +矚 +矛 +矜 +矞 +矢 +矣 +知 +矧 +矩 +短 +矮 +矯 +石 +矸 +矽 +砂 +砋 +砌 +砍 +砒 +研 +砝 +砢 +砥 +砦 +砧 +砩 +砫 +砭 +砮 +砯 +砰 +砲 +砳 +破 +砵 +砷 +砸 +砼 +硂 +硃 +硅 +硇 +硏 +硐 +硒 +硓 +硚 +硜 +硝 +硤 +硨 +硫 +硬 +硭 +硯 +硼 +碁 +碇 +碉 +碌 +碎 +碑 +碓 +碕 +碗 +碘 +碚 +碟 +碡 +碣 +碧 +碩 +碪 +碭 +碰 +碲 +碳 +碴 +碶 +碸 +確 +碻 +碼 +碽 +碾 +磁 +磅 +磊 +磋 +磐 +磔 +磕 +磘 +磙 +磚 +磜 +磡 +磨 +磪 +磬 +磯 +磱 +磲 +磵 +磷 +磺 +磻 +磾 +礁 +礄 +礎 +礐 +礑 +礒 +礙 +礠 +礦 +礪 +礫 +礬 +礮 +礱 +礴 +示 +礻 +礽 +社 +祀 +祁 +祂 +祆 +祇 +祈 +祉 +祋 +祏 +祐 +祓 +祕 +祖 +祗 +祙 +祚 +祛 +祜 +祝 +神 +祟 +祠 +祥 +祧 +票 +祭 +祹 +祺 +祼 +祿 +禁 +禃 +禇 +禍 +禎 +福 +禑 +禓 +禔 +禕 +禘 +禛 +禟 +禠 +禤 +禦 +禧 +禨 +禩 +禪 +禮 +禰 +禱 +禵 +禹 +禺 +禼 +禽 +禾 +禿 +秀 +私 +秈 +秉 +秋 +科 +秒 +秕 +秘 +租 +秠 +秣 +秤 +秦 +秧 +秩 +秭 +秳 +秸 +移 +稀 +稅 +稈 +稉 +程 +稍 +稑 +稔 +稗 +稘 +稙 +稚 +稜 +稞 +稟 +稠 +種 +稱 +稲 +稷 +稹 +稺 +稻 +稼 +稽 +稾 +稿 +穀 +穂 +穆 +穈 +穉 +穌 +積 +穎 +穗 +穟 +穠 +穡 +穢 +穣 +穩 +穫 +穰 +穴 +穵 +究 +穹 +空 +穿 +突 +窄 +窅 +窈 +窋 +窒 +窕 +窖 +窗 +窘 +窟 +窠 +窣 +窨 +窩 +窪 +窮 +窯 +窰 +窶 +窺 +窿 +竄 +竅 +竇 +竈 +竊 +立 +竑 +站 +竜 +竟 +章 +竣 +童 +竦 +竩 +竭 +端 +競 +竹 +竺 +竻 +竿 +笄 +笆 +笈 +笏 +笑 +笘 +笙 +笛 +笞 +笠 +笥 +符 +笨 +笩 +笪 +第 +笭 +笮 +笯 +笱 +笳 +笹 +筅 +筆 +等 +筊 +筋 +筌 +筍 +筏 +筐 +筒 +答 +策 +筘 +筠 +筥 +筦 +筧 +筬 +筭 +筱 +筲 +筳 +筵 +筶 +筷 +筻 +箆 +箇 +箋 +箍 +箏 +箐 +箑 +箒 +箔 +箕 +算 +箜 +管 +箬 +箭 +箱 +箴 +箸 +節 +篁 +範 +篆 +篇 +築 +篊 +篋 +篌 +篔 +篙 +篝 +篠 +篡 +篤 +篥 +篦 +篩 +篪 +篭 +篯 +篳 +篷 +簀 +簃 +簇 +簉 +簋 +簍 +簑 +簕 +簗 +簞 +簠 +簡 +簧 +簪 +簫 +簷 +簸 +簹 +簺 +簽 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籙 +籛 +籜 +籝 +籟 +籠 +籣 +籤 +籥 +籪 +籬 +籮 +籲 +米 +籽 +籾 +粄 +粉 +粍 +粑 +粒 +粕 +粗 +粘 +粟 +粢 +粥 +粦 +粧 +粩 +粱 +粲 +粳 +粵 +粹 +粼 +粽 +精 +粿 +糀 +糅 +糊 +糌 +糍 +糎 +糕 +糖 +糙 +糜 +糝 +糞 +糟 +糠 +糢 +糧 +糬 +糯 +糰 +糴 +糶 +糸 +糹 +糺 +系 +糾 +紀 +紂 +約 +紅 +紆 +紇 +紈 +紉 +紊 +紋 +納 +紐 +紑 +紓 +純 +紕 +紗 +紘 +紙 +級 +紛 +紜 +紝 +紞 +素 +紡 +索 +紫 +紮 +累 +細 +紱 +紲 +紳 +紵 +紹 +紺 +紿 +終 +絃 +組 +絆 +経 +絎 +結 +絕 +絛 +絜 +絞 +絡 +絢 +給 +絨 +絪 +絮 +統 +絲 +絳 +絵 +絶 +絹 +絺 +綁 +綃 +綈 +綉 +綎 +綏 +經 +綖 +継 +続 +綜 +綝 +綞 +綠 +綢 +綣 +綦 +綧 +綫 +綬 +維 +綮 +綰 +綱 +網 +綳 +綴 +綸 +綺 +綻 +綽 +綾 +綿 +緁 +緃 +緄 +緈 +緊 +緋 +総 +緑 +緒 +緖 +緘 +線 +緜 +緝 +緞 +締 +緡 +緣 +緤 +編 +緩 +緬 +緯 +緱 +緲 +練 +緹 +緻 +縂 +縄 +縈 +縉 +縊 +縕 +縛 +縝 +縞 +縠 +縡 +縣 +縤 +縫 +縮 +縯 +縱 +縴 +縵 +縷 +縹 +縻 +總 +績 +繁 +繃 +繆 +繇 +繒 +織 +繕 +繖 +繙 +繚 +繞 +繡 +繩 +繪 +繫 +繭 +繰 +繳 +繹 +繻 +繼 +繽 +繾 +纁 +纂 +纈 +續 +纍 +纏 +纓 +纔 +纕 +纖 +纘 +纛 +纜 +缐 +缶 +缸 +缺 +缽 +罃 +罄 +罅 +罈 +罉 +罌 +罍 +罐 +罔 +罕 +罘 +罟 +罡 +罨 +罩 +罪 +置 +罰 +罱 +署 +罳 +罵 +罶 +罷 +罹 +罽 +羂 +羅 +羆 +羈 +羊 +羋 +羌 +美 +羔 +羕 +羗 +羙 +羚 +羞 +羡 +羣 +群 +羥 +羧 +羨 +義 +羯 +羰 +羱 +羲 +羸 +羹 +羽 +羿 +翀 +翁 +翂 +翃 +翅 +翊 +翌 +翎 +翏 +習 +翔 +翕 +翙 +翜 +翟 +翠 +翡 +翥 +翦 +翩 +翬 +翮 +翰 +翱 +翳 +翹 +翻 +翼 +耀 +老 +考 +耄 +者 +耆 +而 +耍 +耎 +耐 +耑 +耒 +耔 +耕 +耗 +耘 +耙 +耜 +耦 +耨 +耬 +耳 +耵 +耶 +耷 +耽 +耿 +聃 +聆 +聊 +聒 +聖 +聘 +聚 +聞 +聟 +聨 +聯 +聰 +聱 +聲 +聳 +聴 +聶 +職 +聽 +聾 +聿 +肄 +肅 +肆 +肇 +肉 +肋 +肌 +肏 +肖 +肘 +肚 +肛 +肜 +肝 +肟 +股 +肢 +肥 +肩 +肪 +肫 +肯 +肱 +育 +肸 +肹 +肺 +肼 +肽 +胂 +胃 +胄 +胅 +胇 +胊 +背 +胍 +胎 +胖 +胗 +胙 +胚 +胛 +胝 +胞 +胡 +胤 +胥 +胬 +胭 +胰 +胱 +胳 +胴 +胸 +胺 +胼 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脒 +脖 +脘 +脛 +脣 +脩 +脫 +脬 +脭 +脯 +脲 +脳 +脷 +脹 +脾 +腆 +腈 +腊 +腋 +腌 +腎 +腐 +腑 +腓 +腔 +腕 +腥 +腦 +腧 +腩 +腫 +腮 +腰 +腱 +腳 +腴 +腸 +腹 +腺 +腿 +膀 +膂 +膈 +膊 +膏 +膚 +膛 +膜 +膝 +膠 +膣 +膥 +膦 +膨 +膩 +膮 +膳 +膺 +膽 +膾 +膿 +臀 +臂 +臃 +臆 +臉 +臊 +臍 +臏 +臘 +臚 +臞 +臟 +臠 +臣 +臧 +臨 +自 +臭 +臯 +至 +致 +臺 +臻 +臼 +臾 +舂 +舅 +與 
+興 +舉 +舊 +舌 +舍 +舎 +舒 +舔 +舖 +舘 +舛 +舜 +舞 +舟 +舢 +舥 +舨 +舩 +航 +舫 +般 +舲 +舵 +舶 +舷 +舸 +船 +舺 +艅 +艇 +艉 +艋 +艎 +艏 +艔 +艘 +艙 +艚 +艦 +艮 +良 +艱 +色 +艶 +艷 +艸 +艽 +艾 +艿 +芃 +芊 +芋 +芍 +芎 +芑 +芒 +芘 +芙 +芛 +芝 +芡 +芥 +芨 +芩 +芪 +芫 +芬 +芭 +芮 +芯 +花 +芳 +芴 +芷 +芸 +芹 +芻 +芽 +芾 +苄 +苅 +苑 +苒 +苓 +苔 +苕 +苗 +苛 +苜 +苝 +苞 +苟 +苡 +苣 +苤 +若 +苦 +苧 +苪 +苫 +苯 +英 +苳 +苴 +苷 +苺 +苻 +苼 +苾 +茀 +茁 +茂 +范 +茄 +茅 +茆 +茇 +茈 +茉 +茌 +茗 +茘 +茚 +茛 +茜 +茝 +茨 +茫 +茬 +茭 +茮 +茯 +茱 +茲 +茴 +茵 +茶 +茷 +茸 +茹 +茺 +茼 +荀 +荃 +荅 +荇 +草 +荊 +荎 +荏 +荒 +荔 +荖 +荘 +荳 +荷 +荸 +荻 +荼 +荽 +莆 +莉 +莊 +莎 +莒 +莓 +莕 +莖 +莘 +莙 +莛 +莜 +莞 +莠 +莢 +莧 +莨 +莩 +莪 +莫 +莽 +莿 +菀 +菁 +菅 +菇 +菈 +菉 +菊 +菌 +菍 +菏 +菑 +菓 +菔 +菖 +菘 +菜 +菝 +菟 +菠 +菡 +菥 +菩 +菪 +菫 +華 +菰 +菱 +菲 +菴 +菶 +菸 +菹 +菺 +菼 +菽 +菾 +萁 +萃 +萄 +萇 +萊 +萌 +萍 +萎 +萐 +萘 +萜 +萠 +萡 +萣 +萩 +萬 +萭 +萱 +萵 +萸 +萹 +萼 +落 +葃 +葆 +葉 +葊 +葎 +葑 +葒 +著 +葙 +葚 +葛 +葜 +葝 +葡 +董 +葦 +葩 +葫 +葬 +葭 +葯 +葰 +葳 +葵 +葶 +葷 +葺 +蒂 +蒄 +蒍 +蒎 +蒐 +蒓 +蒔 +蒗 +蒙 +蒜 +蒞 +蒟 +蒡 +蒢 +蒤 +蒧 +蒨 +蒭 +蒯 +蒲 +蒴 +蒸 +蒹 +蒺 +蒻 +蒼 +蒽 +蒾 +蒿 +蓀 +蓁 +蓂 +蓄 +蓆 +蓉 +蓋 +蓍 +蓑 +蓓 +蓖 +蓘 +蓚 +蓧 +蓨 +蓪 +蓬 +蓭 +蓮 +蓯 +蓳 +蓼 +蓽 +蓿 +蔆 +蔎 +蔑 +蔓 +蔔 +蔕 +蔗 +蔘 +蔚 +蔝 +蔞 +蔡 +蔣 +蔥 +蔦 +蔬 +蔭 +蔴 +蔵 +蔻 +蔽 +蕁 +蕃 +蕅 +蕈 +蕉 +蕊 +蕎 +蕑 +蕒 +蕖 +蕘 +蕙 +蕚 +蕟 +蕡 +蕢 +蕤 +蕨 +蕩 +蕪 +蕭 +蕷 +蕹 +蕺 +蕻 +蕾 +薀 +薄 +薆 +薇 +薈 +薊 +薌 +薏 +薐 +薑 +薔 +薗 +薘 +薙 +薛 +薜 +薞 +薟 +薡 +薦 +薨 +薩 +薪 +薫 +薬 +薯 +薰 +薲 +薷 +薸 +薹 +薺 +薾 +薿 +藁 +藉 +藍 +藎 +藏 +藐 +藔 +藕 +藜 +藝 +藟 +藤 +藥 +藦 +藨 +藩 +藪 +藶 +藸 +藹 +藺 +藻 +藿 +蘂 +蘄 +蘅 +蘆 +蘇 +蘊 +蘋 +蘐 +蘑 +蘓 +蘗 +蘘 +蘚 +蘞 +蘢 +蘧 +蘩 +蘭 +蘵 +蘶 +蘸 +蘼 +蘿 +虉 +虎 +虐 +虓 +虔 +處 +虖 +虛 +虜 +虞 +號 +虢 +虧 +虨 +虯 +虱 +虵 +虹 +虺 +虻 +蚆 +蚊 +蚋 +蚌 +蚍 +蚓 +蚖 +蚜 +蚝 +蚡 +蚢 +蚣 +蚤 +蚧 +蚨 +蚩 +蚪 +蚯 +蚱 +蚴 +蚵 +蚶 +蚺 +蚼 +蛀 +蛄 +蛇 +蛉 +蛋 +蛍 +蛐 +蛑 +蛔 +蛙 +蛛 +蛞 +蛟 +蛤 +蛭 +蛯 +蛸 +蛹 +蛺 +蛻 +蛾 +蜀 +蜂 +蜃 +蜆 +蜇 +蜈 +蜉 +蜊 +蜍 +蜑 +蜒 +蜓 +蜘 +蜚 +蜛 +蜜 +蜞 +蜢 +蜣 +蜥 +蜨 +蜮 +蜯 +蜱 +蜴 +蜷 +蜻 +蜾 +蜿 +蝀 +蝌 +蝍 +蝎 +蝓 +蝕 +蝗 +蝘 +蝙 +蝚 +蝟 +蝠 +蝣 +蝤 +蝦 +蝨 +蝮 +蝯 +蝰 +蝲 +蝴 +蝶 +蝸 +蝽 +螂 +螃 +螄 +螅 +螈 +螋 +融 +螐 +螔 +螞 +螟 +螠 +螢 +螣 +螥 +螫 +螭 +螯 +螳 +螶 +螺 +螻 +螽 +螾 +蟀 +蟄 +蟅 +蟆 +蟊 +蟋 +蟌 +蟎 +蟑 +蟒 +蟜 +蟠 +蟥 +蟪 +蟫 +蟬 +蟯 +蟲 +蟳 +蟴 +蟶 +蟹 +蟻 +蟾 +蠂 +蠃 +蠄 +蠅 +蠆 +蠊 +蠋 +蠍 +蠐 +蠑 +蠓 +蠔 +蠕 +蠖 +蠘 +蠙 +蠟 +蠡 +蠢 +蠣 +蠱 +蠲 +蠵 +蠶 +蠷 +蠹 +蠻 +血 +衂 +衆 +行 +衍 +衎 +術 +衕 +衖 +街 +衙 +衚 +衛 +衜 +衝 +衞 +衡 +衢 +衣 +表 +衩 +衫 +衰 +衲 +衷 +衽 +衾 +衿 +袁 +袂 +袈 +袋 +袍 +袓 +袖 +袛 +袞 +袤 +袪 +被 +袱 +袴 +袾 +裁 +裂 +裊 +裎 +裒 +裔 +裕 +裖 +裘 +裙 +補 +裝 +裟 +裡 +裨 +裬 +裱 +裳 +裴 +裵 +裸 +裹 +製 +裾 +裿 +褀 +褂 +複 +褌 +褍 +褎 +褐 +褒 +褓 +褔 +褘 +褙 +褚 +褞 +褥 +褧 +褪 +褫 +褭 +褲 +褶 +褸 +褻 +襄 +襌 +襖 +襞 +襟 +襠 +襤 +襦 +襪 +襯 +襲 +襴 +襶 +襻 +襾 +西 +要 +覃 +覆 +覇 +覈 +見 +覌 +規 +覓 +視 +覚 +覡 +覦 +覧 +親 +覬 +覲 +観 +覺 +覽 +覿 +觀 +角 +觔 +觙 +觚 +觜 +解 +觭 +觱 +觴 +觶 +觸 +觿 +言 +訁 +訂 +訃 +訇 +計 +訊 +訌 +討 +訏 +訐 +訒 +訓 +訔 +訕 +訖 +託 +記 +訛 +訝 +訟 +訣 +訥 +訪 +設 +許 +訴 +訶 +診 +註 +証 +訾 +詁 +詆 +詈 +詐 +詒 +詔 +評 +詛 +詞 +詠 +詡 +詢 +詣 +詥 +試 +詧 +詩 +詫 +詭 +詮 +詰 +話 +該 +詳 +詵 +詹 +詼 +誄 +誅 +誇 +誌 +認 +誒 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +誨 +說 +説 +読 +誰 +課 +誴 +誹 +誼 +誾 +調 +談 +請 +諍 +諏 +諒 +論 +諗 +諜 +諟 +諠 +諡 +諤 +諦 +諧 +諪 +諫 +諭 +諮 +諱 +諲 +諳 +諴 +諶 +諷 +諸 +諺 +諼 +諾 +謀 +謁 +謂 +謄 +謇 +謊 +謌 +謎 +謏 +謐 +謔 +謖 +謗 +謙 +謚 +講 +謜 +謝 +謠 +謢 +謤 +謨 +謩 +謫 +謬 +謳 +謹 +謾 +證 +譏 +譓 +譔 +識 +譙 +譚 +譜 +譞 +警 +譫 +譬 +譭 +譯 +議 +譲 +譳 +譴 +護 +譽 +譿 +讀 +讃 +變 +讌 +讎 +讓 +讖 +讙 +讚 +讜 +讞 +谷 +谿 +豁 +豆 +豇 +豈 +豉 +豊 +豌 +豎 +豐 +豔 +豕 +豚 +象 +豢 +豨 +豪 +豫 +豬 +豳 +豸 +豹 +豺 +豿 +貂 +貅 +貉 +貊 +貌 +貐 +貒 +貓 +貔 +貘 +貝 +貞 +負 +財 +貢 +貤 +貧 +貨 +販 +貪 +貫 +責 +貭 +貮 +貯 +貲 +貳 +貴 +貶 +買 +貸 +貺 +費 +貼 +貽 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賑 +賒 +賓 +賔 +賕 +賚 +賜 +賞 +賠 +賡 +賢 +賣 +賤 +賦 +賨 +質 +賬 +賭 +賴 +賹 +賺 +賻 +購 +賽 +賾 +贄 +贅 +贇 +贈 +贊 +贌 +贍 +贏 +贓 +贔 +贖 +贛 +赤 +赦 +赧 +赫 +赬 +赭 +走 +赳 +赴 +起 +趁 +超 +越 +趐 +趕 +趖 +趙 +趟 +趣 +趨 +足 +趴 +趵 +趺 +趼 +趾 +跅 +跆 +跋 +跌 +跏 +跑 +跖 +跗 +跛 +距 +跟 +跡 +跣 +跤 +跨 +跩 +跪 +路 +跳 +踎 +踏 +踐 +踝 +踞 +踢 +踩 +踰 +踴 +踹 +踺 +蹂 +蹄 +蹇 +蹈 +蹉 +蹊 +蹋 +蹕 +蹙 +蹟 +蹠 +蹤 +蹦 +蹬 +蹭 +蹯 +蹲 +蹴 +蹶 +蹺 +蹻 +蹼 +躁 +躂 +躄 +躉 +躋 +躍 +躑 +躒 +躔 +躝 +躪 +身 +躬 +躰 +躲 +躺 +軀 +車 +軋 +軌 +軍 +軎 +軒 +軔 +軛 +軟 +転 +軫 +軲 +軸 +軹 +軺 +軻 +軼 +軽 +軾 +較 +輄 +輅 +載 +輋 +輒 +輓 +輔 +輕 
+輛 +輝 +輞 +輟 +輥 +輦 +輩 +輪 +輬 +輭 +輯 +輶 +輸 +輻 +輾 +輿 +轀 +轂 +轄 +轅 +轆 +轉 +轍 +轎 +轘 +轝 +轟 +轤 +辛 +辜 +辟 +辣 +辦 +辧 +辨 +辭 +辮 +辯 +辰 +辱 +農 +辵 +辺 +辻 +込 +迂 +迄 +迅 +迎 +近 +返 +迢 +迤 +迥 +迦 +迪 +迫 +迭 +迮 +述 +迴 +迵 +迷 +迸 +迺 +追 +退 +送 +逃 +逄 +逅 +逆 +逈 +逋 +逌 +逍 +逎 +透 +逐 +逑 +途 +逕 +逖 +逗 +這 +通 +逛 +逝 +逞 +速 +造 +逢 +連 +逤 +逨 +逮 +逯 +進 +逴 +逵 +逸 +逹 +逺 +逼 +逾 +遁 +遂 +遄 +遇 +遊 +運 +遍 +過 +遏 +遐 +遒 +道 +達 +違 +遘 +遙 +遛 +遜 +遞 +遠 +遢 +遣 +遨 +適 +遭 +遮 +遯 +遲 +遴 +遵 +遶 +遷 +選 +遹 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邈 +邉 +邊 +邋 +邏 +邑 +邕 +邗 +邙 +邛 +邠 +邡 +邢 +那 +邦 +邨 +邪 +邯 +邰 +邱 +邲 +邳 +邴 +邵 +邸 +邽 +邾 +郁 +郃 +郄 +郅 +郇 +郊 +郋 +郎 +郗 +郛 +郜 +郝 +郞 +郟 +郡 +郢 +郤 +部 +郪 +郫 +郭 +郯 +郳 +郴 +郵 +郷 +都 +郾 +郿 +鄂 +鄃 +鄄 +鄆 +鄉 +鄋 +鄑 +鄒 +鄔 +鄖 +鄗 +鄘 +鄙 +鄚 +鄜 +鄞 +鄠 +鄢 +鄣 +鄤 +鄧 +鄩 +鄫 +鄭 +鄯 +鄰 +鄱 +鄲 +鄳 +鄴 +鄺 +酃 +酆 +酈 +酉 +酊 +酋 +酌 +配 +酎 +酏 +酐 +酒 +酔 +酗 +酚 +酞 +酡 +酢 +酣 +酥 +酩 +酪 +酬 +酮 +酯 +酰 +酴 +酵 +酶 +酷 +酸 +酺 +酼 +醁 +醂 +醃 +醅 +醇 +醉 +醋 +醌 +醍 +醐 +醒 +醚 +醛 +醜 +醞 +醢 +醣 +醪 +醫 +醬 +醮 +醯 +醴 +醺 +醾 +醿 +釀 +釁 +釆 +采 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釒 +釓 +釔 +釕 +釗 +釘 +釙 +釚 +釜 +針 +釣 +釤 +釦 +釧 +釩 +釪 +釭 +釴 +釵 +釷 +釹 +釺 +鈀 +鈁 +鈄 +鈇 +鈈 +鈉 +鈊 +鈍 +鈏 +鈐 +鈑 +鈔 +鈕 +鈖 +鈞 +鈢 +鈣 +鈥 +鈦 +鈫 +鈮 +鈰 +鈳 +鈴 +鈷 +鈸 +鈹 +鈺 +鈾 +鈿 +鉀 +鉄 +鉅 +鉆 +鉈 +鉉 +鉋 +鉌 +鉍 +鉏 +鉑 +鉓 +鉗 +鉚 +鉛 +鉞 +鉟 +鉤 +鉦 +鉬 +鉭 +鉲 +鉶 +鉷 +鉸 +鉻 +鉾 +鉿 +銀 +銂 +銃 +銅 +銋 +銍 +銑 +銓 +銕 +銖 +銘 +銚 +銜 +銠 +銣 +銥 +銦 +銨 +銩 +銪 +銫 +銬 +銭 +銱 +銲 +銳 +銶 +銷 +銹 +銻 +銼 +銾 +鋁 +鋅 +鋆 +鋇 +鋌 +鋏 +鋐 +鋒 +鋕 +鋗 +鋙 +鋡 +鋤 +鋥 +鋦 +鋨 +鋪 +鋮 +鋯 +鋰 +鋱 +鋳 +鋶 +鋸 +鋹 +鋼 +錀 +錄 +錏 +錐 +錒 +錕 +錘 +錚 +錞 +錟 +錠 +錡 +錢 +錦 +錨 +錫 +錬 +錮 +錯 +錳 +錶 +錸 +錻 +鍀 +鍇 +鍈 +鍉 +鍊 +鍋 +鍍 +鍏 +鍔 +鍘 +鍛 +鍝 +鍟 +鍠 +鍥 +鍩 +鍬 +鍱 +鍳 +鍵 +鍶 +鍷 +鍺 +鍼 +鍾 +鎂 +鎅 +鎊 +鎌 +鎏 +鎓 +鎔 +鎖 +鎗 +鎘 +鎚 +鎛 +鎢 +鎣 +鎦 +鎧 +鎪 +鎬 +鎭 +鎮 +鎰 +鎳 +鎵 +鎻 +鏃 +鏇 +鏈 +鏊 +鏌 +鏐 +鏑 +鏓 +鏖 +鏗 +鏘 +鏜 +鏝 +鏞 +鏟 +鏡 +鏢 +鏤 +鏦 +鏳 +鏴 +鏵 +鏷 +鏻 +鏽 +鐃 +鐇 +鐈 +鐓 +鐔 +鐘 +鐙 +鐠 +鐡 +鐤 +鐦 +鐧 +鐫 +鐬 +鐭 +鐮 +鐲 +鐳 +鐵 +鐸 +鐺 +鐽 +鐿 +鑀 +鑁 +鑂 +鑄 +鑅 +鑊 +鑌 +鑑 +鑒 +鑛 +鑠 +鑣 +鑨 +鑪 +鑫 +鑭 +鑰 +鑲 +鑴 +鑷 +鑼 +鑽 +鑾 +鑿 +長 +門 +閂 +閃 +閆 +閉 +開 +閎 +閏 +閑 +閒 +間 +閔 +閘 +閜 +閞 +閟 +関 +閣 +閥 +閦 +閨 +閩 +閬 +閭 +閰 +閱 +閶 +閹 +閻 +閼 +閾 +閿 +闆 +闇 +闈 +闊 +闋 +闌 +闍 +闐 +闓 +闔 +闕 +闖 +闘 +關 +闞 +闡 +闢 +闥 +阜 +阝 +阡 +阪 +阭 +阮 +阯 +阱 +防 +阻 +阿 +陀 +陁 +陂 +附 +陋 +陌 +降 +限 +陔 +陘 +陛 +陜 +陝 +陞 +陟 +陡 +院 +陣 +除 +陪 +陬 +陰 +陲 +陳 +陵 +陶 +陷 +陸 +険 +陽 +隄 +隅 +隆 +隈 +隊 +隋 +隍 +階 +隔 +隕 +隗 +隘 +隙 +際 +障 +隣 +隧 +隨 +險 +隰 +隱 +隲 +隳 +隴 +隷 +隸 +隹 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雋 +雌 +雍 +雎 +雑 +雒 +雕 +雖 +雙 +雛 +雜 +雝 +雞 +離 +難 +雨 +雩 +雪 +雫 +雯 +雱 +雲 +零 +雷 +雹 +電 +需 +霄 +霅 +霆 +震 +霈 +霉 +霊 +霍 +霎 +霏 +霑 +霓 +霖 +霙 +霜 +霞 +霤 +霧 +霨 +霰 +露 +霶 +霸 +霹 +霽 +霾 +靁 +靂 +靄 +靈 +靉 +靑 +青 +靖 +靚 +靛 +靜 +非 +靠 +靡 +面 +革 +靫 +靬 +靭 +靳 +靴 +靶 +靺 +靼 +鞅 +鞆 +鞋 +鞍 +鞏 +鞘 +鞞 +鞠 +鞣 +鞥 +鞦 +鞨 +鞭 +鞮 +鞴 +韁 +韃 +韆 +韋 +韌 +韑 +韓 +韙 +韜 +韞 +韠 +韡 +韭 +韮 +音 +韶 +韺 +韻 +韾 +響 +頁 +頂 +頃 +項 +順 +須 +頊 +頌 +頍 +頎 +頏 +預 +頑 +頒 +頓 +頔 +頗 +領 +頜 +頠 +頡 +頤 +頦 +頫 +頭 +頰 +頴 +頵 +頷 +頸 +頹 +頻 +頼 +顆 +題 +額 +顎 +顏 +顒 +顓 +顔 +顕 +顗 +願 +顙 +顛 +類 +顥 +顧 +顫 +顯 +顰 +顱 +顳 +顴 +風 +颮 +颯 +颱 +颶 +颺 +颼 +飄 +飆 +飈 +飛 +食 +飠 +飡 +飢 +飥 +飩 +飪 +飫 +飬 +飭 +飮 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餄 +餅 +餉 +養 +餌 +餎 +餐 +餒 +餓 +餗 +餘 +餚 +餛 +餞 +餠 +餡 +館 +餮 +餵 +餺 +餾 +餿 +饃 +饅 +饋 +饌 +饑 +饒 +饕 +饗 +饞 +饟 +饢 +首 +馗 +馘 +香 +馛 +馥 +馦 +馨 +馬 +馭 +馮 +馯 +馱 +馳 +馴 +馼 +駁 +駄 +駅 +駆 +駐 +駑 +駒 +駔 +駕 +駘 +駙 +駛 +駝 +駟 +駢 +駭 +駰 +駱 +駿 +騁 +騂 +騄 +騅 +騋 +騎 +騏 +験 +騖 +騙 +騤 +騨 +騫 +騭 +騮 +騰 +騶 +騷 +騾 +驁 +驃 +驄 +驅 +驊 +驌 +驍 +驎 +驒 +驕 +驗 +驚 +驛 +驟 +驢 +驤 +驥 +驩 +驪 +骨 +骯 +骰 +骶 +骷 +骸 +骼 +髀 +髂 +髎 +髏 +髑 +髒 +髓 +體 +高 +髙 +髡 +髦 +髪 +髭 +髮 +髯 +髲 +髷 +髹 +髻 +鬃 +鬄 +鬅 +鬆 +鬍 +鬚 +鬟 +鬢 +鬣 +鬥 +鬧 +鬨 +鬩 +鬪 +鬬 +鬮 +鬯 +鬱 +鬲 +鬹 +鬻 +鬼 +魁 +魂 +魃 +魄 +魅 +魈 +魋 +魍 +魎 +魏 +魔 +魕 +魘 +魚 +魛 +魞 +魟 +魣 +魨 +魩 +魮 +魯 +魴 +魷 +鮀 +鮁 +鮃 +鮄 +鮊 +鮋 +鮍 +鮐 +鮑 +鮒 +鮓 +鮗 +鮜 +鮟 +鮠 +鮡 +鮣 +鮨 +鮪 +鮫 +鮭 +鮮 +鮰 +鮸 +鮹 +鮻 +鯀 +鯁 +鯃 +鯇 +鯉 +鯊 +鯏 +鯒 +鯓 +鯔 +鯕 +鯖 +鯗 +鯙 +鯛 +鯡 +鯢 +鯤 +鯧 +鯨 +鯪 +鯭 +鯮 +鯰 +鯶 +鯷 +鯻 +鯽 +鯿 +鰂 +鰃 +鰆 +鰈 +鰉 +鰍 +鰏 +鰒 +鰓 +鰕 +鰗 +鰛 +鰜 +鰟 +鰣 +鰤 +鰧 +鰨 +鰩 +鰭 +鰮 +鰱 +鰲 +鰳 +鰶 +鰷 +鰹 +鰺 +鰻 +鰼 +鰾 +鱀 +鱂 +鱅 +鱇 +鱈 +鱉 +鱊 +鱒 
+鱓 +鱔 +鱖 +鱗 +鱘 +鱚 +鱝 +鱟 +鱠 +鱣 +鱥 +鱧 +鱨 +鱬 +鱮 +鱰 +鱲 +鱵 +鱷 +鱸 +鱺 +鱻 +鳥 +鳧 +鳩 +鳯 +鳰 +鳳 +鳴 +鳶 +鳽 +鴆 +鴇 +鴉 +鴒 +鴓 +鴕 +鴗 +鴛 +鴝 +鴞 +鴟 +鴡 +鴣 +鴦 +鴨 +鴫 +鴯 +鴰 +鴴 +鴻 +鴿 +鵂 +鵄 +鵎 +鵐 +鵑 +鵒 +鵓 +鵙 +鵜 +鵝 +鵞 +鵟 +鵠 +鵡 +鵪 +鵬 +鵯 +鵰 +鵲 +鵵 +鵼 +鵾 +鶆 +鶇 +鶉 +鶏 +鶒 +鶓 +鶘 +鶚 +鶡 +鶥 +鶩 +鶬 +鶯 +鶲 +鶴 +鶹 +鶺 +鶻 +鶼 +鶿 +鷂 +鷄 +鷉 +鷎 +鷓 +鷗 +鷙 +鷚 +鷟 +鷥 +鷦 +鷫 +鷯 +鷲 +鷳 +鷸 +鷹 +鷺 +鸊 +鸌 +鸐 +鸑 +鸕 +鸘 +鸚 +鸛 +鸜 +鸝 +鸞 +鹮 +鹵 +鹹 +鹼 +鹽 +鹿 +麂 +麅 +麇 +麈 +麊 +麋 +麐 +麒 +麓 +麗 +麝 +麞 +麟 +麥 +麩 +麪 +麯 +麴 +麵 +麹 +麺 +麻 +麼 +麽 +麾 +麿 +黁 +黃 +黇 +黌 +黍 +黎 +黏 +黐 +黑 +黒 +黔 +默 +黙 +黛 +黜 +黝 +點 +黟 +黥 +黧 +黨 +黯 +黴 +黶 +黻 +黼 +黽 +黿 +鼂 +鼇 +鼈 +鼉 +鼎 +鼐 +鼒 +鼓 +鼕 +鼙 +鼠 +鼢 +鼩 +鼬 +鼯 +鼱 +鼴 +鼷 +鼻 +鼽 +鼾 +齊 +齋 +齒 +齕 +齡 +齣 +齦 +齧 +齲 +齶 +龍 +龎 +龐 +龑 +龔 +龕 +龜 +龝 +龠 +龢 +郎 +凉 +﹑ +﹗ +﹝ +﹞ +﹢ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +K +L +M +N +O +P +R +S +T +U +V +W +Y +Z +[ +] +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +r +s +t +u +z +{ +| +} +~ +¥ +𣇉 + diff --git a/batch_running_task/pytorchocr/utils/dict/cyrillic_dict.txt b/batch_running_task/pytorchocr/utils/dict/cyrillic_dict.txt new file mode 100644 index 0000000..2b6f664 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/cyrillic_dict.txt @@ -0,0 +1,163 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +Ё +Є +І +Ј +Љ +Ў +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ы +Ь +Э +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я +ё +ђ +є +і +ј +љ +њ +ћ +ў +џ +Ґ +ґ diff --git a/batch_running_task/pytorchocr/utils/dict/devanagari_dict.txt b/batch_running_task/pytorchocr/utils/dict/devanagari_dict.txt new file mode 100644 index 0000000..f559230 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/devanagari_dict.txt @@ -0,0 +1,167 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +ऩ +प +फ +ब +भ +म +य +र +ऱ +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +॒ +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +ॠ +। +० +१ +२ +३ +४ +५ +६ +७ +८ +९ +॰ diff --git a/batch_running_task/pytorchocr/utils/dict/en_dict.txt b/batch_running_task/pytorchocr/utils/dict/en_dict.txt new file mode 100644 index 0000000..6fbd99f --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/en_dict.txt @@ -0,0 +1,63 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + diff --git a/batch_running_task/pytorchocr/utils/dict/es_dict.txt b/batch_running_task/pytorchocr/utils/dict/es_dict.txt new file mode 100644 index 0000000..f195f1e --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/es_dict.txt @@ -0,0 +1,110 @@ +x +i +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +a +l +3 +6 +4 +5 +. +j +p + +Q +u +e +r +o +8 +7 +n +c +9 +t +b +é +q +d +ó +y +F +s +, +O +í +T +f +" +U +M +h +: +P +H +A +E +D +z +N +á +ñ +ú +% +; +è ++ +Y +- +B +G +( +) +¿ +? +w +¡ +! 
+X +É +K +k +Á +ü +Ú +« +» +J +' +ö +W +Z +º +Ö +­ +[ +] +Ç +ç +à +ä +û +ò +Í +ê +ô +ø +ª diff --git a/batch_running_task/pytorchocr/utils/dict/fa_dict.txt b/batch_running_task/pytorchocr/utils/dict/fa_dict.txt new file mode 100644 index 0000000..2328fbd --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/fa_dict.txt @@ -0,0 +1,136 @@ +f +a +_ +i +m +g +/ +1 +3 +I +L +S +V +R +C +2 +0 +v +l +6 +8 +5 +. +j +p +و +د +ر +ك +ن +ش +ه +ا +4 +9 +ی +ج +ِ +7 +غ +ل +س +ز +ّ +ت +ک +گ +ي +م +ب +ف +چ +خ +ق +ژ +آ +ص +پ +َ +ع +ئ +ح +ٔ +ض +ُ +ذ +أ +ى +ط +ظ +ث +ة +ً +ء +ؤ +ْ +ۀ +إ +ٍ +ٌ +ٰ +ٓ +ٱ +s +c +e +n +w +N +E +W +Y +D +O +H +A +d +z +r +T +G +o +t +x +h +b +B +M +Z +u +P +F +y +q +U +K +k +J +Q +' +X +# +? +% +$ +, +: +& +! +- +( +É +@ +é ++ + diff --git a/batch_running_task/pytorchocr/utils/dict/french_dict.txt b/batch_running_task/pytorchocr/utils/dict/french_dict.txt new file mode 100644 index 0000000..e8f657d --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/french_dict.txt @@ -0,0 +1,136 @@ +f +e +n +c +h +_ +i +m +g +/ +r +v +a +l +t +w +o +d +6 +1 +. +p +B +u +2 +à +3 +R +y +4 +U +E +A +5 +P +O +S +T +D +7 +Z +8 +I +N +L +G +M +H +0 +J +K +- +9 +F +C +V +é +X +' +s +Q +: +è +x +b +Y +Œ +É +z +W +Ç +È +k +Ô +ô +€ +À +Ê +q +ù +° +ê +î +* + +j +" +, +â +% +û +ç +ü +? +! +; +ö +( +) +ï +º +ó +ø +å ++ +™ +á +Ë +< +² +Á +Î +& +@ +œ +ε +Ü +ë +[ +] +í +ò +Ö +ä +ß +« +» +ú +ñ +æ +µ +³ +Å +$ +# + diff --git a/batch_running_task/pytorchocr/utils/dict/german_dict.txt b/batch_running_task/pytorchocr/utils/dict/german_dict.txt new file mode 100644 index 0000000..5e121af --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/german_dict.txt @@ -0,0 +1,143 @@ + +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +£ +§ +­ +° +´ +µ +· +º +¿ +Á +Ä +Å +É +Ï +Ô +Ö +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ï +ñ +ò +ó +ô +ö +ø +ù +ú +û +ü +ō +Š +Ÿ +ʒ +β +δ +з +Ṡ +‘ +€ +© +ª +« +¬ diff --git a/batch_running_task/pytorchocr/utils/dict/hi_dict.txt b/batch_running_task/pytorchocr/utils/dict/hi_dict.txt new file mode 100644 index 0000000..8dfedb5 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/hi_dict.txt @@ -0,0 +1,162 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +० +१ +२ +३ +४ +५ +६ +७ +८ +९ +॰ diff --git a/batch_running_task/pytorchocr/utils/dict/it_dict.txt b/batch_running_task/pytorchocr/utils/dict/it_dict.txt new file mode 100644 index 0000000..e692c6d --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/it_dict.txt @@ -0,0 +1,118 @@ +i +t +_ +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +7 +8 +9 +6 +. +j +p + +e +r +o +d +s +n +3 +4 +P +u +c +A +- +, +" +z +h +f +b +q +ì +' +à +O +è +G +ù +é +ò +; +F +E +B +N +H +k +: +U +T +X +D +K +? +[ +M +­ +x +y +( +) +W +ö +º +w +] +Q +J ++ +ü +! 
+È +á +% += +» +ñ +Ö +Y +ä +í +Z +« +@ +ó +ø +ï +ú +ê +ç +Á +É +Å +ß +{ +} +& +` +û +î +# +$ diff --git a/batch_running_task/pytorchocr/utils/dict/japan_dict.txt b/batch_running_task/pytorchocr/utils/dict/japan_dict.txt new file mode 100644 index 0000000..339d4b8 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/japan_dict.txt @@ -0,0 +1,4399 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +© +° +² +´ +½ +Á +Ä +Å +Ç +È +É +Í +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ğ +ī +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ż +Ž +ž +Ș +ș +ț +Δ +α +λ +μ +φ +Г +О +а +в +л +о +р +с +т +я +ồ +​ +— +― +’ +“ +” +… +℃ +→ +∇ +− +■ +☆ +  +、 +。 +々 +〆 +〈 +〉 +「 +」 +『 +』 +〔 +〕 +〜 +ぁ +あ +ぃ +い +う +ぇ +え +ぉ +お +か +が +き +ぎ +く +ぐ +け +げ +こ +ご +さ +ざ +し +じ +す +ず +せ +ぜ +そ +ぞ +た +だ +ち +ぢ +っ +つ +づ +て +で +と +ど +な +に +ぬ +ね +の +は +ば +ぱ +ひ +び +ぴ +ふ +ぶ +ぷ +へ +べ +ぺ +ほ +ぼ +ぽ +ま +み +む +め +も +ゃ +や +ゅ +ゆ +ょ +よ +ら +り +る +れ +ろ +わ +ゑ +を +ん +ゝ +ゞ +ァ +ア +ィ +イ +ゥ +ウ +ェ +エ +ォ +オ +カ +ガ +キ +ギ +ク +グ +ケ +ゲ +コ +ゴ +サ +ザ +シ +ジ +ス +ズ +セ +ゼ +ソ +ゾ +タ +ダ +チ +ヂ +ッ +ツ +ヅ +テ +デ +ト +ド +ナ +ニ +ヌ +ネ +ノ +ハ +バ +パ +ヒ +ビ +ピ +フ +ブ +プ +ヘ +ベ +ペ +ホ +ボ +ポ +マ +ミ +ム +メ +モ +ャ +ヤ +ュ +ユ +ョ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヰ +ン +ヴ +ヵ +ヶ +・ +ー +㈱ +一 +丁 +七 +万 +丈 +三 +上 +下 +不 +与 +丑 +且 +世 +丘 +丙 +丞 +両 +並 +中 +串 +丸 +丹 +主 +丼 +丿 +乃 +久 +之 +乎 +乏 +乗 +乘 +乙 +九 +乞 +也 +乱 +乳 +乾 +亀 +了 +予 +争 +事 +二 +于 +互 +五 +井 +亘 +亙 +些 +亜 +亟 +亡 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +人 +什 +仁 +仇 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +代 +令 +以 +仮 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +会 +伝 +伯 +估 +伴 +伶 +伸 +伺 +似 +伽 +佃 +但 +位 +低 +住 +佐 +佑 +体 +何 +余 +佚 +佛 +作 +佩 +佳 +併 +佶 +使 +侈 +例 +侍 +侏 +侑 +侘 +供 +依 +侠 +価 +侮 +侯 +侵 +侶 +便 +係 +促 +俄 +俊 +俔 +俗 +俘 +保 +信 +俣 +俤 +修 +俯 +俳 +俵 +俸 +俺 +倉 +個 +倍 +倒 +候 +借 +倣 +値 +倫 +倭 +倶 +倹 +偃 +假 +偈 +偉 +偏 +偐 +偕 +停 +健 +側 +偵 +偶 +偽 +傀 +傅 +傍 +傑 +傘 +備 +催 +傭 +傲 +傳 +債 +傷 +傾 +僊 +働 +像 +僑 +僕 +僚 +僧 +僭 +僮 +儀 +億 +儇 +儒 +儛 +償 +儡 +優 +儲 +儺 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +兎 +児 +党 +兜 +入 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +内 +円 +冊 +再 +冑 +冒 +冗 +写 +冠 +冤 +冥 +冨 +冬 +冲 +决 +冶 +冷 +准 +凉 +凋 +凌 +凍 +凛 +凝 +凞 +几 +凡 +処 +凪 +凰 +凱 +凶 +凸 +凹 +出 +函 +刀 +刃 +分 +切 +刈 +刊 +刎 +刑 +列 +初 +判 +別 +利 +刪 +到 +制 +刷 +券 +刹 +刺 +刻 +剃 +則 +削 +剋 +前 +剖 +剛 +剣 +剤 +剥 +剪 +副 +剰 +割 +創 +剽 +劇 +劉 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劭 +励 +労 +効 +劾 +勃 +勅 +勇 +勉 +勒 +動 +勘 +務 +勝 +募 +勢 +勤 +勧 +勲 +勺 +勾 +勿 +匁 +匂 +包 +匏 +化 +北 +匙 +匝 +匠 +匡 +匣 +匯 +匲 +匹 +区 +医 +匿 +十 +千 +升 +午 +卉 +半 +卍 +卑 +卒 +卓 +協 +南 +単 +博 +卜 +占 +卦 +卯 +印 +危 +即 +却 +卵 +卸 +卿 +厄 +厚 +原 +厠 +厨 +厩 +厭 +厳 +去 +参 +又 +叉 +及 +友 +双 +反 +収 +叔 +取 +受 +叙 +叛 +叟 +叡 +叢 +口 +古 +句 +叩 +只 +叫 +召 +可 +台 +叱 +史 +右 +叶 +号 +司 +吃 +各 +合 +吉 +吊 +同 +名 +后 +吏 +吐 +向 +君 +吝 +吟 +吠 +否 +含 +吸 +吹 +吻 +吽 +吾 +呂 +呆 +呈 +呉 +告 +呑 +周 +呪 +呰 +味 +呼 +命 +咀 +咄 +咋 +和 +咒 +咫 +咲 +咳 +咸 +哀 +品 +哇 +哉 +員 +哨 +哩 +哭 +哲 +哺 +唄 +唆 +唇 +唐 +唖 +唯 +唱 +唳 +唸 +唾 +啄 +商 +問 +啓 +啼 +善 +喋 +喚 +喜 +喝 +喧 +喩 +喪 +喫 +喬 +單 +喰 +営 +嗅 +嗇 +嗔 +嗚 +嗜 +嗣 +嘆 +嘉 +嘗 +嘘 +嘩 +嘯 +嘱 +嘲 +嘴 +噂 +噌 +噛 +器 +噴 +噺 +嚆 +嚢 +囀 +囃 +囉 +囚 +四 +回 +因 +団 +困 +囲 +図 +固 +国 +圀 +圃 +國 +圏 +園 +圓 +團 +圜 +土 +圧 +在 +圭 +地 +址 +坂 +均 +坊 +坐 +坑 +坡 +坤 +坦 +坪 +垂 +型 +垢 +垣 +埃 +埋 +城 +埒 +埔 +域 +埠 +埴 +埵 +執 +培 +基 +埼 +堀 +堂 +堅 +堆 +堕 +堤 +堪 +堯 +堰 +報 +場 +堵 +堺 +塀 +塁 +塊 +塑 +塔 +塗 +塘 +塙 +塚 +塞 +塩 +填 +塵 +塾 +境 +墉 +墓 +増 +墜 +墟 +墨 +墳 +墺 +墻 +墾 +壁 +壇 +壊 +壌 +壕 +士 +壬 +壮 +声 +壱 +売 +壷 +壹 +壺 +壽 +変 +夏 +夕 +外 +夙 +多 +夜 +夢 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +失 +夷 +夾 +奄 +奇 +奈 +奉 +奎 +奏 +契 +奔 +奕 +套 +奘 +奠 +奢 +奥 +奨 +奪 +奮 +女 +奴 +奸 +好 +如 +妃 +妄 +妊 +妍 +妓 +妖 +妙 +妥 +妨 +妬 +妲 +妹 +妻 +妾 +姉 +始 +姐 +姓 +委 +姚 
+姜 +姞 +姥 +姦 +姨 +姪 +姫 +姶 +姻 +姿 +威 +娑 +娘 +娟 +娠 +娩 +娯 +娼 +婆 +婉 +婚 +婢 +婦 +婬 +婿 +媄 +媒 +媓 +媚 +媛 +媞 +媽 +嫁 +嫄 +嫉 +嫌 +嫐 +嫗 +嫡 +嬉 +嬌 +嬢 +嬪 +嬬 +嬾 +孁 +子 +孔 +字 +存 +孚 +孝 +孟 +季 +孤 +学 +孫 +孵 +學 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宕 +宗 +官 +宙 +定 +宛 +宜 +宝 +実 +客 +宣 +室 +宥 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寂 +寄 +寅 +密 +寇 +富 +寒 +寓 +寔 +寛 +寝 +察 +寡 +實 +寧 +審 +寮 +寵 +寶 +寸 +寺 +対 +寿 +封 +専 +射 +将 +尉 +尊 +尋 +對 +導 +小 +少 +尖 +尚 +尤 +尪 +尭 +就 +尹 +尺 +尻 +尼 +尽 +尾 +尿 +局 +居 +屈 +届 +屋 +屍 +屎 +屏 +屑 +屓 +展 +属 +屠 +層 +履 +屯 +山 +岐 +岑 +岡 +岩 +岫 +岬 +岳 +岷 +岸 +峠 +峡 +峨 +峯 +峰 +島 +峻 +崇 +崋 +崎 +崑 +崖 +崗 +崛 +崩 +嵌 +嵐 +嵩 +嵯 +嶂 +嶋 +嶠 +嶺 +嶼 +嶽 +巀 +巌 +巒 +巖 +川 +州 +巡 +巣 +工 +左 +巧 +巨 +巫 +差 +己 +巳 +巴 +巷 +巻 +巽 +巾 +市 +布 +帆 +希 +帖 +帚 +帛 +帝 +帥 +師 +席 +帯 +帰 +帳 +帷 +常 +帽 +幄 +幅 +幇 +幌 +幔 +幕 +幟 +幡 +幢 +幣 +干 +平 +年 +并 +幸 +幹 +幻 +幼 +幽 +幾 +庁 +広 +庄 +庇 +床 +序 +底 +庖 +店 +庚 +府 +度 +座 +庫 +庭 +庵 +庶 +康 +庸 +廂 +廃 +廉 +廊 +廓 +廟 +廠 +廣 +廬 +延 +廷 +建 +廻 +廼 +廿 +弁 +弄 +弉 +弊 +弌 +式 +弐 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弥 +弦 +弧 +弱 +張 +強 +弼 +弾 +彈 +彊 +彌 +彎 +当 +彗 +彙 +彝 +形 +彦 +彩 +彫 +彬 +彭 +彰 +影 +彷 +役 +彼 +往 +征 +徂 +径 +待 +律 +後 +徐 +徑 +徒 +従 +得 +徠 +御 +徧 +徨 +復 +循 +徭 +微 +徳 +徴 +德 +徹 +徽 +心 +必 +忉 +忌 +忍 +志 +忘 +忙 +応 +忠 +快 +忯 +念 +忻 +忽 +忿 +怒 +怖 +思 +怠 +怡 +急 +性 +怨 +怪 +怯 +恂 +恋 +恐 +恒 +恕 +恣 +恤 +恥 +恨 +恩 +恬 +恭 +息 +恵 +悉 +悌 +悍 +悔 +悟 +悠 +患 +悦 +悩 +悪 +悲 +悼 +情 +惇 +惑 +惚 +惜 +惟 +惠 +惣 +惧 +惨 +惰 +想 +惹 +惺 +愈 +愉 +愍 +意 +愔 +愚 +愛 +感 +愷 +愿 +慈 +態 +慌 +慎 +慕 +慢 +慣 +慧 +慨 +慮 +慰 +慶 +憂 +憎 +憐 +憑 +憙 +憤 +憧 +憩 +憬 +憲 +憶 +憾 +懇 +應 +懌 +懐 +懲 +懸 +懺 +懽 +懿 +戈 +戊 +戌 +戎 +成 +我 +戒 +戔 +或 +戚 +戟 +戦 +截 +戮 +戯 +戴 +戸 +戻 +房 +所 +扁 +扇 +扈 +扉 +手 +才 +打 +払 +托 +扮 +扱 +扶 +批 +承 +技 +抄 +把 +抑 +抓 +投 +抗 +折 +抜 +択 +披 +抱 +抵 +抹 +押 +抽 +担 +拇 +拈 +拉 +拍 +拏 +拐 +拒 +拓 +拘 +拙 +招 +拝 +拠 +拡 +括 +拭 +拳 +拵 +拶 +拾 +拿 +持 +挂 +指 +按 +挑 +挙 +挟 +挨 +振 +挺 +挽 +挿 +捉 +捕 +捗 +捜 +捧 +捨 +据 +捺 +捻 +掃 +掄 +授 +掌 +排 +掖 +掘 +掛 +掟 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掲 +掴 +掻 +掾 +揃 +揄 +揆 +揉 +描 +提 +揖 +揚 +換 +握 +揮 +援 +揶 +揺 +損 +搦 +搬 +搭 +携 +搾 +摂 +摘 +摩 +摸 +摺 +撃 +撒 +撞 +撤 +撥 +撫 +播 +撮 +撰 +撲 +撹 +擁 +操 +擔 +擦 +擬 +擾 +攘 +攝 +攣 +支 +收 +改 +攻 +放 +政 +故 +敏 +救 +敗 +教 +敢 +散 +敦 +敬 +数 +整 +敵 +敷 +斂 +文 +斉 +斎 +斐 +斑 +斗 +料 +斜 +斟 +斤 +斥 +斧 +斬 +断 +斯 +新 +方 +於 +施 +旁 +旅 +旋 +旌 +族 +旗 +旛 +无 +旡 +既 +日 +旦 +旧 +旨 +早 +旬 +旭 +旺 +旻 +昂 +昆 +昇 +昉 +昌 +明 +昏 +易 +昔 +星 +映 +春 +昧 +昨 +昪 +昭 +是 +昵 +昼 +晁 +時 +晃 +晋 +晏 +晒 +晟 +晦 +晧 +晩 +普 +景 +晴 +晶 +智 +暁 +暇 +暈 +暉 +暑 +暖 +暗 +暘 +暢 +暦 +暫 +暮 +暲 +暴 +暹 +暾 +曄 +曇 +曉 +曖 +曙 +曜 +曝 +曠 +曰 +曲 +曳 +更 +書 +曹 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朋 +服 +朏 +朔 +朕 +朗 +望 +朝 +期 +朧 +木 +未 +末 +本 +札 +朱 +朴 +机 +朽 +杁 +杉 +李 +杏 +材 +村 +杓 +杖 +杜 +杞 +束 +条 +杢 +杣 +来 +杭 +杮 +杯 +東 +杲 +杵 +杷 +杼 +松 +板 +枅 +枇 +析 +枓 +枕 +林 +枚 +果 +枝 +枠 +枡 +枢 +枯 +枳 +架 +柄 +柊 +柏 +某 +柑 +染 +柔 +柘 +柚 +柯 +柱 +柳 +柴 +柵 +査 +柾 +柿 +栂 +栃 +栄 +栖 +栗 +校 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桁 +桂 +桃 +框 +案 +桐 +桑 +桓 +桔 +桜 +桝 +桟 +桧 +桴 +桶 +桾 +梁 +梅 +梆 +梓 +梔 +梗 +梛 +條 +梟 +梢 +梧 +梨 +械 +梱 +梲 +梵 +梶 +棄 +棋 +棒 +棗 +棘 +棚 +棟 +棠 +森 +棲 +棹 +棺 +椀 +椅 +椋 +植 +椎 +椏 +椒 +椙 +検 +椥 +椹 +椿 +楊 +楓 +楕 +楚 +楞 +楠 +楡 +楢 +楨 +楪 +楫 +業 +楮 +楯 +楳 +極 +楷 +楼 +楽 +概 +榊 +榎 +榕 +榛 +榜 +榮 +榱 +榴 +槃 +槇 +槊 +構 +槌 +槍 +槐 +様 +槙 +槻 +槽 +槿 +樂 +樋 +樓 +樗 +標 +樟 +模 +権 +横 +樫 +樵 +樹 +樺 +樽 +橇 +橋 +橘 +機 +橿 +檀 +檄 +檎 +檐 +檗 +檜 +檣 +檥 +檬 +檮 +檸 +檻 +櫃 +櫓 +櫛 +櫟 +櫨 +櫻 +欄 +欅 +欠 +次 +欣 +欧 +欲 +欺 +欽 +款 +歌 +歎 +歓 +止 +正 +此 +武 +歩 +歪 +歯 +歳 +歴 +死 +殆 +殉 +殊 +残 +殖 +殯 +殴 +段 +殷 +殺 +殻 +殿 +毀 +毅 +母 +毎 +毒 +比 +毘 +毛 +毫 +毬 +氈 +氏 +民 +気 +水 +氷 +永 +氾 +汀 +汁 +求 +汎 +汐 +汗 +汚 +汝 +江 +池 +汪 +汰 +汲 +決 +汽 +沂 +沃 +沅 +沆 +沈 +沌 +沐 +沓 +沖 +沙 +没 +沢 +沱 +河 +沸 +油 +治 +沼 +沽 +沿 +況 +泉 +泊 +泌 +法 +泗 +泡 +波 +泣 +泥 +注 +泯 +泰 +泳 +洋 +洒 +洗 +洛 +洞 +津 +洩 +洪 +洲 +洸 +洹 +活 +洽 +派 +流 +浄 +浅 +浙 +浚 +浜 +浣 +浦 +浩 +浪 +浮 +浴 +海 +浸 +涅 +消 +涌 +涙 +涛 +涯 +液 +涵 +涼 +淀 +淄 +淆 +淇 +淋 +淑 +淘 +淡 +淤 +淨 +淫 +深 +淳 +淵 +混 +淹 +添 +清 +済 +渉 +渋 +渓 +渕 +渚 +減 +渟 +渠 +渡 +渤 +渥 +渦 +温 +渫 +測 +港 +游 +渾 +湊 +湖 +湘 +湛 +湧 +湫 +湯 +湾 +湿 +満 +源 +準 +溜 +溝 +溢 +溥 +溪 +溶 +溺 +滄 +滅 +滋 +滌 +滑 +滕 +滝 +滞 +滴 +滸 +滹 +滿 +漁 +漂 +漆 +漉 +漏 +漑 +演 +漕 +漠 
+漢 +漣 +漫 +漬 +漱 +漸 +漿 +潅 +潔 +潙 +潜 +潟 +潤 +潭 +潮 +潰 +潴 +澁 +澂 +澄 +澎 +澗 +澤 +澪 +澱 +澳 +激 +濁 +濃 +濟 +濠 +濡 +濤 +濫 +濯 +濱 +濾 +瀉 +瀋 +瀑 +瀕 +瀞 +瀟 +瀧 +瀬 +瀾 +灌 +灑 +灘 +火 +灯 +灰 +灸 +災 +炉 +炊 +炎 +炒 +炭 +炮 +炷 +点 +為 +烈 +烏 +烙 +烝 +烹 +焔 +焙 +焚 +無 +焦 +然 +焼 +煇 +煉 +煌 +煎 +煕 +煙 +煤 +煥 +照 +煩 +煬 +煮 +煽 +熈 +熊 +熙 +熟 +熨 +熱 +熹 +熾 +燃 +燈 +燎 +燔 +燕 +燗 +燥 +燭 +燻 +爆 +爐 +爪 +爬 +爲 +爵 +父 +爺 +爼 +爽 +爾 +片 +版 +牌 +牒 +牘 +牙 +牛 +牝 +牟 +牡 +牢 +牧 +物 +牲 +特 +牽 +犂 +犠 +犬 +犯 +状 +狂 +狄 +狐 +狗 +狙 +狛 +狡 +狩 +独 +狭 +狷 +狸 +狼 +猊 +猛 +猟 +猥 +猨 +猩 +猪 +猫 +献 +猴 +猶 +猷 +猾 +猿 +獄 +獅 +獏 +獣 +獲 +玄 +玅 +率 +玉 +王 +玖 +玩 +玲 +珀 +珂 +珈 +珉 +珊 +珍 +珎 +珞 +珠 +珣 +珥 +珪 +班 +現 +球 +理 +琉 +琢 +琥 +琦 +琮 +琲 +琳 +琴 +琵 +琶 +瑁 +瑋 +瑙 +瑚 +瑛 +瑜 +瑞 +瑠 +瑤 +瑩 +瑪 +瑳 +瑾 +璃 +璋 +璜 +璞 +璧 +璨 +環 +璵 +璽 +璿 +瓊 +瓔 +瓜 +瓢 +瓦 +瓶 +甍 +甑 +甕 +甘 +甚 +甞 +生 +産 +甥 +用 +甫 +田 +由 +甲 +申 +男 +町 +画 +界 +畏 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +番 +異 +畳 +當 +畷 +畸 +畺 +畿 +疆 +疇 +疋 +疎 +疏 +疑 +疫 +疱 +疲 +疹 +疼 +疾 +病 +症 +痒 +痔 +痕 +痘 +痙 +痛 +痢 +痩 +痴 +痺 +瘍 +瘡 +瘧 +療 +癇 +癌 +癒 +癖 +癡 +癪 +発 +登 +白 +百 +的 +皆 +皇 +皋 +皐 +皓 +皮 +皺 +皿 +盂 +盃 +盆 +盈 +益 +盒 +盗 +盛 +盞 +盟 +盡 +監 +盤 +盥 +盧 +目 +盲 +直 +相 +盾 +省 +眉 +看 +県 +眞 +真 +眠 +眷 +眺 +眼 +着 +睡 +督 +睦 +睨 +睿 +瞋 +瞑 +瞞 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞿 +矍 +矛 +矜 +矢 +知 +矧 +矩 +短 +矮 +矯 +石 +砂 +砌 +研 +砕 +砥 +砦 +砧 +砲 +破 +砺 +硝 +硫 +硬 +硯 +碁 +碇 +碌 +碑 +碓 +碕 +碗 +碣 +碧 +碩 +確 +碾 +磁 +磐 +磔 +磧 +磨 +磬 +磯 +礁 +礎 +礒 +礙 +礫 +礬 +示 +礼 +社 +祀 +祁 +祇 +祈 +祉 +祐 +祓 +祕 +祖 +祗 +祚 +祝 +神 +祟 +祠 +祢 +祥 +票 +祭 +祷 +祺 +禁 +禄 +禅 +禊 +禍 +禎 +福 +禔 +禖 +禛 +禦 +禧 +禮 +禰 +禹 +禽 +禿 +秀 +私 +秋 +科 +秒 +秘 +租 +秤 +秦 +秩 +称 +移 +稀 +程 +税 +稔 +稗 +稙 +稚 +稜 +稠 +種 +稱 +稲 +稷 +稻 +稼 +稽 +稿 +穀 +穂 +穆 +積 +穎 +穏 +穗 +穜 +穢 +穣 +穫 +穴 +究 +空 +突 +窃 +窄 +窒 +窓 +窟 +窠 +窩 +窪 +窮 +窯 +竃 +竄 +竈 +立 +站 +竜 +竝 +竟 +章 +童 +竪 +竭 +端 +竴 +競 +竹 +竺 +竽 +竿 +笄 +笈 +笏 +笑 +笙 +笛 +笞 +笠 +笥 +符 +第 +笹 +筅 +筆 +筇 +筈 +等 +筋 +筌 +筍 +筏 +筐 +筑 +筒 +答 +策 +筝 +筥 +筧 +筬 +筮 +筯 +筰 +筵 +箆 +箇 +箋 +箏 +箒 +箔 +箕 +算 +箙 +箜 +管 +箪 +箭 +箱 +箸 +節 +篁 +範 +篆 +篇 +築 +篋 +篌 +篝 +篠 +篤 +篥 +篦 +篩 +篭 +篳 +篷 +簀 +簒 +簡 +簧 +簪 +簫 +簺 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籟 +籠 +籤 +籬 +米 +籾 +粂 +粉 +粋 +粒 +粕 +粗 +粘 +粛 +粟 +粥 +粧 +粮 +粳 +精 +糊 +糖 +糜 +糞 +糟 +糠 +糧 +糯 +糸 +糺 +系 +糾 +紀 +約 +紅 +紋 +納 +紐 +純 +紗 +紘 +紙 +級 +紛 +素 +紡 +索 +紫 +紬 +累 +細 +紳 +紵 +紹 +紺 +絁 +終 +絃 +組 +絅 +経 +結 +絖 +絞 +絡 +絣 +給 +統 +絲 +絵 +絶 +絹 +絽 +綏 +經 +継 +続 +綜 +綟 +綬 +維 +綱 +網 +綴 +綸 +綺 +綽 +綾 +綿 +緊 +緋 +総 +緑 +緒 +線 +締 +緥 +編 +緩 +緬 +緯 +練 +緻 +縁 +縄 +縅 +縒 +縛 +縞 +縢 +縣 +縦 +縫 +縮 +縹 +總 +績 +繁 +繊 +繋 +繍 +織 +繕 +繝 +繦 +繧 +繰 +繹 +繼 +纂 +纈 +纏 +纐 +纒 +纛 +缶 +罔 +罠 +罧 +罪 +置 +罰 +署 +罵 +罷 +罹 +羂 +羅 +羆 +羇 +羈 +羊 +羌 +美 +群 +羨 +義 +羯 +羲 +羹 +羽 +翁 +翅 +翌 +習 +翔 +翛 +翠 +翡 +翫 +翰 +翺 +翻 +翼 +耀 +老 +考 +者 +耆 +而 +耐 +耕 +耗 +耨 +耳 +耶 +耽 +聊 +聖 +聘 +聚 +聞 +聟 +聡 +聨 +聯 +聰 +聲 +聴 +職 +聾 +肄 +肆 +肇 +肉 +肋 +肌 +肖 +肘 +肛 +肝 +股 +肢 +肥 +肩 +肪 +肯 +肱 +育 +肴 +肺 +胃 +胆 +背 +胎 +胖 +胚 +胝 +胞 +胡 +胤 +胱 +胴 +胸 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脚 +脛 +脩 +脱 +脳 +腋 +腎 +腐 +腑 +腔 +腕 +腫 +腰 +腱 +腸 +腹 +腺 +腿 +膀 +膏 +膚 +膜 +膝 +膠 +膣 +膨 +膩 +膳 +膵 +膾 +膿 +臂 +臆 +臈 +臍 +臓 +臘 +臚 +臣 +臥 +臨 +自 +臭 +至 +致 +臺 +臼 +舂 +舅 +與 +興 +舌 +舍 +舎 +舒 +舖 +舗 +舘 +舜 +舞 +舟 +舩 +航 +般 +舳 +舶 +船 +艇 +艘 +艦 +艮 +良 +色 +艶 +芋 +芒 +芙 +芝 +芥 +芦 +芬 +芭 +芯 +花 +芳 +芸 +芹 +芻 +芽 +芿 +苅 +苑 +苔 +苗 +苛 +苞 +苡 +若 +苦 +苧 +苫 +英 +苴 +苻 +茂 +范 +茄 +茅 +茎 +茗 +茘 +茜 +茨 +茲 +茵 +茶 +茸 +茹 +草 +荊 +荏 +荒 +荘 +荷 +荻 +荼 +莞 +莪 +莫 +莬 +莱 +莵 +莽 +菅 +菊 +菌 +菓 +菖 +菘 +菜 +菟 +菩 +菫 +華 +菱 +菴 +萄 +萊 +萌 +萍 +萎 +萠 +萩 +萬 +萱 +落 +葉 +著 +葛 +葡 +董 +葦 +葩 +葬 +葭 +葱 +葵 +葺 +蒋 +蒐 +蒔 +蒙 +蒟 +蒡 +蒲 +蒸 +蒻 +蒼 +蒿 +蓄 +蓆 +蓉 +蓋 +蓑 +蓬 +蓮 +蓼 +蔀 +蔑 +蔓 +蔚 +蔡 +蔦 +蔬 +蔭 +蔵 +蔽 +蕃 +蕉 +蕊 +蕎 +蕨 +蕩 +蕪 +蕭 +蕾 +薄 +薇 +薊 +薔 +薗 +薙 +薛 +薦 +薨 +薩 +薪 +薫 +薬 +薭 +薮 +藁 +藉 +藍 +藏 +藐 +藝 +藤 +藩 +藪 +藷 +藹 +藺 +藻 +蘂 +蘆 +蘇 +蘊 +蘭 +虎 +虐 +虔 +虚 +虜 +虞 +號 +虫 +虹 +虻 +蚊 +蚕 +蛇 +蛉 +蛍 +蛎 +蛙 +蛛 +蛟 +蛤 +蛭 +蛮 +蛸 +蛹 +蛾 +蜀 +蜂 +蜃 +蜆 +蜊 +蜘 +蜜 +蜷 +蜻 +蝉 +蝋 +蝕 +蝙 +蝠 +蝦 +蝶 +蝿 +螂 +融 +螣 +螺 +蟄 +蟇 +蟠 +蟷 +蟹 +蟻 +蠢 +蠣 +血 +衆 +行 +衍 +衒 +術 +街 +衙 +衛 +衝 +衞 +衡 +衢 +衣 +表 +衫 +衰 +衵 +衷 +衽 +衾 +衿 +袁 +袈 +袋 +袍 +袒 +袖 +袙 +袞 +袢 +被 +袰 +袱 +袴 +袷 +袿 +裁 +裂 +裃 +装 +裏 +裔 +裕 +裘 +裙 
+補 +裟 +裡 +裲 +裳 +裴 +裸 +裹 +製 +裾 +褂 +褄 +複 +褌 +褐 +褒 +褥 +褪 +褶 +褻 +襄 +襖 +襞 +襟 +襠 +襦 +襪 +襲 +襴 +襷 +西 +要 +覆 +覇 +覈 +見 +規 +視 +覗 +覚 +覧 +親 +覲 +観 +覺 +觀 +角 +解 +触 +言 +訂 +計 +討 +訓 +託 +記 +訛 +訟 +訢 +訥 +訪 +設 +許 +訳 +訴 +訶 +診 +註 +証 +詐 +詔 +評 +詛 +詞 +詠 +詢 +詣 +試 +詩 +詫 +詮 +詰 +話 +該 +詳 +誄 +誅 +誇 +誉 +誌 +認 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +説 +読 +誰 +課 +誼 +誾 +調 +談 +請 +諌 +諍 +諏 +諒 +論 +諚 +諜 +諟 +諡 +諦 +諧 +諫 +諭 +諮 +諱 +諶 +諷 +諸 +諺 +諾 +謀 +謄 +謌 +謎 +謗 +謙 +謚 +講 +謝 +謡 +謫 +謬 +謹 +證 +識 +譚 +譛 +譜 +警 +譬 +譯 +議 +譲 +譴 +護 +讀 +讃 +讐 +讒 +谷 +谿 +豅 +豆 +豊 +豎 +豐 +豚 +象 +豪 +豫 +豹 +貌 +貝 +貞 +負 +財 +貢 +貧 +貨 +販 +貪 +貫 +責 +貯 +貰 +貴 +買 +貸 +費 +貼 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賎 +賑 +賓 +賛 +賜 +賞 +賠 +賢 +賣 +賤 +賦 +質 +賭 +購 +賽 +贄 +贅 +贈 +贋 +贔 +贖 +赤 +赦 +走 +赴 +起 +超 +越 +趙 +趣 +足 +趺 +趾 +跋 +跏 +距 +跡 +跨 +跪 +路 +跳 +践 +踊 +踏 +踐 +踞 +踪 +踵 +蹄 +蹉 +蹊 +蹟 +蹲 +蹴 +躅 +躇 +躊 +躍 +躑 +躙 +躪 +身 +躬 +躯 +躰 +車 +軋 +軌 +軍 +軒 +軟 +転 +軸 +軻 +軽 +軾 +較 +載 +輌 +輔 +輜 +輝 +輦 +輩 +輪 +輯 +輸 +輿 +轄 +轍 +轟 +轢 +辛 +辞 +辟 +辥 +辦 +辨 +辰 +辱 +農 +辺 +辻 +込 +迂 +迅 +迎 +近 +返 +迢 +迦 +迪 +迫 +迭 +述 +迷 +迹 +追 +退 +送 +逃 +逅 +逆 +逍 +透 +逐 +逓 +途 +逕 +逗 +這 +通 +逝 +逞 +速 +造 +逢 +連 +逮 +週 +進 +逸 +逼 +遁 +遂 +遅 +遇 +遊 +運 +遍 +過 +遐 +道 +達 +違 +遙 +遜 +遠 +遡 +遣 +遥 +適 +遭 +遮 +遯 +遵 +遷 +選 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邉 +邊 +邑 +那 +邦 +邨 +邪 +邯 +邵 +邸 +郁 +郊 +郎 +郡 +郢 +部 +郭 +郴 +郵 +郷 +都 +鄂 +鄙 +鄭 +鄰 +鄲 +酉 +酋 +酌 +配 +酎 +酒 +酔 +酢 +酥 +酪 +酬 +酵 +酷 +酸 +醍 +醐 +醒 +醗 +醜 +醤 +醪 +醵 +醸 +采 +釈 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釘 +釜 +針 +釣 +釧 +釿 +鈍 +鈎 +鈐 +鈔 +鈞 +鈦 +鈴 +鈷 +鈸 +鈿 +鉄 +鉇 +鉉 +鉋 +鉛 +鉢 +鉤 +鉦 +鉱 +鉾 +銀 +銃 +銅 +銈 +銑 +銕 +銘 +銚 +銜 +銭 +鋏 +鋒 +鋤 +鋭 +鋲 +鋳 +鋸 +鋺 +鋼 +錆 +錍 +錐 +錘 +錠 +錣 +錦 +錫 +錬 +錯 +録 +錵 +鍋 +鍍 +鍑 +鍔 +鍛 +鍬 +鍮 +鍵 +鍼 +鍾 +鎌 +鎖 +鎗 +鎚 +鎧 +鎬 +鎮 +鎰 +鎹 +鏃 +鏑 +鏡 +鐃 +鐇 +鐐 +鐔 +鐘 +鐙 +鐚 +鐡 +鐵 +鐸 +鑁 +鑊 +鑑 +鑒 +鑚 +鑠 +鑢 +鑰 +鑵 +鑷 +鑼 +鑽 +鑿 +長 +門 +閃 +閇 +閉 +開 +閏 +閑 +間 +閔 +閘 +関 +閣 +閤 +閥 +閦 +閨 +閬 +閲 +閻 +閼 +閾 +闇 +闍 +闔 +闕 +闘 +關 +闡 +闢 +闥 +阜 +阪 +阮 +阯 +防 +阻 +阿 +陀 +陂 +附 +陌 +降 +限 +陛 +陞 +院 +陣 +除 +陥 +陪 +陬 +陰 +陳 +陵 +陶 +陸 +険 +陽 +隅 +隆 +隈 +隊 +隋 +階 +随 +隔 +際 +障 +隠 +隣 +隧 +隷 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雊 +雋 +雌 +雍 +雑 +雖 +雙 +雛 +離 +難 +雨 +雪 +雫 +雰 +雲 +零 +雷 +雹 +電 +需 +震 +霊 +霍 +霖 +霜 +霞 +霧 +霰 +露 +靈 +青 +靖 +静 +靜 +非 +面 +革 +靫 +靭 +靱 +靴 +靺 +鞁 +鞄 +鞆 +鞋 +鞍 +鞏 +鞘 +鞠 +鞨 +鞭 +韋 +韓 +韜 +韮 +音 +韶 +韻 +響 +頁 +頂 +頃 +項 +順 +須 +頌 +預 +頑 +頒 +頓 +領 +頚 +頬 +頭 +頴 +頸 +頻 +頼 +顆 +題 +額 +顎 +顔 +顕 +顗 +願 +顛 +類 +顧 +顯 +風 +飛 +食 +飢 +飩 +飫 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餅 +餉 +養 +餌 +餐 +餓 +餘 +餝 +餡 +館 +饂 +饅 +饉 +饋 +饌 +饒 +饗 +首 +馗 +香 +馨 +馬 +馳 +馴 +駄 +駅 +駆 +駈 +駐 +駒 +駕 +駝 +駿 +騁 +騎 +騏 +騒 +験 +騙 +騨 +騰 +驕 +驚 +驛 +驢 +骨 +骸 +髄 +體 +高 +髙 +髢 +髪 +髭 +髮 +髷 +髻 +鬘 +鬚 +鬢 +鬨 +鬯 +鬱 +鬼 +魁 +魂 +魄 +魅 +魏 +魔 +魚 +魯 +鮎 +鮑 +鮒 +鮪 +鮫 +鮭 +鮮 +鯉 +鯔 +鯖 +鯛 +鯨 +鯰 +鯱 +鰐 +鰒 +鰭 +鰯 +鰰 +鰹 +鰻 +鱈 +鱒 +鱗 +鱧 +鳥 +鳩 +鳰 +鳳 +鳴 +鳶 +鴈 +鴉 +鴎 +鴛 +鴟 +鴦 +鴨 +鴫 +鴻 +鵄 +鵜 +鵞 +鵡 +鵬 +鵲 +鵺 +鶉 +鶏 +鶯 +鶴 +鷄 +鷙 +鷲 +鷹 +鷺 +鸚 +鸞 +鹸 +鹽 +鹿 +麁 +麒 +麓 +麗 +麝 +麞 +麟 +麦 +麩 +麹 +麺 +麻 +麾 +麿 +黄 +黌 +黍 +黒 +黙 +黛 +黠 +鼈 +鼉 +鼎 +鼓 +鼠 +鼻 +齊 +齋 +齟 +齢 +齬 +龍 +龕 +龗 +! +# +% +& +( +) ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +R +S +T +U +V +W +X +Z +a +c +d +e +f +h +i +j +k +l +m +n +o +p +r +s +t +u +y +z +~ +・ + diff --git a/batch_running_task/pytorchocr/utils/dict/ka_dict.txt b/batch_running_task/pytorchocr/utils/dict/ka_dict.txt new file mode 100644 index 0000000..33d605c --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ka_dict.txt @@ -0,0 +1,153 @@ +k +a +_ +i +m +g +/ +1 +2 +I +L +S +V +R +C +0 +v +l +6 +4 +8 +. 
+j +p +ಗ +ು +ಣ +ಪ +ಡ +ಿ +ಸ +ಲ +ಾ +ದ +್ +7 +5 +3 +ವ +ಷ +ಬ +ಹ +ೆ +9 +ಅ +ಳ +ನ +ರ +ಉ +ಕ +ಎ +ೇ +ಂ +ೈ +ೊ +ೀ +ಯ +ೋ +ತ +ಶ +ಭ +ಧ +ಚ +ಜ +ೂ +ಮ +ಒ +ೃ +ಥ +ಇ +ಟ +ಖ +ಆ +ಞ +ಫ +- +ಢ +ಊ +ಓ +ಐ +ಃ +ಘ +ಝ +ೌ +ಠ +ಛ +ಔ +ಏ +ಈ +ಋ +೨ +೦ +೧ +೮ +೯ +೪ +, +೫ +೭ +೩ +೬ +ಙ +s +c +e +n +w +o +u +t +d +E +A +T +B +Z +N +G +O +q +z +r +x +P +K +M +J +U +D +f +F +h +b +W +Y +y +H +X +Q +' +# +& +! +@ +$ +: +% +é +É +( +? ++ + diff --git a/batch_running_task/pytorchocr/utils/dict/kn_dict.txt b/batch_running_task/pytorchocr/utils/dict/kn_dict.txt new file mode 100644 index 0000000..33d605c --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/kn_dict.txt @@ -0,0 +1,153 @@ +k +a +_ +i +m +g +/ +1 +2 +I +L +S +V +R +C +0 +v +l +6 +4 +8 +. +j +p +ಗ +ು +ಣ +ಪ +ಡ +ಿ +ಸ +ಲ +ಾ +ದ +್ +7 +5 +3 +ವ +ಷ +ಬ +ಹ +ೆ +9 +ಅ +ಳ +ನ +ರ +ಉ +ಕ +ಎ +ೇ +ಂ +ೈ +ೊ +ೀ +ಯ +ೋ +ತ +ಶ +ಭ +ಧ +ಚ +ಜ +ೂ +ಮ +ಒ +ೃ +ಥ +ಇ +ಟ +ಖ +ಆ +ಞ +ಫ +- +ಢ +ಊ +ಓ +ಐ +ಃ +ಘ +ಝ +ೌ +ಠ +ಛ +ಔ +ಏ +ಈ +ಋ +೨ +೦ +೧ +೮ +೯ +೪ +, +೫ +೭ +೩ +೬ +ಙ +s +c +e +n +w +o +u +t +d +E +A +T +B +Z +N +G +O +q +z +r +x +P +K +M +J +U +D +f +F +h +b +W +Y +y +H +X +Q +' +# +& +! +@ +$ +: +% +é +É +( +? ++ + diff --git a/batch_running_task/pytorchocr/utils/dict/korean_dict.txt b/batch_running_task/pytorchocr/utils/dict/korean_dict.txt new file mode 100644 index 0000000..a13899f --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/korean_dict.txt @@ -0,0 +1,3688 @@ +! +" +# +$ +% +& +' +* ++ +- +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +© +° +² +½ +Á +Ä +Å +Ç +É +Í +Î +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ě +ğ +ī +İ +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ź +ż +Ž +ž +Ș +ș +Α +Δ +α +λ +φ +Г +О +а +в +л +о +р +с +т +я +​ +’ +“ +” +→ +∇ +∼ +「 +」 +ア +カ +グ +ニ +ラ +ン +ㄱ +ㄴ +ㄷ +ㄸ +ㄹ +ㅂ +ㅅ +ㅆ +ㅇ +ㅈ +ㅊ +ㅋ +ㅌ +ㅎ +ㅓ +ㅜ +ㅣ +一 +丁 +七 +三 +上 +下 +不 +丑 +世 +丘 +丞 +中 +丸 +丹 +主 +乃 +久 +之 +乎 +乘 +九 +也 +乳 +乾 +事 +二 +云 +互 +五 +井 +亞 +亡 +交 +亥 +亨 +享 +京 +亭 +人 +仁 +今 +他 +仙 +代 +令 +以 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +伯 +伴 +伸 +佃 +佈 +位 +低 +住 +佐 +何 +佛 +作 +使 +來 +供 +依 +侯 +侵 +侶 +便 +俗 +保 +俠 +信 +修 +俱 +俳 +倉 +個 +倍 +倒 +候 +借 +値 +倫 +倭 +假 +偈 +偉 +偏 +停 +偶 +傅 +傑 +傳 +傷 +傾 +像 +僞 +僥 +僧 +價 +儀 +儉 +儒 +優 +儼 +兀 +允 +元 +兆 +先 +光 +克 +兒 +入 +內 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +再 +冠 +冥 +冶 +准 +凞 +凡 +凱 +出 +函 +刀 +分 +刊 +刑 +列 +初 +判 +別 +利 +到 +制 +券 +刺 +刻 +則 +前 +剛 +副 +創 +劃 +劑 +力 +功 +加 +劣 +助 +劫 +勇 +動 +務 +勝 +勢 +勳 +勸 +匈 +化 +北 +匠 +區 +十 +千 +午 +半 +卍 +卑 +卒 +卓 +南 +博 +卜 +占 +卦 +印 +危 +卵 +卷 +卽 +卿 +厄 +原 +厦 +去 +參 +又 +叉 +友 +反 +叔 +受 +口 +古 +句 +可 +台 +史 +右 +司 +各 +合 +吉 +同 +名 +后 +吏 +吐 +君 +吠 +吳 +呂 +告 +周 +味 +呵 +命 +和 +咳 +咸 +咽 +哀 +品 +哨 +哮 +哲 +唐 +唯 +唱 +商 +問 +啼 +善 +喆 +喉 +喜 +喩 +喪 +嘗 +器 +嚴 +囊 +四 +回 +因 +困 +固 +圈 +國 +圍 +園 +圓 +圖 +團 +土 +在 +地 +均 +坊 +坐 +坑 +坵 +型 +垢 +城 +域 +埴 +執 +培 +基 +堂 +堅 +堆 +堤 +堯 +報 +場 +塔 +塚 +塞 +塵 +境 +墜 +墟 +墨 +墳 +墾 +壁 +壇 +壓 +壤 +士 +壬 +壯 +壺 +壽 +夏 +夕 +外 +多 +夜 +夢 +大 +天 +太 +夫 +央 +失 +夷 +奄 +奇 +奉 +奎 +奏 +契 +奔 +奮 +女 +奴 +好 +如 +妄 +妊 +妖 +妙 +始 +姑 +姓 +姚 +姜 +威 +婆 +婚 +婦 +媒 +媚 +子 +孔 +字 +存 +孝 +孟 +季 +孤 +孫 +學 +孺 +宇 +守 +安 +宋 +宗 +官 +宙 +定 +客 +宣 +室 +宮 +害 +家 +容 +寂 +寃 +寄 +寅 +密 +寇 +富 +寒 +寓 +實 +審 +寫 +寬 +寶 +寸 +寺 +封 +將 +專 +尊 +對 +小 +少 +尙 +尹 +尼 +尿 +局 +居 +屈 +屋 +屍 +屎 +屛 +層 +屬 +山 +岐 +岡 +岩 +岳 +岸 +峙 +峰 +島 +峻 +峽 +崇 +崔 +崖 +崩 +嶋 +巖 +川 +州 +巢 +工 +左 +巧 +巨 +巫 +差 +己 +巷 +市 +布 +帝 +師 +帶 +常 +帽 +幕 +干 +平 +年 +幹 +幻 +幼 +幽 +庇 +序 +店 +府 +度 +座 +庫 +庭 +康 +廟 +廣 +廳 +延 +廷 +建 +廻 +弁 +式 +弑 +弓 +引 +弘 +弟 +弱 +張 +强 +弼 +彌 +彛 +形 +彬 +影 +役 +彼 +彿 +往 +征 +待 +律 +後 +徐 +徑 +得 +從 +循 +微 +德 +徹 +心 +必 +忌 
+忍 +志 +忠 +思 +怡 +急 +性 +恐 +恒 +恨 +恩 +悅 +悖 +患 +悲 +情 +惑 +惟 +惠 +惡 +想 +惺 +愁 +意 +愚 +愛 +感 +愼 +慈 +態 +慕 +慣 +慧 +慾 +憂 +憤 +憺 +應 +懸 +戎 +成 +我 +戟 +戮 +戰 +戴 +戶 +房 +所 +手 +才 +打 +批 +承 +技 +抄 +把 +抗 +抱 +抽 +拇 +拓 +拘 +拙 +拜 +拾 +持 +指 +捌 +捨 +捿 +授 +掌 +排 +接 +推 +提 +揚 +揭 +援 +損 +搗 +摩 +播 +操 +擒 +擔 +擘 +據 +擧 +攘 +攝 +攬 +支 +改 +攻 +放 +政 +故 +敍 +敎 +救 +敗 +散 +敬 +整 +數 +文 +斗 +料 +斛 +斜 +斧 +斯 +新 +斷 +方 +於 +施 +旋 +族 +旗 +日 +旨 +早 +旱 +昌 +明 +易 +昔 +星 +春 +昧 +昭 +是 +時 +晉 +晋 +晩 +普 +景 +晴 +晶 +智 +暈 +暑 +暗 +暘 +曉 +曜 +曠 +曦 +曰 +曲 +書 +曹 +曼 +曾 +最 +會 +月 +有 +朋 +服 +望 +朝 +期 +木 +未 +末 +本 +朱 +朴 +李 +材 +村 +杖 +杜 +杞 +杭 +杯 +東 +松 +板 +林 +果 +枝 +枯 +枰 +枾 +柏 +柑 +柱 +栗 +校 +栢 +核 +根 +格 +桀 +桂 +案 +桎 +桑 +桓 +桔 +梁 +梏 +梓 +梗 +條 +梨 +梵 +棗 +棟 +森 +植 +椒 +楊 +楓 +楚 +業 +楮 +極 +榮 +槃 +槍 +樂 +樓 +樗 +樣 +樸 +樹 +樺 +樽 +橄 +橋 +橘 +機 +橡 +檀 +檎 +權 +欌 +欖 +次 +欲 +歌 +歐 +止 +正 +此 +步 +武 +歲 +歸 +死 +殖 +段 +殷 +殺 +殿 +毅 +母 +毒 +比 +毛 +氏 +民 +氣 +水 +永 +求 +汎 +汗 +江 +池 +沅 +沒 +沖 +沙 +沛 +河 +油 +治 +沼 +沿 +泉 +泊 +法 +泗 +泡 +波 +注 +泰 +洋 +洙 +洛 +洞 +津 +洲 +活 +派 +流 +浅 +浦 +浮 +浴 +海 +涅 +涇 +消 +涌 +液 +淑 +淡 +淨 +淫 +深 +淳 +淵 +淸 +渠 +渡 +游 +渾 +湖 +湯 +源 +溪 +溫 +溶 +滄 +滅 +滋 +滯 +滿 +漁 +漆 +漢 +漫 +漸 +潑 +潤 +潭 +澄 +澎 +澤 +澳 +澹 +濁 +濕 +濟 +濤 +濯 +瀋 +瀝 +灣 +火 +灰 +灸 +災 +炎 +炭 +点 +烈 +烏 +烙 +焚 +無 +焦 +然 +煌 +煎 +照 +煬 +煮 +熟 +熱 +燁 +燈 +燔 +燕 +燥 +燧 +燮 +爲 +爵 +父 +片 +版 +牌 +牛 +牝 +牟 +牡 +物 +特 +犧 +犬 +狀 +狗 +猥 +猩 +猪 +獨 +獵 +獸 +獻 +玄 +玉 +王 +玲 +珍 +珠 +珪 +班 +現 +球 +理 +琴 +瑞 +瑟 +瑪 +璃 +璋 +璽 +瓜 +瓦 +甑 +甘 +生 +産 +用 +甫 +田 +由 +甲 +申 +男 +界 +畏 +留 +畜 +畢 +略 +番 +異 +畵 +當 +畸 +疏 +疑 +疫 +疹 +疼 +病 +症 +痔 +痛 +痺 +瘀 +瘍 +瘡 +療 +癌 +癖 +登 +發 +白 +百 +的 +皆 +皇 +皮 +盂 +盆 +益 +盛 +盜 +盟 +盡 +盤 +盧 +目 +直 +相 +省 +看 +眞 +眼 +睡 +督 +瞋 +矢 +矣 +知 +短 +石 +破 +碍 +碑 +磁 +磨 +磬 +示 +社 +祇 +祖 +祝 +神 +祥 +祭 +祺 +禁 +禅 +禍 +福 +禦 +禪 +禮 +禹 +禽 +禾 +秀 +私 +秉 +秋 +科 +秘 +秤 +秦 +秩 +移 +稀 +稗 +種 +稱 +稷 +稼 +稽 +穀 +穆 +積 +空 +窮 +竅 +立 +章 +童 +竭 +端 +竹 +笑 +符 +第 +筆 +等 +筍 +答 +策 +箋 +箕 +管 +箱 +節 +篇 +簡 +米 +粉 +粘 +粥 +精 +糖 +糞 +系 +紀 +紂 +約 +紅 +紋 +純 +紙 +級 +素 +索 +紫 +紬 +累 +細 +紳 +終 +組 +結 +絡 +統 +絲 +絶 +絹 +經 +綠 +維 +綱 +網 +綸 +綽 +緖 +線 +緣 +緯 +縣 +縱 +總 +織 +繡 +繩 +繪 +繭 +纂 +續 +罕 +置 +罰 +羅 +羊 +美 +群 +義 +羽 +翁 +習 +翟 +老 +考 +者 +而 +耐 +耕 +耳 +聃 +聖 +聞 +聰 +聲 +職 +肇 +肉 +肖 +肝 +股 +肥 +育 +肺 +胃 +胎 +胚 +胞 +胡 +胥 +能 +脂 +脈 +脚 +脛 +脣 +脩 +脫 +脯 +脾 +腋 +腎 +腫 +腸 +腹 +膜 +膠 +膨 +膽 +臆 +臟 +臣 +臥 +臨 +自 +至 +致 +臺 +臼 +臾 +與 +興 +舊 +舌 +舍 +舒 +舜 +舟 +般 +船 +艦 +良 +色 +芋 +花 +芳 +芽 +苑 +苔 +苕 +苛 +苞 +若 +苦 +英 +茂 +茵 +茶 +茹 +荀 +荇 +草 +荒 +荷 +莊 +莫 +菊 +菌 +菜 +菩 +菫 +華 +菴 +菽 +萊 +萍 +萬 +落 +葉 +著 +葛 +董 +葬 +蒙 +蒜 +蒲 +蒸 +蒿 +蓮 +蔓 +蔘 +蔡 +蔬 +蕃 +蕉 +蕓 +薄 +薑 +薛 +薩 +薪 +薺 +藏 +藝 +藤 +藥 +藩 +藻 +蘆 +蘇 +蘊 +蘚 +蘭 +虎 +處 +虛 +虞 +虹 +蜀 +蜂 +蜜 +蝕 +蝶 +融 +蟬 +蟲 +蠶 +蠻 +血 +衆 +行 +術 +衛 +衡 +衣 +表 +袁 +裔 +裕 +裙 +補 +製 +複 +襄 +西 +要 +見 +視 +親 +覺 +觀 +角 +解 +言 +訂 +訊 +訓 +託 +記 +訣 +設 +診 +註 +評 +詩 +話 +詵 +誅 +誌 +認 +誕 +語 +誠 +誤 +誥 +誦 +說 +調 +談 +諍 +論 +諡 +諫 +諭 +諸 +謙 +講 +謝 +謠 +證 +識 +譚 +譜 +譯 +議 +護 +讀 +變 +谷 +豆 +豊 +豚 +象 +豪 +豫 +貝 +貞 +財 +貧 +貨 +貪 +貫 +貴 +貸 +費 +資 +賊 +賓 +賞 +賢 +賣 +賦 +質 +贍 +赤 +赫 +走 +起 +超 +越 +趙 +趣 +趨 +足 +趾 +跋 +跡 +路 +踏 +蹟 +身 +躬 +車 +軍 +軒 +軟 +載 +輓 +輕 +輪 +輯 +輸 +輻 +輿 +轅 +轉 +辨 +辭 +辯 +辰 +農 +近 +迦 +述 +追 +逆 +透 +逐 +通 +逝 +造 +逢 +連 +進 +逵 +遂 +遊 +運 +遍 +過 +道 +達 +遠 +遡 +適 +遷 +選 +遺 +遽 +還 +邊 +邑 +那 +邪 +郞 +郡 +部 +都 +鄒 +鄕 +鄭 +鄲 +配 +酒 +酸 +醉 +醫 +醯 +釋 +里 +重 +野 +量 +釐 +金 +針 +鈍 +鈴 +鉞 +銀 +銅 +銘 +鋼 +錄 +錢 +錦 +鎭 +鏡 +鐘 +鐵 +鑑 +鑛 +長 +門 +閃 +開 +間 +閔 +閣 +閥 +閭 +閻 +闕 +關 +阪 +防 +阿 +陀 +降 +限 +陝 +院 +陰 +陳 +陵 +陶 +陸 +陽 +隆 +隊 +隋 +階 +際 +障 +隣 +隨 +隱 +隷 +雀 +雄 +雅 +集 +雇 +雌 +雖 +雙 +雜 +離 +難 +雨 +雪 +雲 +電 +霜 +露 +靈 +靑 +靖 +靜 +非 +面 +革 +靴 +鞏 +韓 +音 +韶 +韻 +順 +須 +頊 +頌 +領 +頭 +顔 +願 +顚 +類 +顯 +風 +飛 +食 +飢 +飮 +飯 +飾 +養 +餓 +餘 +首 +香 +馨 +馬 +駒 +騫 +騷 +驕 +骨 +骸 +髓 +體 +高 +髥 +髮 +鬪 +鬱 +鬼 +魏 +魔 +魚 +魯 +鮮 +鰍 +鰐 +鳥 +鳧 +鳳 +鴨 +鵲 +鶴 +鷄 +鷹 +鹽 +鹿 +麗 +麥 +麻 +黃 +黑 +默 +點 +黨 +鼎 +齊 +齋 +齒 +龍 +龜 +가 +각 +간 +갇 +갈 +갉 +감 +갑 +값 +갓 +갔 +강 +갖 +갗 +같 +갚 +갛 +개 +객 +갠 +갤 +갬 +갭 +갯 +갰 +갱 +갸 +걀 +걔 +걘 +거 +걱 +건 +걷 +걸 +검 +겁 +것 +겄 +겅 +겆 +겉 +겊 +겋 +게 +겐 +겔 +겟 +겠 +겡 +겨 +격 +겪 +견 +결 +겸 +겹 +겻 
+겼 +경 +곁 +계 +곕 +곗 +고 +곡 +곤 +곧 +골 +곪 +곬 +곯 +곰 +곱 +곳 +공 +곶 +과 +곽 +관 +괄 +괌 +광 +괘 +괜 +괭 +괴 +괸 +굉 +교 +구 +국 +군 +굳 +굴 +굵 +굶 +굼 +굽 +굿 +궁 +궂 +궈 +권 +궐 +궜 +궝 +궤 +귀 +귄 +귈 +귓 +규 +균 +귤 +그 +극 +근 +글 +긁 +금 +급 +긋 +긍 +기 +긴 +길 +김 +깁 +깃 +깅 +깊 +까 +깍 +깎 +깐 +깔 +깜 +깝 +깟 +깡 +깥 +깨 +깬 +깰 +깻 +깼 +깽 +꺄 +꺼 +꺽 +꺾 +껀 +껄 +껌 +껍 +껏 +껐 +껑 +께 +껴 +꼈 +꼍 +꼐 +꼬 +꼭 +꼴 +꼼 +꼽 +꼿 +꽁 +꽂 +꽃 +꽉 +꽝 +꽤 +꽥 +꾀 +꾜 +꾸 +꾹 +꾼 +꿀 +꿇 +꿈 +꿉 +꿋 +꿍 +꿎 +꿔 +꿨 +꿩 +꿰 +꿴 +뀄 +뀌 +뀐 +뀔 +뀜 +뀝 +끄 +끈 +끊 +끌 +끓 +끔 +끕 +끗 +끙 +끝 +끼 +끽 +낀 +낄 +낌 +낍 +낏 +낑 +나 +낙 +낚 +난 +낟 +날 +낡 +남 +납 +낫 +났 +낭 +낮 +낯 +낱 +낳 +내 +낵 +낸 +낼 +냄 +냅 +냇 +냈 +냉 +냐 +냔 +냘 +냥 +너 +넉 +넋 +넌 +널 +넓 +넘 +넙 +넛 +넜 +넝 +넣 +네 +넥 +넨 +넬 +넴 +넵 +넷 +넸 +넹 +녀 +녁 +년 +념 +녔 +녕 +녘 +녜 +노 +녹 +논 +놀 +놈 +놋 +농 +높 +놓 +놔 +놨 +뇌 +뇨 +뇩 +뇽 +누 +눅 +눈 +눌 +눔 +눕 +눗 +눠 +눴 +뉘 +뉜 +뉩 +뉴 +늄 +늅 +늉 +느 +늑 +는 +늘 +늙 +늠 +늡 +능 +늦 +늪 +늬 +니 +닉 +닌 +닐 +님 +닙 +닛 +닝 +닢 +다 +닥 +닦 +단 +닫 +달 +닭 +닮 +닯 +닳 +담 +답 +닷 +당 +닻 +닿 +대 +댁 +댄 +댈 +댐 +댑 +댓 +댔 +댕 +댜 +더 +덕 +덖 +던 +덜 +덟 +덤 +덥 +덧 +덩 +덫 +덮 +데 +덱 +덴 +델 +뎀 +뎃 +뎅 +뎌 +뎠 +뎨 +도 +독 +돈 +돋 +돌 +돔 +돕 +돗 +동 +돛 +돝 +돼 +됐 +되 +된 +될 +됨 +됩 +됴 +두 +둑 +둔 +둘 +둠 +둡 +둣 +둥 +둬 +뒀 +뒤 +뒬 +뒷 +뒹 +듀 +듈 +듐 +드 +득 +든 +듣 +들 +듦 +듬 +듭 +듯 +등 +듸 +디 +딕 +딘 +딛 +딜 +딤 +딥 +딧 +딨 +딩 +딪 +따 +딱 +딴 +딸 +땀 +땄 +땅 +때 +땐 +땔 +땜 +땝 +땠 +땡 +떠 +떡 +떤 +떨 +떫 +떰 +떱 +떳 +떴 +떵 +떻 +떼 +떽 +뗀 +뗄 +뗍 +뗏 +뗐 +뗑 +또 +똑 +똘 +똥 +뙤 +뚜 +뚝 +뚤 +뚫 +뚱 +뛰 +뛴 +뛸 +뜀 +뜁 +뜨 +뜩 +뜬 +뜯 +뜰 +뜸 +뜻 +띄 +띈 +띌 +띔 +띕 +띠 +띤 +띨 +띱 +띵 +라 +락 +란 +랄 +람 +랍 +랏 +랐 +랑 +랒 +랗 +래 +랙 +랜 +랠 +램 +랩 +랫 +랬 +랭 +랴 +략 +량 +러 +럭 +런 +럴 +럼 +럽 +럿 +렀 +렁 +렇 +레 +렉 +렌 +렐 +렘 +렙 +렛 +렝 +려 +력 +련 +렬 +렴 +렵 +렷 +렸 +령 +례 +로 +록 +론 +롤 +롬 +롭 +롯 +롱 +롸 +롹 +뢰 +뢴 +뢸 +룃 +료 +룐 +룡 +루 +룩 +룬 +룰 +룸 +룹 +룻 +룽 +뤄 +뤘 +뤼 +류 +륙 +륜 +률 +륨 +륭 +르 +륵 +른 +를 +름 +릅 +릇 +릉 +릎 +리 +릭 +린 +릴 +림 +립 +릿 +링 +마 +막 +만 +많 +맏 +말 +맑 +맘 +맙 +맛 +망 +맞 +맡 +맣 +매 +맥 +맨 +맬 +맴 +맵 +맷 +맸 +맹 +맺 +먀 +먁 +머 +먹 +먼 +멀 +멈 +멋 +멍 +멎 +메 +멕 +멘 +멜 +멤 +멥 +멧 +멩 +며 +멱 +면 +멸 +몄 +명 +몇 +모 +목 +몫 +몬 +몰 +몸 +몹 +못 +몽 +뫼 +묘 +무 +묵 +묶 +문 +묻 +물 +묽 +뭄 +뭅 +뭇 +뭉 +뭍 +뭏 +뭐 +뭔 +뭘 +뭡 +뭣 +뮈 +뮌 +뮐 +뮤 +뮬 +므 +믈 +믐 +미 +믹 +민 +믿 +밀 +밈 +밉 +밋 +밌 +밍 +및 +밑 +바 +박 +밖 +반 +받 +발 +밝 +밟 +밤 +밥 +밧 +방 +밭 +배 +백 +밴 +밸 +뱀 +뱁 +뱃 +뱄 +뱅 +뱉 +뱍 +뱐 +버 +벅 +번 +벌 +범 +법 +벗 +벙 +벚 +베 +벡 +벤 +벨 +벰 +벱 +벳 +벵 +벼 +벽 +변 +별 +볍 +볏 +볐 +병 +볕 +보 +복 +볶 +본 +볼 +봄 +봅 +봇 +봉 +봐 +봤 +뵈 +뵐 +뵙 +부 +북 +분 +붇 +불 +붉 +붐 +붓 +붕 +붙 +뷔 +뷰 +뷴 +뷸 +브 +븐 +블 +비 +빅 +빈 +빌 +빔 +빕 +빗 +빙 +빚 +빛 +빠 +빡 +빤 +빨 +빳 +빴 +빵 +빻 +빼 +빽 +뺀 +뺄 +뺌 +뺏 +뺐 +뺑 +뺨 +뻐 +뻑 +뻔 +뻗 +뻘 +뻣 +뻤 +뻥 +뻬 +뼈 +뼉 +뼘 +뽀 +뽈 +뽐 +뽑 +뽕 +뾰 +뿌 +뿍 +뿐 +뿔 +뿜 +쁘 +쁜 +쁠 +쁨 +삐 +삔 +삘 +사 +삭 +삯 +산 +살 +삵 +삶 +삼 +삽 +삿 +샀 +상 +샅 +새 +색 +샌 +샐 +샘 +샙 +샛 +샜 +생 +샤 +샨 +샬 +샴 +샵 +샷 +샹 +서 +석 +섞 +선 +섣 +설 +섬 +섭 +섯 +섰 +성 +섶 +세 +섹 +센 +셀 +셈 +셉 +셋 +셌 +셍 +셔 +션 +셜 +셨 +셰 +셴 +셸 +소 +속 +손 +솔 +솜 +솝 +솟 +송 +솥 +쇄 +쇠 +쇤 +쇳 +쇼 +숀 +숄 +숍 +수 +숙 +순 +숟 +술 +숨 +숩 +숫 +숭 +숯 +숱 +숲 +숴 +쉐 +쉘 +쉬 +쉭 +쉰 +쉴 +쉼 +쉽 +슈 +슐 +슘 +슛 +슝 +스 +슥 +슨 +슬 +슭 +슴 +습 +슷 +승 +시 +식 +신 +싣 +실 +싫 +심 +십 +싯 +싱 +싶 +싸 +싹 +싼 +쌀 +쌈 +쌉 +쌌 +쌍 +쌓 +쌔 +쌘 +쌩 +써 +썩 +썬 +썰 +썸 +썹 +썼 +썽 +쎄 +쎈 +쏘 +쏙 +쏜 +쏟 +쏠 +쏭 +쏴 +쐈 +쐐 +쐬 +쑤 +쑥 +쑨 +쒀 +쒔 +쓰 +쓱 +쓴 +쓸 +씀 +씁 +씌 +씨 +씩 +씬 +씰 +씸 +씹 +씻 +씽 +아 +악 +안 +앉 +않 +알 +앎 +앓 +암 +압 +앗 +았 +앙 +앞 +애 +액 +앤 +앨 +앰 +앱 +앳 +앴 +앵 +야 +약 +얀 +얄 +얇 +얌 +얍 +얏 +양 +얕 +얗 +얘 +얜 +어 +억 +언 +얹 +얻 +얼 +얽 +엄 +업 +없 +엇 +었 +엉 +엊 +엌 +엎 +에 +엑 +엔 +엘 +엠 +엡 +엣 +엥 +여 +역 +엮 +연 +열 +엷 +염 +엽 +엾 +엿 +였 +영 +옅 +옆 +옇 +예 +옌 +옐 +옙 +옛 +오 +옥 +온 +올 +옭 +옮 +옳 +옴 +옵 +옷 +옹 +옻 +와 +왁 +완 +왈 +왑 +왓 +왔 +왕 +왜 +왠 +왱 +외 +왼 +요 +욕 +욘 +욜 +욤 +용 +우 +욱 +운 +울 +움 +웁 +웃 +웅 +워 +웍 +원 +월 +웜 +웠 +웡 +웨 +웬 +웰 +웸 +웹 +위 +윅 +윈 +윌 +윔 +윗 +윙 +유 +육 +윤 +율 +윱 +윳 +융 +으 +윽 +은 +을 +읊 +음 +읍 +응 +의 +읜 +읠 +이 +익 +인 +일 +읽 +잃 +임 +입 +잇 +있 +잉 +잊 +잎 +자 +작 +잔 +잖 +잘 +잠 +잡 +잣 +잤 +장 +잦 +재 +잭 +잰 +잴 +잽 +잿 +쟀 +쟁 +쟈 +쟉 +쟤 +저 +적 +전 +절 +젊 +점 +접 +젓 +정 +젖 +제 +젝 +젠 +젤 +젬 +젭 +젯 +져 +젼 +졀 +졌 +졍 +조 +족 +존 +졸 +좀 +좁 +종 +좇 +좋 +좌 +좍 +좽 +죄 +죠 +죤 +주 +죽 +준 +줄 +줌 +줍 
+줏 +중 +줘 +줬 +쥐 +쥔 +쥘 +쥬 +쥴 +즈 +즉 +즌 +즐 +즘 +즙 +증 +지 +직 +진 +짇 +질 +짊 +짐 +집 +짓 +징 +짖 +짙 +짚 +짜 +짝 +짠 +짢 +짤 +짧 +짬 +짭 +짰 +짱 +째 +짹 +짼 +쨀 +쨉 +쨋 +쨌 +쨍 +쩄 +쩌 +쩍 +쩐 +쩔 +쩜 +쩝 +쩡 +쩨 +쪄 +쪘 +쪼 +쪽 +쪾 +쫀 +쫄 +쫑 +쫓 +쫙 +쬐 +쭈 +쭉 +쭐 +쭙 +쯔 +쯤 +쯧 +찌 +찍 +찐 +찔 +찜 +찝 +찡 +찢 +찧 +차 +착 +찬 +찮 +찰 +참 +찹 +찻 +찼 +창 +찾 +채 +책 +챈 +챌 +챔 +챕 +챗 +챘 +챙 +챠 +챤 +처 +척 +천 +철 +첨 +첩 +첫 +청 +체 +첵 +첸 +첼 +쳄 +쳇 +쳉 +쳐 +쳔 +쳤 +초 +촉 +촌 +촘 +촛 +총 +촨 +촬 +최 +쵸 +추 +축 +춘 +출 +춤 +춥 +춧 +충 +춰 +췄 +췌 +취 +췬 +츄 +츠 +측 +츨 +츰 +층 +치 +칙 +친 +칠 +칡 +침 +칩 +칫 +칭 +카 +칵 +칸 +칼 +캄 +캅 +캇 +캉 +캐 +캔 +캘 +캠 +캡 +캣 +캤 +캥 +캬 +커 +컥 +컨 +컫 +컬 +컴 +컵 +컷 +컸 +컹 +케 +켄 +켈 +켐 +켓 +켕 +켜 +켠 +켤 +켭 +켯 +켰 +코 +콕 +콘 +콜 +콤 +콥 +콧 +콩 +콰 +콱 +콴 +콸 +쾅 +쾌 +쾡 +쾨 +쾰 +쿄 +쿠 +쿡 +쿤 +쿨 +쿰 +쿵 +쿼 +퀀 +퀄 +퀘 +퀭 +퀴 +퀵 +퀸 +퀼 +큐 +큘 +크 +큰 +클 +큼 +큽 +키 +킥 +킨 +킬 +킴 +킵 +킷 +킹 +타 +탁 +탄 +탈 +탉 +탐 +탑 +탓 +탔 +탕 +태 +택 +탠 +탤 +탬 +탭 +탯 +탰 +탱 +터 +턱 +턴 +털 +텀 +텁 +텃 +텄 +텅 +테 +텍 +텐 +텔 +템 +텝 +텡 +텨 +톈 +토 +톡 +톤 +톨 +톰 +톱 +톳 +통 +퇴 +툇 +투 +툭 +툰 +툴 +툼 +퉁 +퉈 +퉜 +튀 +튄 +튈 +튕 +튜 +튠 +튤 +튬 +트 +특 +튼 +튿 +틀 +틈 +틉 +틋 +틔 +티 +틱 +틴 +틸 +팀 +팁 +팅 +파 +팍 +팎 +판 +팔 +팜 +팝 +팟 +팠 +팡 +팥 +패 +팩 +팬 +팰 +팸 +팻 +팼 +팽 +퍼 +퍽 +펀 +펄 +펌 +펍 +펐 +펑 +페 +펙 +펜 +펠 +펨 +펩 +펫 +펭 +펴 +편 +펼 +폄 +폈 +평 +폐 +포 +폭 +폰 +폴 +폼 +폿 +퐁 +표 +푭 +푸 +푹 +푼 +풀 +품 +풋 +풍 +퓨 +퓬 +퓰 +퓸 +프 +픈 +플 +픔 +픕 +피 +픽 +핀 +필 +핌 +핍 +핏 +핑 +하 +학 +한 +할 +핥 +함 +합 +핫 +항 +해 +핵 +핸 +핼 +햄 +햅 +햇 +했 +행 +햐 +향 +헀 +허 +헉 +헌 +헐 +험 +헙 +헛 +헝 +헤 +헥 +헨 +헬 +헴 +헵 +헷 +헹 +혀 +혁 +현 +혈 +혐 +협 +혓 +혔 +형 +혜 +호 +혹 +혼 +홀 +홈 +홉 +홋 +홍 +홑 +화 +확 +환 +활 +홧 +황 +홰 +홱 +횃 +회 +획 +횝 +횟 +횡 +효 +후 +훅 +훈 +훌 +훑 +훔 +훗 +훤 +훨 +훼 +휄 +휑 +휘 +휙 +휜 +휠 +휩 +휭 +휴 +휼 +흄 +흉 +흐 +흑 +흔 +흘 +흙 +흠 +흡 +흣 +흥 +흩 +희 +흰 +흽 +히 +힉 +힌 +힐 +힘 +힙 +힝 +車 +滑 +金 +奈 +羅 +洛 +卵 +欄 +蘭 +郎 +來 +盧 +老 +魯 +綠 +鹿 +論 +雷 +樓 +縷 +凌 +樂 +不 +參 +葉 +沈 +若 +兩 +凉 +梁 +呂 +女 +廬 +麗 +黎 +曆 +歷 +戀 +蓮 +連 +列 +烈 +裂 +念 +獵 +靈 +領 +例 +禮 +醴 +惡 +尿 +料 +遼 +龍 +暈 +柳 +流 +類 +六 +陸 +倫 +律 +栗 +利 +李 +梨 +理 +離 +燐 +林 +臨 +立 +茶 +切 +宅 + diff --git a/batch_running_task/pytorchocr/utils/dict/latex_symbol_dict.txt b/batch_running_task/pytorchocr/utils/dict/latex_symbol_dict.txt new file mode 100644 index 0000000..d17f2c2 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/latex_symbol_dict.txt @@ -0,0 +1,111 @@ +eos +sos +! +' +( +) ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +< += +> +A +B +C +E +F +G +H +I +L +M +N +P +R +S +T +V +X +Y +[ +\Delta +\alpha +\beta +\cdot +\cdots +\cos +\div +\exists +\forall +\frac +\gamma +\geq +\in +\infty +\int +\lambda +\ldots +\leq +\lim +\log +\mu +\neq +\phi +\pi +\pm +\prime +\rightarrow +\sigma +\sin +\sqrt +\sum +\tan +\theta +\times +] +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +\{ +| +\} +{ +} +^ +_ \ No newline at end of file diff --git a/batch_running_task/pytorchocr/utils/dict/latin_dict.txt b/batch_running_task/pytorchocr/utils/dict/latin_dict.txt new file mode 100644 index 0000000..e166bf3 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/latin_dict.txt @@ -0,0 +1,185 @@ + +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +} +¡ +£ +§ +ª +« +­ +° +² +³ +´ +µ +· +º +» +¿ +À +Á + +Ä +Å +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ò +Ó +Ô +Õ +Ö +Ú +Ü +Ý +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ñ +ò +ó +ô +õ +ö +ø +ù +ú +û +ü +ý +ą +Ć +ć +Č +č +Đ +đ +ę +ı +Ł +ł +ō +Œ +œ +Š +š +Ÿ +Ž +ž +ʒ +β +δ +ε +з +Ṡ +‘ +€ +™ diff --git a/batch_running_task/pytorchocr/utils/dict/mr_dict.txt b/batch_running_task/pytorchocr/utils/dict/mr_dict.txt new file mode 100644 index 0000000..283b150 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/mr_dict.txt @@ -0,0 +1,153 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ऱ +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +० +१ +२ +३ +४ +५ +६ +७ +८ +९ diff --git a/batch_running_task/pytorchocr/utils/dict/ne_dict.txt b/batch_running_task/pytorchocr/utils/dict/ne_dict.txt new file mode 100644 index 0000000..5a7df95 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ne_dict.txt @@ -0,0 +1,153 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +ऩ +प +फ +ब +भ +म +य +र +ऱ +ल +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +े +ै +ो +ौ +् +॒ +ॠ +। +० +१ +२ +३ +४ +५ +६ +७ +८ +९ diff --git a/batch_running_task/pytorchocr/utils/dict/oc_dict.txt b/batch_running_task/pytorchocr/utils/dict/oc_dict.txt new file mode 100644 index 0000000..e88af8b --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/oc_dict.txt @@ -0,0 +1,96 @@ +o +c +_ +i +m +g +/ +2 +0 +I +L +S +V +R +C +1 +v +a +l +4 +3 +. +j +p +r +e +è +t +9 +7 +5 +8 +n +' +b +s +6 +q +u +á +d +ò +à +h +z +f +ï +í +A +ç +x +ó +é +P +O +Ò +ü +k +À +F +- +ú +­ +æ +Á +D +E +w +K +T +N +y +U +Z +G +B +J +H +M +W +Y +X +Q +% +$ +, +@ +& +! +: +( +# +? ++ +É + diff --git a/batch_running_task/pytorchocr/utils/dict/pt_dict.txt b/batch_running_task/pytorchocr/utils/dict/pt_dict.txt new file mode 100644 index 0000000..9500fae --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/pt_dict.txt @@ -0,0 +1,130 @@ +p +u +_ +i +m +g +/ +8 +I +L +S +V +R +C +2 +0 +1 +v +a +l +6 +7 +4 +5 +. +j + +q +e +s +t +ã +o +x +9 +c +n +r +z +ç +õ +3 +A +U +d +º +ô +­ +, +E +; +ó +á +b +D +? +ú +ê +- +h +P +f +à +N +í +O +M +G +É +é +â +F +: +T +Á +" +Q +) +W +J +B +H +( +ö +% +Ö +« +w +K +y +! +k +] +' +Z ++ +Ç +Õ +Y +À +X +µ +» +ª +Í +ü +ä +´ +è +ñ +ß +ï +Ú +ë +Ô +Ï +Ó +[ +Ì +< + +ò +§ +³ +ø +å +# +$ +& +@ diff --git a/batch_running_task/pytorchocr/utils/dict/pu_dict.txt b/batch_running_task/pytorchocr/utils/dict/pu_dict.txt new file mode 100644 index 0000000..9500fae --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/pu_dict.txt @@ -0,0 +1,130 @@ +p +u +_ +i +m +g +/ +8 +I +L +S +V +R +C +2 +0 +1 +v +a +l +6 +7 +4 +5 +. +j + +q +e +s +t +ã +o +x +9 +c +n +r +z +ç +õ +3 +A +U +d +º +ô +­ +, +E +; +ó +á +b +D +? 
+ú +ê +- +h +P +f +à +N +í +O +M +G +É +é +â +F +: +T +Á +" +Q +) +W +J +B +H +( +ö +% +Ö +« +w +K +y +! +k +] +' +Z ++ +Ç +Õ +Y +À +X +µ +» +ª +Í +ü +ä +´ +è +ñ +ß +ï +Ú +ë +Ô +Ï +Ó +[ +Ì +< + +ò +§ +³ +ø +å +# +$ +& +@ diff --git a/batch_running_task/pytorchocr/utils/dict/rs_cyrillic_dict.txt b/batch_running_task/pytorchocr/utils/dict/rs_cyrillic_dict.txt new file mode 100644 index 0000000..95dd463 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/rs_cyrillic_dict.txt @@ -0,0 +1,134 @@ +r +s +c +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p +м +а +с +и +р +ћ +е +ш +3 +4 +о +г +н +з +в +л +6 +т +ж +у +к +п +њ +д +ч +С +ј +ф +ц +љ +х +О +И +А +б +Ш +К +ђ +џ +М +В +З +Д +Р +У +Н +Т +Б +? +П +Х +Ј +Ц +Г +Љ +Л +Ф +e +n +w +E +F +A +N +f +o +b +M +G +t +y +W +k +P +u +H +B +T +z +h +O +Y +d +U +K +D +x +X +J +Z +Q +q +' +- +@ +é +# +! +, +% +$ +: +& ++ +( +É + diff --git a/batch_running_task/pytorchocr/utils/dict/rs_dict.txt b/batch_running_task/pytorchocr/utils/dict/rs_dict.txt new file mode 100644 index 0000000..d1ce46d --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/rs_dict.txt @@ -0,0 +1,91 @@ +r +s +_ +i +m +g +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +7 +5 +8 +6 +. +j +p + +t +d +9 +3 +e +š +4 +k +u +ć +c +n +đ +o +z +č +b +ž +f +Z +T +h +M +F +O +Š +B +H +A +E +Đ +Ž +D +P +G +Č +K +U +N +J +Ć +w +y +W +x +Y +X +q +Q +# +& +$ +, +- +% +' +@ +! +: +? +( +É +é ++ diff --git a/batch_running_task/pytorchocr/utils/dict/rs_latin_dict.txt b/batch_running_task/pytorchocr/utils/dict/rs_latin_dict.txt new file mode 100644 index 0000000..d1ce46d --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/rs_latin_dict.txt @@ -0,0 +1,91 @@ +r +s +_ +i +m +g +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +7 +5 +8 +6 +. +j +p + +t +d +9 +3 +e +š +4 +k +u +ć +c +n +đ +o +z +č +b +ž +f +Z +T +h +M +F +O +Š +B +H +A +E +Đ +Ž +D +P +G +Č +K +U +N +J +Ć +w +y +W +x +Y +X +q +Q +# +& +$ +, +- +% +' +@ +! +: +? +( +É +é ++ diff --git a/batch_running_task/pytorchocr/utils/dict/rsc_dict.txt b/batch_running_task/pytorchocr/utils/dict/rsc_dict.txt new file mode 100644 index 0000000..95dd463 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/rsc_dict.txt @@ -0,0 +1,134 @@ +r +s +c +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p +м +а +с +и +р +ћ +е +ш +3 +4 +о +г +н +з +в +л +6 +т +ж +у +к +п +њ +д +ч +С +ј +ф +ц +љ +х +О +И +А +б +Ш +К +ђ +џ +М +В +З +Д +Р +У +Н +Т +Б +? +П +Х +Ј +Ц +Г +Љ +Л +Ф +e +n +w +E +F +A +N +f +o +b +M +G +t +y +W +k +P +u +H +B +T +z +h +O +Y +d +U +K +D +x +X +J +Z +Q +q +' +- +@ +é +# +! 
+, +% +$ +: +& ++ +( +É + diff --git a/batch_running_task/pytorchocr/utils/dict/ru_dict.txt b/batch_running_task/pytorchocr/utils/dict/ru_dict.txt new file mode 100644 index 0000000..3b0cf3a --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ru_dict.txt @@ -0,0 +1,125 @@ +к +в +а +з +и +у +р +о +н +я +х +п +л +ы +г +е +т +м +д +ж +ш +ь +с +ё +б +й +ч +ю +ц +щ +М +э +ф +А +ъ +С +Ф +Ю +В +К +Т +Н +О +Э +У +И +Г +Л +Р +Д +Б +Ш +П +З +Х +Е +Ж +Я +Ц +Ч +Й +Щ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + diff --git a/batch_running_task/pytorchocr/utils/dict/ta_dict.txt b/batch_running_task/pytorchocr/utils/dict/ta_dict.txt new file mode 100644 index 0000000..d1bae50 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ta_dict.txt @@ -0,0 +1,128 @@ +t +a +_ +i +m +g +/ +3 +I +L +S +V +R +C +2 +0 +1 +v +l +9 +7 +8 +. +j +p +ப +ூ +த +ம +ி +வ +ர +் +ந +ோ +ன +6 +ஆ +ற +ல +5 +ள +ா +ொ +ழ +ு +4 +ெ +ண +க +ட +ை +ே +ச +ய +ஒ +இ +அ +ங +உ +ீ +ஞ +எ +ஓ +ஃ +ஜ +ஷ +ஸ +ஏ +ஊ +ஹ +ஈ +ஐ +ௌ +ஔ +s +c +e +n +w +F +T +O +P +K +A +N +G +Y +E +M +H +U +B +o +b +D +d +r +W +u +y +f +X +k +q +h +J +z +Z +Q +x +- +' +$ +, +% +@ +é +! +# ++ +É +& +: +( +? + diff --git a/batch_running_task/pytorchocr/utils/dict/table_dict.txt b/batch_running_task/pytorchocr/utils/dict/table_dict.txt new file mode 100644 index 0000000..2ef028c --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/table_dict.txt @@ -0,0 +1,277 @@ +← + +☆ +─ +α + + +⋅ +$ +ω +ψ +χ +( +υ +≥ +σ +, +ρ +ε +0 +■ +4 +8 +✗ +b +< +✓ +Ψ +Ω +€ +D +3 +Π +H +║ + +L +Φ +Χ +θ +P +κ +λ +μ +T +ξ +X +β +γ +δ +\ +ζ +η +` +d + +h +f +l +Θ +p +√ +t + +x +Β +Γ +Δ +| +ǂ +ɛ +j +̧ +➢ +⁡ +̌ +′ +« +△ +▲ +# + +' +Ι ++ +¶ +/ +▼ +⇑ +□ +· +7 +▪ +; +? +➔ +∩ +C +÷ +G +⇒ +K + +O +S +С +W +Α +[ +○ +_ +● +‡ +c +z +g + +o + +〈 +〉 +s +⩽ +w +φ +ʹ +{ +» +∣ +̆ +e +ˆ +∈ +τ +◆ +ι +∅ +∆ +∙ +∘ +Ø +ß +✔ +∞ +∑ +− +× +◊ +∗ +∖ +˃ +˂ +∫ +" +i +& +π +↔ +* +∥ +æ +∧ +. +⁄ +ø +Q +∼ +6 +⁎ +: +★ +> +a +B +≈ +F +J +̄ +N +♯ +R +V + +― +Z +♣ +^ +¤ +¥ +§ + +¢ +£ +≦ +­ +≤ +‖ +Λ +© +n +↓ +→ +↑ +r +° +± +v + +♂ +k +♀ +~ +ᅟ +̇ +@ +” +♦ +ł +® +⊕ +„ +! + +% +⇓ +) +- +1 +5 +9 += +А +A +‰ +⋆ +Σ +E +◦ +I +※ +M +m +̨ +⩾ +† + +• +U +Y +
 +] +̸ +2 +‐ +– +‒ +̂ +— +̀ +́ +’ +‘ +⋮ +⋯ +̊ +“ +̈ +≧ +q +u +ı +y + +​ +̃ +} +ν diff --git a/batch_running_task/pytorchocr/utils/dict/table_structure_dict.txt b/batch_running_task/pytorchocr/utils/dict/table_structure_dict.txt new file mode 100644 index 0000000..9c4531e --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/table_structure_dict.txt @@ -0,0 +1,2759 @@ +277 28 1267 1186 + +V +a +r +i +b +l +e + +H +z +d + +t +o +9 +5 +% +C +I + +p + +v +u +* +A +g +( +m +n +) +0 +. +7 +1 +6 +≤ +> +8 +3 +– +2 +G +4 +M +F +T +y +f +s +L +w +c +U +h +D +S +Q +R +x +P +- +E +O +/ +k +, ++ +N +K +q +′ +[ +] +< +≥ + +− + +μ +± +J +j +W +_ +Δ +B +“ +: +Y +α +λ +; + + +? +∼ += +° +# +̊ +̈ +̂ +’ +Z +X +∗ +— +β +' +† +~ +@ +" +γ +↓ +↑ +& +‡ +χ +” +σ +§ +| +¶ +‐ +× +$ +→ +√ +✓ +‘ +\ +∞ +π +• +® +^ +∆ +≧ + + +́ +♀ +♂ +‒ +⁎ +▲ +· +£ +φ +Ψ +ß +△ +☆ +▪ +η +€ +∧ +̃ +Φ +ρ +̄ +δ +‰ +̧ +Ω +♦ +{ +} +̀ +∑ +∫ +ø +κ +ε +¥ +※ +` +ω +Σ +➔ +‖ +Β +̸ +
 +─ +● +⩾ +Χ +Α +⋅ +◆ +★ +■ +ψ +ǂ +□ +ζ +! +Γ +↔ +θ +⁄ +〈 +〉 +― +υ +τ +⋆ +Ø +© +∥ +С +˂ +➢ +ɛ +⁡ +✗ +← +○ +¢ +⩽ +∖ +˃ +­ +≈ +Π +̌ +≦ +∅ +ᅟ + + +∣ +¤ +♯ +̆ +ξ +÷ +▼ + +ι +ν +║ + + +◦ +​ +◊ +∙ +« +» +ł +ı +Θ +∈ +„ +∘ +✔ +̇ +æ +ʹ +ˆ +♣ +⇓ +∩ +⊕ +⇒ +⇑ +̨ +Ι +Λ +⋯ +А +⋮ + + + + + + + + + + colspan="2" + colspan="3" + rowspan="2" + colspan="4" + colspan="6" + rowspan="3" + colspan="9" + colspan="10" + colspan="7" + rowspan="4" + rowspan="5" + rowspan="9" + colspan="8" + rowspan="8" + rowspan="6" + rowspan="7" + rowspan="10" +0 2924682 +1 3405345 +2 2363468 +3 2709165 +4 4078680 +5 3250792 +6 1923159 +7 1617890 +8 1450532 +9 1717624 +10 1477550 +11 1489223 +12 915528 +13 819193 +14 593660 +15 518924 +16 682065 +17 494584 +18 400591 +19 396421 +20 340994 +21 280688 +22 250328 +23 226786 +24 199927 +25 182707 +26 164629 +27 141613 +28 127554 +29 116286 +30 107682 +31 96367 +32 88002 +33 79234 +34 72186 +35 65921 +36 60374 +37 55976 +38 52166 +39 47414 +40 44932 +41 41279 +42 38232 +43 35463 +44 33703 +45 30557 +46 29639 +47 27000 +48 25447 +49 23186 +50 22093 +51 20412 +52 19844 +53 18261 +54 17561 +55 16499 +56 15597 +57 14558 +58 14372 +59 13445 +60 13514 +61 12058 +62 11145 +63 10767 +64 10370 +65 9630 +66 9337 +67 8881 +68 8727 +69 8060 +70 7994 +71 7740 +72 7189 +73 6729 +74 6749 +75 6548 +76 6321 +77 5957 +78 5740 +79 5407 +80 5370 +81 5035 +82 4921 +83 4656 +84 4600 +85 4519 +86 4277 +87 4023 +88 3939 +89 3910 +90 3861 +91 3560 +92 3483 +93 3406 +94 3346 +95 3229 +96 3122 +97 3086 +98 3001 +99 2884 +100 2822 +101 2677 +102 2670 +103 2610 +104 2452 +105 2446 +106 2400 +107 2300 +108 2316 +109 2196 +110 2089 +111 2083 +112 2041 +113 1881 +114 1838 +115 1896 +116 1795 +117 1786 +118 1743 +119 1765 +120 1750 +121 1683 +122 1563 +123 1499 +124 1513 +125 1462 +126 1388 +127 1441 +128 1417 +129 1392 +130 1306 +131 1321 +132 1274 +133 1294 +134 1240 +135 1126 +136 1157 +137 1130 +138 1084 +139 1130 +140 1083 +141 1040 +142 980 +143 1031 +144 974 +145 980 +146 932 +147 898 +148 960 +149 907 +150 852 +151 912 +152 859 +153 847 +154 876 +155 792 +156 791 +157 765 +158 788 +159 787 +160 744 +161 673 +162 683 +163 697 +164 666 +165 680 +166 632 +167 677 +168 657 +169 618 +170 587 +171 585 +172 567 +173 549 +174 562 +175 548 +176 542 +177 539 +178 542 +179 549 +180 547 +181 526 +182 525 +183 514 +184 512 +185 505 +186 515 +187 467 +188 475 +189 458 +190 435 +191 443 +192 427 +193 424 +194 404 +195 389 +196 429 +197 404 +198 386 +199 351 +200 388 +201 408 +202 361 +203 346 +204 324 +205 361 +206 363 +207 364 +208 323 +209 336 +210 342 +211 315 +212 325 +213 328 +214 314 +215 327 +216 320 +217 300 +218 295 +219 315 +220 310 +221 295 +222 275 +223 248 +224 274 +225 232 +226 293 +227 259 +228 286 +229 263 +230 242 +231 214 +232 261 +233 231 +234 211 +235 250 +236 233 +237 206 +238 224 +239 210 +240 233 +241 223 +242 216 +243 222 +244 207 +245 212 +246 196 +247 205 +248 201 +249 202 +250 211 +251 201 +252 215 +253 179 +254 163 +255 179 +256 191 +257 188 +258 196 +259 150 +260 154 +261 176 +262 211 +263 166 +264 171 +265 165 +266 149 +267 182 +268 159 +269 161 +270 164 +271 161 +272 141 +273 151 +274 127 +275 129 +276 142 +277 158 +278 148 +279 135 +280 127 +281 134 +282 138 +283 131 +284 126 +285 125 +286 130 +287 126 +288 135 +289 125 +290 135 +291 131 +292 95 +293 135 +294 106 +295 117 +296 136 +297 128 +298 128 +299 118 +300 109 +301 112 +302 117 +303 108 +304 120 +305 100 +306 95 +307 108 +308 112 +309 77 +310 120 +311 104 +312 109 +313 89 +314 98 +315 82 +316 98 +317 93 +318 77 +319 93 +320 77 +321 98 
+322 93 +323 86 +324 89 +325 73 +326 70 +327 71 +328 77 +329 87 +330 77 +331 93 +332 100 +333 83 +334 72 +335 74 +336 69 +337 77 +338 68 +339 78 +340 90 +341 98 +342 75 +343 80 +344 63 +345 71 +346 83 +347 66 +348 71 +349 70 +350 62 +351 62 +352 59 +353 63 +354 62 +355 52 +356 64 +357 64 +358 56 +359 49 +360 57 +361 63 +362 60 +363 68 +364 62 +365 55 +366 54 +367 40 +368 75 +369 70 +370 53 +371 58 +372 57 +373 55 +374 69 +375 57 +376 53 +377 43 +378 45 +379 47 +380 56 +381 51 +382 59 +383 51 +384 43 +385 34 +386 57 +387 49 +388 39 +389 46 +390 48 +391 43 +392 40 +393 54 +394 50 +395 41 +396 43 +397 33 +398 27 +399 49 +400 44 +401 44 +402 38 +403 30 +404 32 +405 37 +406 39 +407 42 +408 53 +409 39 +410 34 +411 31 +412 32 +413 52 +414 27 +415 41 +416 34 +417 36 +418 50 +419 35 +420 32 +421 33 +422 45 +423 35 +424 40 +425 29 +426 41 +427 40 +428 39 +429 32 +430 31 +431 34 +432 29 +433 27 +434 26 +435 22 +436 34 +437 28 +438 30 +439 38 +440 35 +441 36 +442 36 +443 27 +444 24 +445 33 +446 31 +447 25 +448 33 +449 27 +450 32 +451 46 +452 31 +453 35 +454 35 +455 34 +456 26 +457 21 +458 25 +459 26 +460 24 +461 27 +462 33 +463 30 +464 35 +465 21 +466 32 +467 19 +468 27 +469 16 +470 28 +471 26 +472 27 +473 26 +474 25 +475 25 +476 27 +477 20 +478 28 +479 22 +480 23 +481 16 +482 25 +483 27 +484 19 +485 23 +486 19 +487 15 +488 15 +489 23 +490 24 +491 19 +492 20 +493 18 +494 17 +495 30 +496 28 +497 20 +498 29 +499 17 +500 19 +501 21 +502 15 +503 24 +504 15 +505 19 +506 25 +507 16 +508 23 +509 26 +510 21 +511 15 +512 12 +513 16 +514 18 +515 24 +516 26 +517 18 +518 8 +519 25 +520 14 +521 8 +522 24 +523 20 +524 18 +525 15 +526 13 +527 17 +528 18 +529 22 +530 21 +531 9 +532 16 +533 17 +534 13 +535 17 +536 15 +537 13 +538 20 +539 13 +540 19 +541 29 +542 10 +543 8 +544 18 +545 13 +546 9 +547 18 +548 10 +549 18 +550 18 +551 9 +552 9 +553 15 +554 13 +555 15 +556 14 +557 14 +558 18 +559 8 +560 13 +561 9 +562 7 +563 12 +564 6 +565 9 +566 9 +567 18 +568 9 +569 10 +570 13 +571 14 +572 13 +573 21 +574 8 +575 16 +576 12 +577 9 +578 16 +579 17 +580 22 +581 6 +582 14 +583 13 +584 15 +585 11 +586 13 +587 5 +588 12 +589 13 +590 15 +591 13 +592 15 +593 12 +594 7 +595 18 +596 12 +597 13 +598 13 +599 13 +600 12 +601 12 +602 10 +603 11 +604 6 +605 6 +606 2 +607 9 +608 8 +609 12 +610 9 +611 12 +612 13 +613 12 +614 14 +615 9 +616 8 +617 9 +618 14 +619 13 +620 12 +621 6 +622 8 +623 8 +624 8 +625 12 +626 8 +627 7 +628 5 +629 8 +630 12 +631 6 +632 10 +633 10 +634 7 +635 8 +636 9 +637 6 +638 9 +639 4 +640 12 +641 4 +642 3 +643 11 +644 10 +645 6 +646 12 +647 12 +648 4 +649 4 +650 9 +651 8 +652 6 +653 5 +654 14 +655 10 +656 11 +657 8 +658 5 +659 5 +660 9 +661 13 +662 4 +663 5 +664 9 +665 11 +666 12 +667 7 +668 13 +669 2 +670 1 +671 7 +672 7 +673 7 +674 10 +675 9 +676 6 +677 5 +678 7 +679 6 +680 3 +681 3 +682 4 +683 9 +684 8 +685 5 +686 3 +687 11 +688 9 +689 2 +690 6 +691 5 +692 9 +693 5 +694 6 +695 5 +696 9 +697 8 +698 3 +699 7 +700 5 +701 9 +702 8 +703 7 +704 2 +705 3 +706 7 +707 6 +708 6 +709 10 +710 2 +711 10 +712 6 +713 7 +714 5 +715 6 +716 4 +717 6 +718 8 +719 4 +720 6 +721 7 +722 5 +723 7 +724 3 +725 10 +726 10 +727 3 +728 7 +729 7 +730 5 +731 2 +732 1 +733 5 +734 1 +735 5 +736 6 +737 2 +738 2 +739 3 +740 7 +741 2 +742 7 +743 4 +744 5 +745 4 +746 5 +747 3 +748 1 +749 4 +750 4 +751 2 +752 4 +753 6 +754 6 +755 6 +756 3 +757 2 +758 5 +759 5 +760 3 +761 4 +762 2 +763 1 +764 8 +765 3 +766 4 +767 3 +768 1 +769 5 +770 3 +771 3 +772 4 +773 4 +774 1 +775 3 +776 2 +777 2 +778 3 +779 3 +780 1 +781 4 +782 3 +783 4 +784 6 +785 3 +786 5 +787 
4 +788 2 +789 4 +790 5 +791 4 +792 6 +794 4 +795 1 +796 1 +797 4 +798 2 +799 3 +800 3 +801 1 +802 5 +803 5 +804 3 +805 3 +806 3 +807 4 +808 4 +809 2 +811 5 +812 4 +813 6 +814 3 +815 2 +816 2 +817 3 +818 5 +819 3 +820 1 +821 1 +822 4 +823 3 +824 4 +825 8 +826 3 +827 5 +828 5 +829 3 +830 6 +831 3 +832 4 +833 8 +834 5 +835 3 +836 3 +837 2 +838 4 +839 2 +840 1 +841 3 +842 2 +843 1 +844 3 +846 4 +847 4 +848 3 +849 3 +850 2 +851 3 +853 1 +854 4 +855 4 +856 2 +857 4 +858 1 +859 2 +860 5 +861 1 +862 1 +863 4 +864 2 +865 2 +867 5 +868 1 +869 4 +870 1 +871 1 +872 1 +873 2 +875 5 +876 3 +877 1 +878 3 +879 3 +880 3 +881 2 +882 1 +883 6 +884 2 +885 2 +886 1 +887 1 +888 3 +889 2 +890 2 +891 3 +892 1 +893 3 +894 1 +895 5 +896 1 +897 3 +899 2 +900 2 +902 1 +903 2 +904 4 +905 4 +906 3 +907 1 +908 1 +909 2 +910 5 +911 2 +912 3 +914 1 +915 1 +916 2 +918 2 +919 2 +920 4 +921 4 +922 1 +923 1 +924 4 +925 5 +926 1 +928 2 +929 1 +930 1 +931 1 +932 1 +933 1 +934 2 +935 1 +936 1 +937 1 +938 2 +939 1 +941 1 +942 4 +944 2 +945 2 +946 2 +947 1 +948 1 +950 1 +951 2 +953 1 +954 2 +955 1 +956 1 +957 2 +958 1 +960 3 +962 4 +963 1 +964 1 +965 3 +966 2 +967 2 +968 1 +969 3 +970 3 +972 1 +974 4 +975 3 +976 3 +977 2 +979 2 +980 1 +981 1 +983 5 +984 1 +985 3 +986 1 +987 2 +988 4 +989 2 +991 2 +992 2 +993 1 +994 1 +996 2 +997 2 +998 1 +999 3 +1000 2 +1001 1 +1002 3 +1003 3 +1004 2 +1005 3 +1006 1 +1007 2 +1009 1 +1011 1 +1013 3 +1014 1 +1016 2 +1017 1 +1018 1 +1019 1 +1020 4 +1021 1 +1022 2 +1025 1 +1026 1 +1027 2 +1028 1 +1030 1 +1031 2 +1032 4 +1034 3 +1035 2 +1036 1 +1038 1 +1039 1 +1040 1 +1041 1 +1042 2 +1043 1 +1044 2 +1045 4 +1048 1 +1050 1 +1051 1 +1052 2 +1054 1 +1055 3 +1056 2 +1057 1 +1059 1 +1061 2 +1063 1 +1064 1 +1065 1 +1066 1 +1067 1 +1068 1 +1069 2 +1074 1 +1075 1 +1077 1 +1078 1 +1079 1 +1082 1 +1085 1 +1088 1 +1090 1 +1091 1 +1092 2 +1094 2 +1097 2 +1098 1 +1099 2 +1101 2 +1102 1 +1104 1 +1105 1 +1107 1 +1109 1 +1111 2 +1112 1 +1114 2 +1115 2 +1116 2 +1117 1 +1118 1 +1119 1 +1120 1 +1122 1 +1123 1 +1127 1 +1128 3 +1132 2 +1138 3 +1142 1 +1145 4 +1150 1 +1153 2 +1154 1 +1158 1 +1159 1 +1163 1 +1165 1 +1169 2 +1174 1 +1176 1 +1177 1 +1178 2 +1179 1 +1180 2 +1181 1 +1182 1 +1183 2 +1185 1 +1187 1 +1191 2 +1193 1 +1195 3 +1196 1 +1201 3 +1203 1 +1206 1 +1210 1 +1213 1 +1214 1 +1215 2 +1218 1 +1220 1 +1221 1 +1225 1 +1226 1 +1233 2 +1241 1 +1243 1 +1249 1 +1250 2 +1251 1 +1254 1 +1255 2 +1260 1 +1268 1 +1270 1 +1273 1 +1274 1 +1277 1 +1284 1 +1287 1 +1291 1 +1292 2 +1294 1 +1295 2 +1297 1 +1298 1 +1301 1 +1307 1 +1308 3 +1311 2 +1313 1 +1316 1 +1321 1 +1324 1 +1325 1 +1330 1 +1333 1 +1334 1 +1338 2 +1340 1 +1341 1 +1342 1 +1343 1 +1345 1 +1355 1 +1357 1 +1360 2 +1375 1 +1376 1 +1380 1 +1383 1 +1387 1 +1389 1 +1393 1 +1394 1 +1396 1 +1398 1 +1410 1 +1414 1 +1419 1 +1425 1 +1434 1 +1435 1 +1438 1 +1439 1 +1447 1 +1455 2 +1460 1 +1461 1 +1463 1 +1466 1 +1470 1 +1473 1 +1478 1 +1480 1 +1483 1 +1484 1 +1485 2 +1492 2 +1499 1 +1509 1 +1512 1 +1513 1 +1523 1 +1524 1 +1525 2 +1529 1 +1539 1 +1544 1 +1568 1 +1584 1 +1591 1 +1598 1 +1600 1 +1604 1 +1614 1 +1617 1 +1621 1 +1622 1 +1626 1 +1638 1 +1648 1 +1658 1 +1661 1 +1679 1 +1682 1 +1693 1 +1700 1 +1705 1 +1707 1 +1722 1 +1728 1 +1758 1 +1762 1 +1763 1 +1775 1 +1776 1 +1801 1 +1810 1 +1812 1 +1827 1 +1834 1 +1846 1 +1847 1 +1848 1 +1851 1 +1862 1 +1866 1 +1877 2 +1884 1 +1888 1 +1903 1 +1912 1 +1925 1 +1938 1 +1955 1 +1998 1 +2054 1 +2058 1 +2065 1 +2069 1 +2076 1 +2089 1 +2104 1 +2111 1 +2133 1 +2138 1 +2156 1 +2204 1 +2212 1 +2237 1 +2246 2 +2298 1 +2304 1 +2360 1 +2400 
1 +2481 1 +2544 1 +2586 1 +2622 1 +2666 1 +2682 1 +2725 1 +2920 1 +3997 1 +4019 1 +5211 1 +12 19 +14 1 +16 401 +18 2 +20 421 +22 557 +24 625 +26 50 +28 4481 +30 52 +32 550 +34 5840 +36 4644 +38 87 +40 5794 +41 33 +42 571 +44 11805 +46 4711 +47 7 +48 597 +49 12 +50 678 +51 2 +52 14715 +53 3 +54 7322 +55 3 +56 508 +57 39 +58 3486 +59 11 +60 8974 +61 45 +62 1276 +63 4 +64 15693 +65 15 +66 657 +67 13 +68 6409 +69 10 +70 3188 +71 25 +72 1889 +73 27 +74 10370 +75 9 +76 12432 +77 23 +78 520 +79 15 +80 1534 +81 29 +82 2944 +83 23 +84 12071 +85 36 +86 1502 +87 10 +88 10978 +89 11 +90 889 +91 16 +92 4571 +93 17 +94 7855 +95 21 +96 2271 +97 33 +98 1423 +99 15 +100 11096 +101 21 +102 4082 +103 13 +104 5442 +105 25 +106 2113 +107 26 +108 3779 +109 43 +110 1294 +111 29 +112 7860 +113 29 +114 4965 +115 22 +116 7898 +117 25 +118 1772 +119 28 +120 1149 +121 38 +122 1483 +123 32 +124 10572 +125 25 +126 1147 +127 31 +128 1699 +129 22 +130 5533 +131 22 +132 4669 +133 34 +134 3777 +135 10 +136 5412 +137 21 +138 855 +139 26 +140 2485 +141 46 +142 1970 +143 27 +144 6565 +145 40 +146 933 +147 15 +148 7923 +149 16 +150 735 +151 23 +152 1111 +153 33 +154 3714 +155 27 +156 2445 +157 30 +158 3367 +159 10 +160 4646 +161 27 +162 990 +163 23 +164 5679 +165 25 +166 2186 +167 17 +168 899 +169 32 +170 1034 +171 22 +172 6185 +173 32 +174 2685 +175 17 +176 1354 +177 38 +178 1460 +179 15 +180 3478 +181 20 +182 958 +183 20 +184 6055 +185 23 +186 2180 +187 15 +188 1416 +189 30 +190 1284 +191 22 +192 1341 +193 21 +194 2413 +195 18 +196 4984 +197 13 +198 830 +199 22 +200 1834 +201 19 +202 2238 +203 9 +204 3050 +205 22 +206 616 +207 17 +208 2892 +209 22 +210 711 +211 30 +212 2631 +213 19 +214 3341 +215 21 +216 987 +217 26 +218 823 +219 9 +220 3588 +221 20 +222 692 +223 7 +224 2925 +225 31 +226 1075 +227 16 +228 2909 +229 18 +230 673 +231 20 +232 2215 +233 14 +234 1584 +235 21 +236 1292 +237 29 +238 1647 +239 25 +240 1014 +241 30 +242 1648 +243 19 +244 4465 +245 10 +246 787 +247 11 +248 480 +249 25 +250 842 +251 15 +252 1219 +253 23 +254 1508 +255 8 +256 3525 +257 16 +258 490 +259 12 +260 1678 +261 14 +262 822 +263 16 +264 1729 +265 28 +266 604 +267 11 +268 2572 +269 7 +270 1242 +271 15 +272 725 +273 18 +274 1983 +275 13 +276 1662 +277 19 +278 491 +279 12 +280 1586 +281 14 +282 563 +283 10 +284 2363 +285 10 +286 656 +287 14 +288 725 +289 28 +290 871 +291 9 +292 2606 +293 12 +294 961 +295 9 +296 478 +297 13 +298 1252 +299 10 +300 736 +301 19 +302 466 +303 13 +304 2254 +305 12 +306 486 +307 14 +308 1145 +309 13 +310 955 +311 13 +312 1235 +313 13 +314 931 +315 14 +316 1768 +317 11 +318 330 +319 10 +320 539 +321 23 +322 570 +323 12 +324 1789 +325 13 +326 884 +327 5 +328 1422 +329 14 +330 317 +331 11 +332 509 +333 13 +334 1062 +335 12 +336 577 +337 27 +338 378 +339 10 +340 2313 +341 9 +342 391 +343 13 +344 894 +345 17 +346 664 +347 9 +348 453 +349 6 +350 363 +351 15 +352 1115 +353 13 +354 1054 +355 8 +356 1108 +357 12 +358 354 +359 7 +360 363 +361 16 +362 344 +363 11 +364 1734 +365 12 +366 265 +367 10 +368 969 +369 16 +370 316 +371 12 +372 757 +373 7 +374 563 +375 15 +376 857 +377 9 +378 469 +379 9 +380 385 +381 12 +382 921 +383 15 +384 764 +385 14 +386 246 +387 6 +388 1108 +389 14 +390 230 +391 8 +392 266 +393 11 +394 641 +395 8 +396 719 +397 9 +398 243 +399 4 +400 1108 +401 7 +402 229 +403 7 +404 903 +405 7 +406 257 +407 12 +408 244 +409 3 +410 541 +411 6 +412 744 +413 8 +414 419 +415 8 +416 388 +417 19 +418 470 +419 14 +420 612 +421 6 +422 342 +423 3 +424 1179 +425 3 +426 116 +427 14 +428 207 +429 6 +430 255 +431 4 +432 288 +433 12 
+434 343 +435 6 +436 1015 +437 3 +438 538 +439 10 +440 194 +441 6 +442 188 +443 15 +444 524 +445 7 +446 214 +447 7 +448 574 +449 6 +450 214 +451 5 +452 635 +453 9 +454 464 +455 5 +456 205 +457 9 +458 163 +459 2 +460 558 +461 4 +462 171 +463 14 +464 444 +465 11 +466 543 +467 5 +468 388 +469 6 +470 141 +471 4 +472 647 +473 3 +474 210 +475 4 +476 193 +477 7 +478 195 +479 7 +480 443 +481 10 +482 198 +483 3 +484 816 +485 6 +486 128 +487 9 +488 215 +489 9 +490 328 +491 7 +492 158 +493 11 +494 335 +495 8 +496 435 +497 6 +498 174 +499 1 +500 373 +501 5 +502 140 +503 7 +504 330 +505 9 +506 149 +507 5 +508 642 +509 3 +510 179 +511 3 +512 159 +513 8 +514 204 +515 7 +516 306 +517 4 +518 110 +519 5 +520 326 +521 6 +522 305 +523 6 +524 294 +525 7 +526 268 +527 5 +528 149 +529 4 +530 133 +531 2 +532 513 +533 10 +534 116 +535 5 +536 258 +537 4 +538 113 +539 4 +540 138 +541 6 +542 116 +544 485 +545 4 +546 93 +547 9 +548 299 +549 3 +550 256 +551 6 +552 92 +553 3 +554 175 +555 6 +556 253 +557 7 +558 95 +559 2 +560 128 +561 4 +562 206 +563 2 +564 465 +565 3 +566 69 +567 3 +568 157 +569 7 +570 97 +571 8 +572 118 +573 5 +574 130 +575 4 +576 301 +577 6 +578 177 +579 2 +580 397 +581 3 +582 80 +583 1 +584 128 +585 5 +586 52 +587 2 +588 72 +589 1 +590 84 +591 6 +592 323 +593 11 +594 77 +595 5 +596 205 +597 1 +598 244 +599 4 +600 69 +601 3 +602 89 +603 5 +604 254 +605 6 +606 147 +607 3 +608 83 +609 3 +610 77 +611 3 +612 194 +613 1 +614 98 +615 3 +616 243 +617 3 +618 50 +619 8 +620 188 +621 4 +622 67 +623 4 +624 123 +625 2 +626 50 +627 1 +628 239 +629 2 +630 51 +631 4 +632 65 +633 5 +634 188 +636 81 +637 3 +638 46 +639 3 +640 103 +641 1 +642 136 +643 3 +644 188 +645 3 +646 58 +648 122 +649 4 +650 47 +651 2 +652 155 +653 4 +654 71 +655 1 +656 71 +657 3 +658 50 +659 2 +660 177 +661 5 +662 66 +663 2 +664 183 +665 3 +666 50 +667 2 +668 53 +669 2 +670 115 +672 66 +673 2 +674 47 +675 1 +676 197 +677 2 +678 46 +679 3 +680 95 +681 3 +682 46 +683 3 +684 107 +685 1 +686 86 +687 2 +688 158 +689 4 +690 51 +691 1 +692 80 +694 56 +695 4 +696 40 +698 43 +699 3 +700 95 +701 2 +702 51 +703 2 +704 133 +705 1 +706 100 +707 2 +708 121 +709 2 +710 15 +711 3 +712 35 +713 2 +714 20 +715 3 +716 37 +717 2 +718 78 +720 55 +721 1 +722 42 +723 2 +724 218 +725 3 +726 23 +727 2 +728 26 +729 1 +730 64 +731 2 +732 65 +734 24 +735 2 +736 53 +737 1 +738 32 +739 1 +740 60 +742 81 +743 1 +744 77 +745 1 +746 47 +747 1 +748 62 +749 1 +750 19 +751 1 +752 86 +753 3 +754 40 +756 55 +757 2 +758 38 +759 1 +760 101 +761 1 +762 22 +764 67 +765 2 +766 35 +767 1 +768 38 +769 1 +770 22 +771 1 +772 82 +773 1 +774 73 +776 29 +777 1 +778 55 +780 23 +781 1 +782 16 +784 84 +785 3 +786 28 +788 59 +789 1 +790 33 +791 3 +792 24 +794 13 +795 1 +796 110 +797 2 +798 15 +800 22 +801 3 +802 29 +803 1 +804 87 +806 21 +808 29 +810 48 +812 28 +813 1 +814 58 +815 1 +816 48 +817 1 +818 31 +819 1 +820 66 +822 17 +823 2 +824 58 +826 10 +827 2 +828 25 +829 1 +830 29 +831 1 +832 63 +833 1 +834 26 +835 3 +836 52 +837 1 +838 18 +840 27 +841 2 +842 12 +843 1 +844 83 +845 1 +846 7 +847 1 +848 10 +850 26 +852 25 +853 1 +854 15 +856 27 +858 32 +859 1 +860 15 +862 43 +864 32 +865 1 +866 6 +868 39 +870 11 +872 25 +873 1 +874 10 +875 1 +876 20 +877 2 +878 19 +879 1 +880 30 +882 11 +884 53 +886 25 +887 1 +888 28 +890 6 +892 36 +894 10 +896 13 +898 14 +900 31 +902 14 +903 2 +904 43 +906 25 +908 9 +910 11 +911 1 +912 16 +913 1 +914 24 +916 27 +918 6 +920 15 +922 27 +923 1 +924 23 +926 13 +928 42 +929 1 +930 3 +932 27 +934 17 +936 8 +937 1 +938 11 +940 33 +942 4 +943 1 +944 18 +946 15 +948 13 +950 
18 +952 12 +954 11 +956 21 +958 10 +960 13 +962 5 +964 32 +966 13 +968 8 +970 8 +971 1 +972 23 +973 2 +974 12 +975 1 +976 22 +978 7 +979 1 +980 14 +982 8 +984 22 +985 1 +986 6 +988 17 +989 1 +990 6 +992 13 +994 19 +996 11 +998 4 +1000 9 +1002 2 +1004 14 +1006 5 +1008 3 +1010 9 +1012 29 +1014 6 +1016 22 +1017 1 +1018 8 +1019 1 +1020 7 +1022 6 +1023 1 +1024 10 +1026 2 +1028 8 +1030 11 +1031 2 +1032 8 +1034 9 +1036 13 +1038 12 +1040 12 +1042 3 +1044 12 +1046 3 +1048 11 +1050 2 +1051 1 +1052 2 +1054 11 +1056 6 +1058 8 +1059 1 +1060 23 +1062 6 +1063 1 +1064 8 +1066 3 +1068 6 +1070 8 +1071 1 +1072 5 +1074 3 +1076 5 +1078 3 +1080 11 +1081 1 +1082 7 +1084 18 +1086 4 +1087 1 +1088 3 +1090 3 +1092 7 +1094 3 +1096 12 +1098 6 +1099 1 +1100 2 +1102 6 +1104 14 +1106 3 +1108 6 +1110 5 +1112 2 +1114 8 +1116 3 +1118 3 +1120 7 +1122 10 +1124 6 +1126 8 +1128 1 +1130 4 +1132 3 +1134 2 +1136 5 +1138 5 +1140 8 +1142 3 +1144 7 +1146 3 +1148 11 +1150 1 +1152 5 +1154 1 +1156 5 +1158 1 +1160 5 +1162 3 +1164 6 +1165 1 +1166 1 +1168 4 +1169 1 +1170 3 +1171 1 +1172 2 +1174 5 +1176 3 +1177 1 +1180 8 +1182 2 +1184 4 +1186 2 +1188 3 +1190 2 +1192 5 +1194 6 +1196 1 +1198 2 +1200 2 +1204 10 +1206 2 +1208 9 +1210 1 +1214 6 +1216 3 +1218 4 +1220 9 +1221 2 +1222 1 +1224 5 +1226 4 +1228 8 +1230 1 +1232 1 +1234 3 +1236 5 +1240 3 +1242 1 +1244 3 +1245 1 +1246 4 +1248 6 +1250 2 +1252 7 +1256 3 +1258 2 +1260 2 +1262 3 +1264 4 +1265 1 +1266 1 +1270 1 +1271 1 +1272 2 +1274 3 +1276 3 +1278 1 +1280 3 +1284 1 +1286 1 +1290 1 +1292 3 +1294 1 +1296 7 +1300 2 +1302 4 +1304 3 +1306 2 +1308 2 +1312 1 +1314 1 +1316 3 +1318 2 +1320 1 +1324 8 +1326 1 +1330 1 +1331 1 +1336 2 +1338 1 +1340 3 +1341 1 +1344 1 +1346 2 +1347 1 +1348 3 +1352 1 +1354 2 +1356 1 +1358 1 +1360 3 +1362 1 +1364 4 +1366 1 +1370 1 +1372 3 +1380 2 +1384 2 +1388 2 +1390 2 +1392 2 +1394 1 +1396 1 +1398 1 +1400 2 +1402 1 +1404 1 +1406 1 +1410 1 +1412 5 +1418 1 +1420 1 +1424 1 +1432 2 +1434 2 +1442 3 +1444 5 +1448 1 +1454 1 +1456 1 +1460 3 +1462 4 +1468 1 +1474 1 +1476 1 +1478 2 +1480 1 +1486 2 +1488 1 +1492 1 +1496 1 +1500 3 +1503 1 +1506 1 +1512 2 +1516 1 +1522 1 +1524 2 +1534 4 +1536 1 +1538 1 +1540 2 +1544 2 +1548 1 +1556 1 +1560 1 +1562 1 +1564 2 +1566 1 +1568 1 +1570 1 +1572 1 +1576 1 +1590 1 +1594 1 +1604 1 +1608 1 +1614 1 +1622 1 +1624 2 +1628 1 +1629 1 +1636 1 +1642 1 +1654 2 +1660 1 +1664 1 +1670 1 +1684 4 +1698 1 +1732 3 +1742 1 +1752 1 +1760 1 +1764 1 +1772 2 +1798 1 +1808 1 +1820 1 +1852 1 +1856 1 +1874 1 +1902 1 +1908 1 +1952 1 +2004 1 +2018 1 +2020 1 +2028 1 +2174 1 +2233 1 +2244 1 +2280 1 +2290 1 +2352 1 +2604 1 +4190 1 diff --git a/batch_running_task/pytorchocr/utils/dict/te_dict.txt b/batch_running_task/pytorchocr/utils/dict/te_dict.txt new file mode 100644 index 0000000..83d74cc --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/te_dict.txt @@ -0,0 +1,151 @@ +t +e +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +3 +4 +8 +9 +. +j +p +త +ె +ర +క +్ +ి +ం +చ +ే +ద +ు +7 +6 +ఉ +ా +మ +ట +ో +వ +ప +ల +శ +ఆ +య +ై +భ +' +ీ +గ +ూ +డ +ధ +హ +న +జ +స +[ +‌ +ష +అ +ణ +ఫ +బ +ఎ +; +ళ +థ +ొ +ఠ +ృ +ఒ +ఇ +ః +ఊ +ఖ +- +ఐ +ఘ +ౌ +ఏ +ఈ +ఛ +, +ఓ +ఞ +| +? +: +ఢ +" +( +” +! 
++ +) +* += +& +“ +€ +] +£ +$ +s +c +n +w +k +J +G +u +d +r +E +o +h +y +b +f +B +M +O +T +N +D +P +A +F +x +W +Y +U +H +K +X +z +Z +Q +q +É +% +# +@ +é diff --git a/batch_running_task/pytorchocr/utils/dict/ug_dict.txt b/batch_running_task/pytorchocr/utils/dict/ug_dict.txt new file mode 100644 index 0000000..77602f2 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ug_dict.txt @@ -0,0 +1,114 @@ +u +g +_ +i +m +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +8 +5 +3 +6 +9 +. +j +p + +ق +ا +پ +ل +4 +7 +ئ +ى +ش +ت +ي +ك +د +ف +ر +و +ن +ب +ە +خ +ې +چ +ۇ +ز +س +م +ۋ +گ +ڭ +ۆ +ۈ +ج +غ +ھ +ژ +s +c +e +n +w +P +E +D +U +d +r +b +y +B +o +O +Y +N +T +k +t +h +A +H +F +z +W +K +G +M +f +Z +X +Q +J +x +q +- +! +% +# +? +: +$ +, +& +' +É +@ +é +( ++ diff --git a/batch_running_task/pytorchocr/utils/dict/uk_dict.txt b/batch_running_task/pytorchocr/utils/dict/uk_dict.txt new file mode 100644 index 0000000..c5ffc0a --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/uk_dict.txt @@ -0,0 +1,142 @@ +u +k +_ +i +m +g +/ +1 +6 +I +L +S +V +R +C +2 +0 +v +a +l +7 +9 +. +j +p +в +і +д +п +о +н +с +т +ю +4 +5 +3 +а +и +м +е +р +ч +у +Б +з +л +к +8 +А +В +г +є +б +ь +х +ґ +ш +ц +ф +я +щ +ж +Г +Х +У +Т +Е +І +Н +П +З +Л +Ю +С +Д +М +К +Р +Ф +О +Ц +И +Я +Ч +Ш +Ж +Є +Ґ +Ь +s +c +e +n +w +A +P +r +E +t +o +h +d +y +M +G +N +F +B +T +D +U +O +W +Z +f +H +Y +b +K +z +x +Q +X +q +J +$ +- +' +# +& +% +? +: +! +, ++ +@ +( +é +É + diff --git a/batch_running_task/pytorchocr/utils/dict/ur_dict.txt b/batch_running_task/pytorchocr/utils/dict/ur_dict.txt new file mode 100644 index 0000000..c06786a --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/ur_dict.txt @@ -0,0 +1,137 @@ +u +r +_ +i +m +g +/ +3 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p + +چ +ٹ +پ +ا +ئ +ی +ے +4 +6 +و +ل +ن +ڈ +ھ +ک +ت +ش +ف +ق +ر +د +5 +ب +ج +خ +ہ +س +ز +غ +ڑ +ں +آ +م +ؤ +ط +ص +ح +ع +گ +ث +ض +ذ +ۓ +ِ +ء +ظ +ً +ي +ُ +ۃ +أ +ٰ +ە +ژ +ۂ +ة +ّ +ك +ه +s +c +e +n +w +o +d +t +D +M +T +U +E +b +P +h +y +W +H +A +x +B +O +N +G +Y +Q +F +k +K +q +J +Z +f +z +X +' +@ +& +! +, +: +$ +- +# +? +% +é ++ +( +É diff --git a/batch_running_task/pytorchocr/utils/dict/xi_dict.txt b/batch_running_task/pytorchocr/utils/dict/xi_dict.txt new file mode 100644 index 0000000..f195f1e --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict/xi_dict.txt @@ -0,0 +1,110 @@ +x +i +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +a +l +3 +6 +4 +5 +. +j +p + +Q +u +e +r +o +8 +7 +n +c +9 +t +b +é +q +d +ó +y +F +s +, +O +í +T +f +" +U +M +h +: +P +H +A +E +D +z +N +á +ñ +ú +% +; +è ++ +Y +- +B +G +( +) +¿ +? +w +¡ +! +X +É +K +k +Á +ü +Ú +« +» +J +' +ö +W +Z +º +Ö +­ +[ +] +Ç +ç +à +ä +û +ò +Í +ê +ô +ø +ª diff --git a/batch_running_task/pytorchocr/utils/dict90.txt b/batch_running_task/pytorchocr/utils/dict90.txt new file mode 100644 index 0000000..a945ae9 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/dict90.txt @@ -0,0 +1,90 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? 
+@ +[ +\ +] +_ +` +~ \ No newline at end of file diff --git a/batch_running_task/pytorchocr/utils/e2e_utils/extract_batchsize.py b/batch_running_task/pytorchocr/utils/e2e_utils/extract_batchsize.py new file mode 100644 index 0000000..6c6a8bf --- /dev/null +++ b/batch_running_task/pytorchocr/utils/e2e_utils/extract_batchsize.py @@ -0,0 +1,88 @@ +raise ValueError('utils -> e2e_utils -> extract_batchsize') +import paddle +import numpy as np +import copy + + +def org_tcl_rois(batch_size, pos_lists, pos_masks, label_lists, tcl_bs): + """ + """ + pos_lists_, pos_masks_, label_lists_ = [], [], [] + img_bs = batch_size + ngpu = int(batch_size / img_bs) + img_ids = np.array(pos_lists, dtype=np.int32)[:, 0, 0].copy() + pos_lists_split, pos_masks_split, label_lists_split = [], [], [] + for i in range(ngpu): + pos_lists_split.append([]) + pos_masks_split.append([]) + label_lists_split.append([]) + + for i in range(img_ids.shape[0]): + img_id = img_ids[i] + gpu_id = int(img_id / img_bs) + img_id = img_id % img_bs + pos_list = pos_lists[i].copy() + pos_list[:, 0] = img_id + pos_lists_split[gpu_id].append(pos_list) + pos_masks_split[gpu_id].append(pos_masks[i].copy()) + label_lists_split[gpu_id].append(copy.deepcopy(label_lists[i])) + # repeat or delete + for i in range(ngpu): + vp_len = len(pos_lists_split[i]) + if vp_len <= tcl_bs: + for j in range(0, tcl_bs - vp_len): + pos_list = pos_lists_split[i][j].copy() + pos_lists_split[i].append(pos_list) + pos_mask = pos_masks_split[i][j].copy() + pos_masks_split[i].append(pos_mask) + label_list = copy.deepcopy(label_lists_split[i][j]) + label_lists_split[i].append(label_list) + else: + for j in range(0, vp_len - tcl_bs): + c_len = len(pos_lists_split[i]) + pop_id = np.random.permutation(c_len)[0] + pos_lists_split[i].pop(pop_id) + pos_masks_split[i].pop(pop_id) + label_lists_split[i].pop(pop_id) + # merge + for i in range(ngpu): + pos_lists_.extend(pos_lists_split[i]) + pos_masks_.extend(pos_masks_split[i]) + label_lists_.extend(label_lists_split[i]) + return pos_lists_, pos_masks_, label_lists_ + + +def pre_process(label_list, pos_list, pos_mask, max_text_length, max_text_nums, + pad_num, tcl_bs): + label_list = label_list.numpy() + batch, _, _, _ = label_list.shape + pos_list = pos_list.numpy() + pos_mask = pos_mask.numpy() + pos_list_t = [] + pos_mask_t = [] + label_list_t = [] + for i in range(batch): + for j in range(max_text_nums): + if pos_mask[i, j].any(): + pos_list_t.append(pos_list[i][j]) + pos_mask_t.append(pos_mask[i][j]) + label_list_t.append(label_list[i][j]) + pos_list, pos_mask, label_list = org_tcl_rois(batch, pos_list_t, pos_mask_t, + label_list_t, tcl_bs) + label = [] + tt = [l.tolist() for l in label_list] + for i in range(tcl_bs): + k = 0 + for j in range(max_text_length): + if tt[i][j][0] != pad_num: + k += 1 + else: + break + label.append(k) + label = paddle.to_tensor(label) + label = paddle.cast(label, dtype='int64') + pos_list = paddle.to_tensor(pos_list) + pos_mask = paddle.to_tensor(pos_mask) + label_list = paddle.squeeze(paddle.to_tensor(label_list), axis=2) + label_list = paddle.cast(label_list, dtype='int32') + return pos_list, pos_mask, label_list, label diff --git a/batch_running_task/pytorchocr/utils/e2e_utils/extract_textpoint_fast.py b/batch_running_task/pytorchocr/utils/e2e_utils/extract_textpoint_fast.py new file mode 100644 index 0000000..787cd30 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/e2e_utils/extract_textpoint_fast.py @@ -0,0 +1,457 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains various CTC decoders.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import math + +import numpy as np +from itertools import groupby +from skimage.morphology._skeletonize import thin + + +def get_dict(character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + +def softmax(logits): + """ + logits: N x d + """ + max_value = np.max(logits, axis=1, keepdims=True) + exp = np.exp(logits - max_value) + exp_sum = np.sum(exp, axis=1, keepdims=True) + dist = exp / exp_sum + return dist + + +def get_keep_pos_idxs(labels, remove_blank=None): + """ + Remove duplicate and get pos idxs of keep items. + The value of keep_blank should be [None, 95]. + """ + duplicate_len_list = [] + keep_pos_idx_list = [] + keep_char_idx_list = [] + for k, v_ in groupby(labels): + current_len = len(list(v_)) + if k != remove_blank: + current_idx = int(sum(duplicate_len_list) + current_len // 2) + keep_pos_idx_list.append(current_idx) + keep_char_idx_list.append(k) + duplicate_len_list.append(current_len) + return keep_char_idx_list, keep_pos_idx_list + + +def remove_blank(labels, blank=0): + new_labels = [x for x in labels if x != blank] + return new_labels + + +def insert_blank(labels, blank=0): + new_labels = [blank] + for l in labels: + new_labels += [l, blank] + return new_labels + + +def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True): + """ + CTC greedy (best path) decoder. + """ + raw_str = np.argmax(np.array(probs_seq), axis=1) + remove_blank_in_pos = None if keep_blank_in_idxs else blank + dedup_str, keep_idx_list = get_keep_pos_idxs( + raw_str, remove_blank=remove_blank_in_pos) + dst_str = remove_blank(dedup_str, blank=blank) + return dst_str, keep_idx_list + + +def instance_ctc_greedy_decoder(gather_info, logits_map, pts_num=4): + _, _, C = logits_map.shape + ys, xs = zip(*gather_info) + logits_seq = logits_map[list(ys), list(xs)] + probs_seq = logits_seq + labels = np.argmax(probs_seq, axis=1) + dst_str = [k for k, v_ in groupby(labels) if k != C - 1] + detal = len(gather_info) // (pts_num - 1) + keep_idx_list = [0] + [detal * (i + 1) for i in range(pts_num - 2)] + [-1] + keep_gather_list = [gather_info[idx] for idx in keep_idx_list] + return dst_str, keep_gather_list + + +def ctc_decoder_for_image(gather_info_list, + logits_map, + Lexicon_Table, + pts_num=6): + """ + CTC decoder using multiple processes. 
+ """ + decoder_str = [] + decoder_xys = [] + for gather_info in gather_info_list: + if len(gather_info) < pts_num: + continue + dst_str, xys_list = instance_ctc_greedy_decoder( + gather_info, logits_map, pts_num=pts_num) + dst_str_readable = ''.join([Lexicon_Table[idx] for idx in dst_str]) + if len(dst_str_readable) < 2: + continue + decoder_str.append(dst_str_readable) + decoder_xys.append(xys_list) + return decoder_str, decoder_xys + + +def sort_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list, point_direction): + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point, np.array(sorted_direction) + + +def add_id(pos_list, image_id=0): + """ + Add id for gather feature, for inference. + """ + new_list = [] + for item in pos_list: + new_list.append((image_id, item[0], item[1])) + return new_list + + +def sort_and_expand_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + left_list = [] + right_list = [] + for i in range(append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + left_list.append((ly, lx)) + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + right_list.append((ry, rx)) + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def sort_and_expand_with_direction_v2(pos_list, f_direction, binary_tcl_map): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + binary_tcl_map: h x w + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + max_append_num = 2 * append_num + + left_list = [] + right_list = [] + for i in range(max_append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + if binary_tcl_map[ly, lx] > 0.5: + left_list.append((ly, lx)) + else: + break + + for i in range(max_append_num): + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + if binary_tcl_map[ry, rx] > 0.5: + right_list.append((ry, rx)) + else: + break + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. 
+ """ + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2) + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. + """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def restore_poly(instance_yxs_list, seq_strs, p_border, ratio_w, ratio_h, src_w, + src_h, valid_set): + poly_list = [] + keep_str_list = [] + for yx_center_line, keep_str in zip(instance_yxs_list, seq_strs): + if len(keep_str) < 2: + print('--> too short, {}'.format(keep_str)) + continue + + offset_expand = 1.0 + if valid_set == 'totaltext': + offset_expand = 1.2 + + point_pair_list = [] + for y, x in yx_center_line: + offset = p_border[:, y, x].reshape(2, 2) * offset_expand + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = (ori_yx + offset)[:, ::-1] * 4.0 / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + detected_poly = point_pair2poly(point_pair_list) + detected_poly = expand_poly_along_width( + detected_poly, shrink_ratio_of_width=0.2) + detected_poly[:, 0] = np.clip(detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip(detected_poly[:, 1], a_min=0, a_max=src_h) + + keep_str_list.append(keep_str) + if valid_set == 'partvgg': + middle_point = len(detected_poly) // 2 + detected_poly = detected_poly[ + [0, middle_point - 1, middle_point, -1], :] + poly_list.append(detected_poly) + elif valid_set == 'totaltext': + poly_list.append(detected_poly) + else: + print('--> Not supported format.') + exit(-1) + return poly_list, keep_str_list + + +def generate_pivot_list_fast(p_score, + p_char_maps, + f_direction, + Lexicon_Table, + score_thresh=0.5): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map.astype(np.uint8)) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = 
np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + if len(pos_list) < 3: + continue + + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + all_pos_yxs.append(pos_list_sorted) + + p_char_maps = p_char_maps.transpose([1, 2, 0]) + decoded_str, keep_yxs_list = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, Lexicon_Table=Lexicon_Table) + return keep_yxs_list, decoded_str + + +def extract_main_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + pos_list = np.array(pos_list) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + average_direction = average_direction / ( + np.linalg.norm(average_direction) + 1e-6) + return average_direction + + +def sort_by_direction_with_image_id_deprecated(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[id, y, x], [id, y, x], [id, y, x] ...] + """ + pos_list_full = np.array(pos_list).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + return sorted_list + + +def sort_by_direction_with_image_id(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list_full, point_direction): + pos_list_full = np.array(pos_list_full).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 3) + point_direction = f_direction[pos_list[:, 1], pos_list[:, 2]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point diff --git a/batch_running_task/pytorchocr/utils/e2e_utils/extract_textpoint_slow.py b/batch_running_task/pytorchocr/utils/e2e_utils/extract_textpoint_slow.py new file mode 100644 index 0000000..ace46fb --- /dev/null +++ b/batch_running_task/pytorchocr/utils/e2e_utils/extract_textpoint_slow.py @@ -0,0 +1,592 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains various CTC decoders.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import math + +import numpy as np +from itertools import groupby +from skimage.morphology._skeletonize import thin + + +def get_dict(character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + pair_length_list = [] + for point_pair in point_pair_list: + pair_length = np.linalg.norm(point_pair[0] - point_pair[1]) + pair_length_list.append(pair_length) + pair_length_list = np.array(pair_length_list) + pair_info = (pair_length_list.max(), pair_length_list.min(), + pair_length_list.mean()) + + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2), pair_info + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. + """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def softmax(logits): + """ + logits: N x d + """ + max_value = np.max(logits, axis=1, keepdims=True) + exp = np.exp(logits - max_value) + exp_sum = np.sum(exp, axis=1, keepdims=True) + dist = exp / exp_sum + return dist + + +def get_keep_pos_idxs(labels, remove_blank=None): + """ + Remove duplicate and get pos idxs of keep items. 
+ The value of keep_blank should be [None, 95]. + """ + duplicate_len_list = [] + keep_pos_idx_list = [] + keep_char_idx_list = [] + for k, v_ in groupby(labels): + current_len = len(list(v_)) + if k != remove_blank: + current_idx = int(sum(duplicate_len_list) + current_len // 2) + keep_pos_idx_list.append(current_idx) + keep_char_idx_list.append(k) + duplicate_len_list.append(current_len) + return keep_char_idx_list, keep_pos_idx_list + + +def remove_blank(labels, blank=0): + new_labels = [x for x in labels if x != blank] + return new_labels + + +def insert_blank(labels, blank=0): + new_labels = [blank] + for l in labels: + new_labels += [l, blank] + return new_labels + + +def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True): + """ + CTC greedy (best path) decoder. + """ + raw_str = np.argmax(np.array(probs_seq), axis=1) + remove_blank_in_pos = None if keep_blank_in_idxs else blank + dedup_str, keep_idx_list = get_keep_pos_idxs( + raw_str, remove_blank=remove_blank_in_pos) + dst_str = remove_blank(dedup_str, blank=blank) + return dst_str, keep_idx_list + + +def instance_ctc_greedy_decoder(gather_info, + logits_map, + keep_blank_in_idxs=True): + """ + gather_info: [[x, y], [x, y] ...] + logits_map: H x W X (n_chars + 1) + """ + _, _, C = logits_map.shape + ys, xs = zip(*gather_info) + logits_seq = logits_map[list(ys), list(xs)] # n x 96 + probs_seq = softmax(logits_seq) + dst_str, keep_idx_list = ctc_greedy_decoder( + probs_seq, blank=C - 1, keep_blank_in_idxs=keep_blank_in_idxs) + keep_gather_list = [gather_info[idx] for idx in keep_idx_list] + return dst_str, keep_gather_list + + +def ctc_decoder_for_image(gather_info_list, logits_map, + keep_blank_in_idxs=True): + """ + CTC decoder using multiple processes. + """ + decoder_results = [] + for gather_info in gather_info_list: + res = instance_ctc_greedy_decoder( + gather_info, logits_map, keep_blank_in_idxs=keep_blank_in_idxs) + decoder_results.append(res) + return decoder_results + + +def sort_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
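+    Returns the points sorted by their projection onto the average direction
+    vector, together with the per-point direction vectors (both in (y, x) order).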
+ """ + + def sort_part_with_direction(pos_list, point_direction): + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point, np.array(sorted_direction) + + +def add_id(pos_list, image_id=0): + """ + Add id for gather feature, for inference. + """ + new_list = [] + for item in pos_list: + new_list.append((image_id, item[0], item[1])) + return new_list + + +def sort_and_expand_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + # expand along + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + left_list = [] + right_list = [] + for i in range(append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + left_list.append((ly, lx)) + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + right_list.append((ry, rx)) + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def sort_and_expand_with_direction_v2(pos_list, f_direction, binary_tcl_map): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ binary_tcl_map: h x w + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + # expand along + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + max_append_num = 2 * append_num + + left_list = [] + right_list = [] + for i in range(max_append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + if binary_tcl_map[ly, lx] > 0.5: + left_list.append((ly, lx)) + else: + break + + for i in range(max_append_num): + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + if binary_tcl_map[ry, rx] > 0.5: + right_list.append((ry, rx)) + else: + break + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def generate_pivot_list_curved(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_expand=True, + is_backbone=False, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + center_pos_yxs = [] + end_points_yxs = [] + instance_center_pos_yxs = [] + pred_strs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + ### FIX-ME, eliminate outlier + if len(pos_list) < 3: + continue + + if is_expand: + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + else: + pos_list_sorted, _ = sort_with_direction(pos_list, f_direction) + all_pos_yxs.append(pos_list_sorted) + + # use decoder to filter backgroud points. 
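+    # greedy CTC decoding collapses runs of identical predictions to a single
+    # representative point along the centre line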
+ p_char_maps = p_char_maps.transpose([1, 2, 0]) + decode_res = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, keep_blank_in_idxs=True) + for decoded_str, keep_yxs_list in decode_res: + if is_backbone: + keep_yxs_list_with_id = add_id(keep_yxs_list, image_id=image_id) + instance_center_pos_yxs.append(keep_yxs_list_with_id) + pred_strs.append(decoded_str) + else: + end_points_yxs.extend((keep_yxs_list[0], keep_yxs_list[-1])) + center_pos_yxs.extend(keep_yxs_list) + + if is_backbone: + return pred_strs, instance_center_pos_yxs + else: + return center_pos_yxs, end_points_yxs + + +def generate_pivot_list_horizontal(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map_bi = (p_score > score_thresh) * 1.0 + instance_count, instance_label_map = cv2.connectedComponents( + p_tcl_map_bi.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + center_pos_yxs = [] + end_points_yxs = [] + instance_center_pos_yxs = [] + + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + ### FIX-ME, eliminate outlier + if len(pos_list) < 5: + continue + + # add rule here + main_direction = extract_main_direction(pos_list, + f_direction) # y x + reference_directin = np.array([0, 1]).reshape([-1, 2]) # y x + is_h_angle = abs(np.sum( + main_direction * reference_directin)) < math.cos(math.pi / 180 * + 70) + + point_yxs = np.array(pos_list) + max_y, max_x = np.max(point_yxs, axis=0) + min_y, min_x = np.min(point_yxs, axis=0) + is_h_len = (max_y - min_y) < 1.5 * (max_x - min_x) + + pos_list_final = [] + if is_h_len: + xs = np.unique(xs) + for x in xs: + ys = instance_label_map[:, x].copy().reshape((-1, )) + y = int(np.where(ys == instance_id)[0].mean()) + pos_list_final.append((y, x)) + else: + ys = np.unique(ys) + for y in ys: + xs = instance_label_map[y, :].copy().reshape((-1, )) + x = int(np.where(xs == instance_id)[0].mean()) + pos_list_final.append((y, x)) + + pos_list_sorted, _ = sort_with_direction(pos_list_final, + f_direction) + all_pos_yxs.append(pos_list_sorted) + + # use decoder to filter backgroud points. + p_char_maps = p_char_maps.transpose([1, 2, 0]) + decode_res = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, keep_blank_in_idxs=True) + for decoded_str, keep_yxs_list in decode_res: + if is_backbone: + keep_yxs_list_with_id = add_id(keep_yxs_list, image_id=image_id) + instance_center_pos_yxs.append(keep_yxs_list_with_id) + else: + end_points_yxs.extend((keep_yxs_list[0], keep_yxs_list[-1])) + center_pos_yxs.extend(keep_yxs_list) + + if is_backbone: + return instance_center_pos_yxs + else: + return center_pos_yxs, end_points_yxs + + +def generate_pivot_list_slow(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + is_curved=True, + image_id=0): + """ + Warp all the function together. 
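+    Dispatches to the curved or the horizontal pivot-list generator
+    depending on `is_curved`.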
+ """ + if is_curved: + return generate_pivot_list_curved( + p_score, + p_char_maps, + f_direction, + score_thresh=score_thresh, + is_expand=True, + is_backbone=is_backbone, + image_id=image_id) + else: + return generate_pivot_list_horizontal( + p_score, + p_char_maps, + f_direction, + score_thresh=score_thresh, + is_backbone=is_backbone, + image_id=image_id) + + +# for refine module +def extract_main_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + pos_list = np.array(pos_list) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + average_direction = average_direction / ( + np.linalg.norm(average_direction) + 1e-6) + return average_direction + + +def sort_by_direction_with_image_id_deprecated(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[id, y, x], [id, y, x], [id, y, x] ...] + """ + pos_list_full = np.array(pos_list).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + return sorted_list + + +def sort_by_direction_with_image_id(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list_full, point_direction): + pos_list_full = np.array(pos_list_full).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 3) + point_direction = f_direction[pos_list[:, 1], pos_list[:, 2]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point + + +def generate_pivot_list_tt_inference(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + is_curved=True, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map) + instance_count, instance_label_map = 
cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + ### FIX-ME, eliminate outlier + if len(pos_list) < 3: + continue + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + pos_list_sorted_with_id = add_id(pos_list_sorted, image_id=image_id) + all_pos_yxs.append(pos_list_sorted_with_id) + return all_pos_yxs diff --git a/batch_running_task/pytorchocr/utils/e2e_utils/pgnet_pp_utils.py b/batch_running_task/pytorchocr/utils/e2e_utils/pgnet_pp_utils.py new file mode 100644 index 0000000..ce35a28 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/e2e_utils/pgnet_pp_utils.py @@ -0,0 +1,150 @@ + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import torch +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) +from .extract_textpoint_slow import * +from .extract_textpoint_fast import generate_pivot_list_fast, restore_poly + + +class PGNet_PostProcess(object): + # two different post-process + def __init__(self, character_dict_path, valid_set, score_thresh, outs_dict, + shape_list): + self.Lexicon_Table = get_dict(character_dict_path) + self.valid_set = valid_set + self.score_thresh = score_thresh + self.outs_dict = outs_dict + self.shape_list = shape_list + + def pg_postprocess_fast(self): + p_score = self.outs_dict['f_score'] + p_border = self.outs_dict['f_border'] + p_char = self.outs_dict['f_char'] + p_direction = self.outs_dict['f_direction'] + if isinstance(p_score, torch.Tensor): + p_score = p_score[0].numpy() + p_border = p_border[0].numpy() + p_direction = p_direction[0].numpy() + p_char = p_char[0].numpy() + else: + p_score = p_score[0] + p_border = p_border[0] + p_direction = p_direction[0] + p_char = p_char[0] + + src_h, src_w, ratio_h, ratio_w = self.shape_list[0] + instance_yxs_list, seq_strs = generate_pivot_list_fast( + p_score, + p_char, + p_direction, + self.Lexicon_Table, + score_thresh=self.score_thresh) + poly_list, keep_str_list = restore_poly(instance_yxs_list, seq_strs, + p_border, ratio_w, ratio_h, + src_w, src_h, self.valid_set) + data = { + 'points': poly_list, + 'texts': keep_str_list, + } + return data + + def pg_postprocess_slow(self): + p_score = self.outs_dict['f_score'] + p_border = self.outs_dict['f_border'] + p_char = self.outs_dict['f_char'] + p_direction = self.outs_dict['f_direction'] + if isinstance(p_score, torch.Tensor): + p_score = p_score[0].numpy() + p_border = p_border[0].numpy() + p_direction = p_direction[0].numpy() + p_char = p_char[0].numpy() + else: + p_score = p_score[0] + p_border = p_border[0] + p_direction = p_direction[0] + p_char = p_char[0] + src_h, src_w, ratio_h, ratio_w = self.shape_list[0] + is_curved = self.valid_set == "totaltext" + char_seq_idx_set, instance_yxs_list = generate_pivot_list_slow( + p_score, + p_char, + p_direction, + score_thresh=self.score_thresh, + is_backbone=True, + is_curved=is_curved) + seq_strs = [] + for char_idx_set in char_seq_idx_set: + pr_str = ''.join([self.Lexicon_Table[pos] for pos in char_idx_set]) + seq_strs.append(pr_str) + poly_list = [] + keep_str_list = [] + all_point_list = [] + all_point_pair_list = [] + for yx_center_line, keep_str in zip(instance_yxs_list, seq_strs): + if 
len(yx_center_line) == 1: + yx_center_line.append(yx_center_line[-1]) + + offset_expand = 1.0 + if self.valid_set == 'totaltext': + offset_expand = 1.2 + + point_pair_list = [] + for batch_id, y, x in yx_center_line: + offset = p_border[:, y, x].reshape(2, 2) + if offset_expand != 1.0: + offset_length = np.linalg.norm( + offset, axis=1, keepdims=True) + expand_length = np.clip( + offset_length * (offset_expand - 1), + a_min=0.5, + a_max=3.0) + offset_detal = offset / offset_length * expand_length + offset = offset + offset_detal + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = (ori_yx + offset)[:, ::-1] * 4.0 / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + all_point_list.append([ + int(round(x * 4.0 / ratio_w)), + int(round(y * 4.0 / ratio_h)) + ]) + all_point_pair_list.append(point_pair.round().astype(np.int32) + .tolist()) + + detected_poly, pair_length_info = point_pair2poly(point_pair_list) + detected_poly = expand_poly_along_width( + detected_poly, shrink_ratio_of_width=0.2) + detected_poly[:, 0] = np.clip( + detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip( + detected_poly[:, 1], a_min=0, a_max=src_h) + + if len(keep_str) < 2: + continue + + keep_str_list.append(keep_str) + detected_poly = np.round(detected_poly).astype('int32') + if self.valid_set == 'partvgg': + middle_point = len(detected_poly) // 2 + detected_poly = detected_poly[ + [0, middle_point - 1, middle_point, -1], :] + poly_list.append(detected_poly) + elif self.valid_set == 'totaltext': + poly_list.append(detected_poly) + else: + print('--> Not supported format.') + exit(-1) + data = { + 'points': poly_list, + 'texts': keep_str_list, + } + return data diff --git a/batch_running_task/pytorchocr/utils/e2e_utils/visual.py b/batch_running_task/pytorchocr/utils/e2e_utils/visual.py new file mode 100644 index 0000000..e6e4fd0 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/e2e_utils/visual.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
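+"""Image resizing and polygon geometry helpers for the e2e text-spotting utilities."""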
+import numpy as np +import cv2 +import time + + +def resize_image(im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + +def resize_image_min(im, max_side_len=512): + """ + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + if resize_h < resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + +def resize_image_for_totaltext(im, max_side_len=512): + """ + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + pair_length_list = [] + for point_pair in point_pair_list: + pair_length = np.linalg.norm(point_pair[0] - point_pair[1]) + pair_length_list.append(pair_length) + pair_length_list = np.array(pair_length_list) + pair_info = (pair_length_list.max(), pair_length_list.min(), + pair_length_list.mean()) + + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2), pair_info + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. 
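+    The end point pairs (poly[0]/poly[-1] and the two middle vertices) are pushed
+    outwards along the text direction by roughly `shrink_ratio_of_width` times the
+    local text height, so the polygon covers the whole line.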
+ """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def norm2(x, axis=None): + if axis: + return np.sqrt(np.sum(x**2, axis=axis)) + return np.sqrt(np.sum(x**2)) + + +def cos(p1, p2): + return (p1 * p2).sum() / (norm2(p1) * norm2(p2)) diff --git a/batch_running_task/pytorchocr/utils/en_dict.txt b/batch_running_task/pytorchocr/utils/en_dict.txt new file mode 100644 index 0000000..7677d31 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/en_dict.txt @@ -0,0 +1,95 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ + diff --git a/batch_running_task/pytorchocr/utils/ic15_dict.txt b/batch_running_task/pytorchocr/utils/ic15_dict.txt new file mode 100644 index 0000000..4740603 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/ic15_dict.txt @@ -0,0 +1,36 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z \ No newline at end of file diff --git a/batch_running_task/pytorchocr/utils/logging.py b/batch_running_task/pytorchocr/utils/logging.py new file mode 100644 index 0000000..68d0c2f --- /dev/null +++ b/batch_running_task/pytorchocr/utils/logging.py @@ -0,0 +1,52 @@ +import os +import sys +import logging +import functools +import torch.distributed as dist + +logger_initialized = {} + + +@functools.lru_cache() +def get_logger(name='root', log_file=None, log_level=logging.DEBUG): + """Initialize and get a logger by name. + If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. If `log_file` is specified a FileHandler will also be added. + Args: + name (str): Logger name. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + Returns: + logging.Logger: The expected logger. 
+ """ + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + for logger_name in logger_initialized: + if name.startswith(logger_name): + return logger + + formatter = logging.Formatter( + '[%(asctime)s] %(name)s %(levelname)s: %(message)s', + datefmt="%Y/%m/%d %H:%M:%S") + + stream_handler = logging.StreamHandler(stream=sys.stdout) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + if log_file is not None and dist.get_rank() == 0: + log_file_folder = os.path.split(log_file)[0] + os.makedirs(log_file_folder, exist_ok=True) + file_handler = logging.FileHandler(log_file, 'a') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + # if dist.get_rank() == 0: + # logger.setLevel(log_level) + # else: + # logger.setLevel(logging.ERROR) + logger_initialized[name] = True + return logger \ No newline at end of file diff --git a/batch_running_task/pytorchocr/utils/poly_nms.py b/batch_running_task/pytorchocr/utils/poly_nms.py new file mode 100644 index 0000000..9dcb3d2 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/poly_nms.py @@ -0,0 +1,146 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from shapely.geometry import Polygon + + +def points2polygon(points): + """Convert k points to 1 polygon. + + Args: + points (ndarray or list): A ndarray or a list of shape (2k) + that indicates k points. + + Returns: + polygon (Polygon): A polygon object. + """ + if isinstance(points, list): + points = np.array(points) + + assert isinstance(points, np.ndarray) + assert (points.size % 2 == 0) and (points.size >= 8) + + point_mat = points.reshape([-1, 2]) + return Polygon(point_mat) + + +def poly_intersection(poly_det, poly_gt, buffer=0.0001): + """Calculate the intersection area between two polygon. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + intersection_area (float): The intersection area between two polygons. + """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + + if buffer == 0: + poly_inter = poly_det & poly_gt + else: + poly_inter = poly_det.buffer(buffer) & poly_gt.buffer(buffer) + return poly_inter.area, poly_inter + + +def poly_union(poly_det, poly_gt): + """Calculate the union area between two polygon. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + union_area (float): The union area between two polygons. 
+ """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + + area_det = poly_det.area + area_gt = poly_gt.area + area_inters, _ = poly_intersection(poly_det, poly_gt) + return area_det + area_gt - area_inters + + +def valid_boundary(x, with_score=True): + num = len(x) + if num < 8: + return False + if num % 2 == 0 and (not with_score): + return True + if num % 2 == 1 and with_score: + return True + + return False + + +def boundary_iou(src, target): + """Calculate the IOU between two boundaries. + + Args: + src (list): Source boundary. + target (list): Target boundary. + + Returns: + iou (float): The iou between two boundaries. + """ + assert valid_boundary(src, False) + assert valid_boundary(target, False) + src_poly = points2polygon(src) + target_poly = points2polygon(target) + + return poly_iou(src_poly, target_poly) + + +def poly_iou(poly_det, poly_gt): + """Calculate the IOU between two polygons. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + iou (float): The IOU between two polygons. + """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + area_inters, _ = poly_intersection(poly_det, poly_gt) + area_union = poly_union(poly_det, poly_gt) + if area_union == 0: + return 0.0 + return area_inters / area_union + + +def poly_nms(polygons, threshold): + assert isinstance(polygons, list) + + polygons = np.array(sorted(polygons, key=lambda x: x[-1])) + + keep_poly = [] + index = [i for i in range(polygons.shape[0])] + + while len(index) > 0: + keep_poly.append(polygons[index[-1]].tolist()) + A = polygons[index[-1]][:-1] + index = np.delete(index, -1) + iou_list = np.zeros((len(index), )) + for i in range(len(index)): + B = polygons[index[i]][:-1] + iou_list[i] = boundary_iou(A, B) + remove_index = np.where(iou_list > threshold) + index = np.delete(index, remove_index) + + return keep_poly diff --git a/batch_running_task/pytorchocr/utils/ppocr_keys_v1.txt b/batch_running_task/pytorchocr/utils/ppocr_keys_v1.txt new file mode 100644 index 0000000..84b885d --- /dev/null +++ b/batch_running_task/pytorchocr/utils/ppocr_keys_v1.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! 
+姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 
+扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? 
+郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 
+炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 
+胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 
+閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/batch_running_task/pytorchocr/utils/utility.py b/batch_running_task/pytorchocr/utils/utility.py new file mode 100644 index 0000000..70bb7b7 --- /dev/null +++ b/batch_running_task/pytorchocr/utils/utility.py @@ -0,0 +1,70 @@ +import os +import imghdr +import cv2 +import logging + +def get_image_file_list(img_file): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'GIF'} + if os.path.isfile(img_file) and imghdr.what(img_file) in img_end: + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + file_path = os.path.join(img_file, single_file) + if imghdr.what(file_path) in img_end: + imgs_lists.append(file_path) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + return imgs_lists + + +def check_and_read_gif(img_path): + if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: + gif = cv2.VideoCapture(img_path) + ret, frame = gif.read() + if not ret: + # logger = logging.getLogger('ppocr') + print("Cannot read {}. This gif image maybe corrupted.") + # logger.info("Cannot read {}. 
This gif image maybe corrupted.") + return None, False + if len(frame.shape) == 2 or frame.shape[-1] == 1: + frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) + imgvalue = frame[:, :, ::-1] + return imgvalue, True + return None, False + + +def check_and_read(img_path): + if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: + gif = cv2.VideoCapture(img_path) + ret, frame = gif.read() + if not ret: + logger = logging.getLogger('ppocr') + logger.info("Cannot read {}. This gif image maybe corrupted.") + return None, False + if len(frame.shape) == 2 or frame.shape[-1] == 1: + frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) + imgvalue = frame[:, :, ::-1] + return imgvalue, True, False + elif os.path.basename(img_path)[-3:] in ['pdf']: + import fitz + from PIL import Image + imgs = [] + with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + return imgs, False, True + return None, False, False \ No newline at end of file diff --git a/batch_running_task/scihub_pdf_dataset.py b/batch_running_task/scihub_pdf_dataset.py new file mode 100644 index 0000000..0d0f546 --- /dev/null +++ b/batch_running_task/scihub_pdf_dataset.py @@ -0,0 +1,827 @@ + +from get_data_utils import * +from utils import collect_mfdetrec_res_per_page, formula_in_text +from torch.utils.data import IterableDataset,get_worker_info,DataLoader, Dataset +from utils import Timers,convert_boxes +import torch +from utils import collect_paragraph_image_and_its_coordinate + +def update_det_boxes(dt_boxes, mfdetrec_res): + new_dt_boxes = dt_boxes + for mf_box in mfdetrec_res: + flag, left_box, right_box = False, None, None + for idx, text_box in enumerate(new_dt_boxes): + if 'bbox' in mf_box: + bbox = mf_box['bbox'] + elif 'poly' in mf_box: + xmin, ymin = int(mf_box['poly'][0]), int(mf_box['poly'][1]) + xmax, ymax = int(mf_box['poly'][4]), int(mf_box['poly'][5]) + bbox= [xmin, ymin, xmax, ymax] + else: + raise NotImplementedError("mf_box should have bbox or poly") + ret, left_box, right_box = formula_in_text(bbox, text_box) + if ret: + new_dt_boxes.pop(idx) + if left_box is not None: + new_dt_boxes.append(left_box) + if right_box is not None: + new_dt_boxes.append(right_box) + break + + return new_dt_boxes + +def clean_pdf_path(pdf_path): + return pdf_path[len("opendata:"):] if pdf_path.startswith("opendata:") else pdf_path + + +class ImageTransformersUtils: + + def prepare_for_mfd_model(self, im:np.ndarray): + if self.mfd_pre_transform is None :return im + assert im.ndim==3 + im = [im] + im = np.stack(self.mfd_pre_transform(im)) + im = im[..., ::-1].transpose((0, 3, 1, 2)) # BGR to RGB, BHWC to BCHW, (n, 3, h, w) + im = np.ascontiguousarray(im) # contiguous + im = im.astype('float')/255 + im = torch.from_numpy(im) + return im[0] + + def prepare_for_text_det(self,image): + return self.text_det_transform(image)[0] + +class PDFImageDataset(IterableDataset,DatasetUtils,ImageTransformersUtils): + #client = build_client() + def __init__(self, metadata_filepath, aug, input_format, + mfd_pre_transform, det_pre_transform=None, + return_original_image=False,timer=Timers(False), + partion_num = 1, + partion_idx = 0): + super().__init__() + 
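+        # load the metadata list and keep only the slice assigned to this job (partion_idx out of partion_num)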
self.metadata= self.smart_read_json(metadata_filepath) + self.metadata= np.array_split(self.metadata, partion_num)[partion_idx] + self.dpi = 200 + self.aug = aug + self.input_format = input_format + self.mfd_pre_transform = mfd_pre_transform + self.return_original_image = return_original_image + self.det_pre_transform = det_pre_transform + self.timer = timer + + def get_pdf_by_index(self,index): + pdf_path = self.metadata[index]['path'] + return self.get_pdf_buffer(pdf_path) + + def __iter__(self): + worker_info = get_worker_info() + if worker_info is None: # single-process data loading, return the full iterator + self.current_pdf_index = 0 + self.current_page_index = 0 + else: # in a worker process + # split workload + num_workers = worker_info.num_workers + worker_id = worker_info.id + self.current_pdf_index = worker_id + self.current_page_index = 0 + + self.pdf = self.get_pdf_by_index(self.current_pdf_index) + return self + + @property + def current_doc(self): + if len(self.last_read_pdf_buffer)==0:return None + return list(self.last_read_pdf_buffer.values())[0] + + def prepare_for_text_det(self,image): + return self.text_det_transform(image)[0] + + def read_data_based_on_current_state(self): + #print(f"read image from current_page_index={self.current_page_index} ") + + with self.timer("load_page"): + page = self.get_pdf_by_index(self.current_pdf_index).load_page(self.current_page_index) + with self.timer("from_page_to_pimage"): + oimage = process_pdf_page_to_image(page, self.dpi) + original_image = oimage[:, :, ::-1] if self.input_format == "RGB" else oimage + height, width = original_image.shape[:2] + with self.timer("get_layout_image"): + layout_image = self.aug.get_transform(original_image).apply_image(original_image) + layout_image = torch.as_tensor(layout_image.astype("float32").transpose(2, 0, 1))[:,:1042,:800] ## it will be 1043x800 --> 1042:800 + ## lets make sure the image has correct size + # if layout_image.size(1) < 1042: + # layout_image = torch.nn.functional.pad(layout_image, (0, 0, 0, 1042-layout_image.size(1))) + with self.timer("get_mfd_image"): + mfd_image=self.prepare_for_mfd_model(oimage) + with self.timer("get_det_image"): + det_images = torch.from_numpy(self.det_pre_transform(original_image)[0]) + + output= {"pdf_index":self.current_pdf_index, "page_index":self.current_page_index, "mfd_image":mfd_image, "layout_image":layout_image, "det_images":det_images, "height":height, "width":width} + if self.return_original_image: + output['oimage'] = original_image + return output + + def go_to_next_pdf(self): + worker_info = get_worker_info() + step_for_pdf= 1 if worker_info is None else worker_info.num_workers + + self.current_pdf_index += step_for_pdf + # pdf_path = self.metadata[self.current_pdf_index]['path'] + # error_count = 0 + # while (not self.check_path_exists(pdf_path) or self.get_pdf_buffer(pdf_path) is None) and error_count<10 : + # tqdm.write(f"[Error]: {pdf_path}") + # self.current_pdf_index += step_for_pdf + # pdf_path = self.metadata[self.current_pdf_index]['path'] + # error_count+=1 + # if pdf_path is None: + # raise NotImplementedError(f"Seem you use a very bad dataset that we can't find any pdf file, anymore") + self.current_page_index = 0 + if self.current_pdf_index >= len(self.metadata): + raise StopIteration + + def check_should_skip(self): + pdf_now = self.get_pdf_by_index(self.current_pdf_index) + error_count = 0 + while pdf_now is None and error_count<10: + self.go_to_next_pdf() + pdf_now = self.get_pdf_by_index(self.current_pdf_index) + 
error_count+=1 + if error_count>=10: + raise NotImplementedError(f"Seem you use a very bad dataset that we can't find any pdf file, anymore") + current_pdf_page_num = pdf_now.page_count + if self.current_page_index >= current_pdf_page_num: + self.go_to_next_pdf() + + def __next__(self): + + fail_times = 0 + output = None + while output is None and fail_times<=10: + self.check_should_skip() + try: + output = self.read_data_based_on_current_state() + self.current_page_index += 1 + fail_times = 0 + except StopIteration: + self.clean_pdf_buffer() + raise StopIteration + except: + fail_times +=1 + if fail_times>10 or output is None: + self.clean_pdf_buffer() + raise StopIteration + return output + + + +class RecImageDataset(Dataset, DatasetUtils,ImageTransformersUtils): + error_count=0 + def __init__(self, metadata_filepath, + partion_num = 1, + partion_idx = 0): + super().__init__() + self.metadata= self.smart_read_json(metadata_filepath) + self.metadata= np.array_split(self.metadata, partion_num)[partion_idx] + self.dpi = 200 + self.timer = Timers(False) + self.client = build_client() + def __len__(self): + return len(self.metadata) + + def __getitem__(self, index) : + pdf_metadata = self.metadata[index] + return self.get_cropped_image_list_via_remove_mfd_part(pdf_metadata, self.client) + + @staticmethod + def collect_location_and_dt_box_from_page_metadata(pdf_path, pdf_page_metadata): + location_keys = [] + page_id = pdf_page_metadata['page_id'] + mfd_res_list = None + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id']!=15:continue + if "sub_boxes" in bbox_metadata:# and any([b['text']!="" for b in bbox_metadata['sub_boxes']]): + ## this mean we have + continue + if mfd_res_list is None: + mfd_res_list = collect_mfdetrec_res_per_page(pdf_page_metadata['layout_dets']) # List[Dict] [{'bbox':[a,b,c,d]}, {'bbox':[a,b,c,d]}] + bbox_id = tuple(bbox_metadata['poly']) + tmp_box = np.array(bbox_metadata['poly']).reshape(-1, 2) + tmp_box = sorted_boxes(tmp_box[None])[0].astype('float32') + dt_boxes = [tmp_box] + if mfd_res_list: + dt_boxes = update_det_boxes(dt_boxes, mfd_res_list) + # logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), aft-bef)) + if len(dt_boxes) == 1 and bbox_metadata.get('text',"")!="": + #print("we can skip this one because it has no formula, and origin ocr is corr") + continue + ## this mean we do not need modify it, lets skip + for dt_box in dt_boxes: + #print(dt_box) + ### from dt_box to get bbox + sub_box_id = (dt_box[0][0],dt_box[0][1],dt_box[1][0],dt_box[1][1],dt_box[2][0],dt_box[2][1],dt_box[3][0],dt_box[3][1]) + location= (clean_pdf_path(pdf_path),page_id,bbox_id,sub_box_id) + location_keys.append(location) + return location_keys + + + def get_cropped_image_list_via_remove_mfd_part(self,pdf_metadata,client): + + images_pool = {} + pdf_path = pdf_metadata['path'] + height = pdf_metadata['height'] + width = pdf_metadata['width'] + + if pdf_path.startswith('s3'): + pdf_path = "opendata:"+pdf_path + try: + with read_pdf_from_path(pdf_path, client) as pdf: + + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + page = pdf.load_page(page_id) + ori_im = process_pdf_page_to_image(page, 200, output_width=width,output_height=height) + location_keys = self.collect_location_and_dt_box_from_page_metadata(pdf_path, pdf_page_metadata) + for location in location_keys: + _,_,_,sub_box_id = location + dt_box = np.array(sub_box_id).reshape(-1, 2) + 
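+                        # rectify and crop this text quadrilateral from the rendered page (perspective warp with a small padding)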
img_crop = get_rotate_crop_image(ori_im, dt_box, padding=10) + images_pool[location] = img_crop + + return (pdf_path,images_pool) + except KeyboardInterrupt: + raise + except: + traceback.print_exc() + raise + tqdm.write(f"[Error]: {pdf_path}") + return (pdf_path,{}) + +class DetImageDataset(Dataset, DatasetUtils,ImageTransformersUtils): + error_count=0 + def __init__(self, metadata_filepath, + det_pre_transform, + partion_num = 1, + partion_idx = 0): + super().__init__() + self.metadata= self.smart_read_json(metadata_filepath) + self.metadata= np.array_split(self.metadata, partion_num)[partion_idx] + self.dpi = 200 + self.timer = Timers(False) + self.det_pre_transform = det_pre_transform + self.client = build_client() + + def __len__(self): + return len(self.metadata) + + def extract_det_image(self, pdf_id): + client = self.client + images_pool = {} + pdf_metadata = self.metadata[pdf_id] + pdf_path = pdf_metadata['path'] + output_width =1472 #pdf_metadata['width']#1472 + output_height=1920 #pdf_metadata['height']#1920 + if pdf_path.startswith('s3'): + pdf_path = "opendata:"+pdf_path + detimages = [] + rough_layout_this_batch = [] + with read_pdf_from_path(pdf_path, client) as pdf: + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + try: + page = pdf.load_page(page_id) + except: + continue + layout_dets = [] + for res in pdf_page_metadata["layout_dets"]: + xmin, ymin = int(res['poly'][0]), int(res['poly'][1]) + xmax, ymax = int(res['poly'][4]), int(res['poly'][5]) + bbox= [xmin, ymin, xmax, ymax] + bbox= convert_boxes([bbox], pdf_metadata['width'], pdf_metadata['height'], output_width, output_height)[0] + res = res.copy() + xmin, ymin, xmax, ymax = bbox + res['poly'] = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax] + res['pdf_path']=clean_pdf_path(pdf_path) + res['page_id'] =page_id + layout_dets.append(res) + if len(layout_dets)>0: + oimage = process_pdf_page_to_image(page, 200, output_width=output_width,output_height=output_height) + original_image = oimage + det_images = torch.from_numpy(self.det_pre_transform(original_image)[0]) + rough_layout_this_batch.append(layout_dets) + detimages.append(det_images) + + return (detimages,rough_layout_this_batch) + + + def __getitem__(self, index) : + + return self.extract_det_image(index) + +class DetPageInfoImageDataset(Dataset, DatasetUtils,ImageTransformersUtils): + error_count=0 + def __init__(self, metadata_filepath, + det_pre_transform, + partion_num = 1, + partion_idx = 0, + page_num_for_name=None): + super().__init__() + if page_num_for_name is None: + filename = metadata_filepath.split("/")[-1].replace('.jsonl','.json') + page_num_for_name_path = f"opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/page_num_map/{filename}" + page_num_for_name_list = self.smart_read_json(page_num_for_name_path) + page_num_for_name={} + for pdf_path, page_num in page_num_for_name_list: + if pdf_path.startswith("s3:"): pdf_path = "opendata:"+ pdf_path + page_num_for_name[pdf_path] = page_num + tqdm.write(f"we load page_num_for_name from {page_num_for_name_path}") + metadata= self.smart_read_json(metadata_filepath) + metadata= np.array_split(metadata, partion_num)[partion_idx] + tqdm.write("we filte out good metadata") + self.metadata = [] + self.pdf_id_and_page_id_pair = [] + for row in metadata: + if row['path'].startswith("s3:"): row['path'] = "opendata:"+ row['path'] + if row['path'] not in page_num_for_name:continue + if page_num_for_name[row['path']]<=0:continue + + path = row['path'] + + 
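+            # register one (pdf_index, page_id) sample per page so each dataset item corresponds to a single page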
page_num = page_num_for_name[path] + row['page_num'] = page_num_for_name[path] + for page_id in range(page_num): + self.pdf_id_and_page_id_pair.append((len(self.metadata), page_id)) + self.metadata.append(row) + self.dpi = 200 + self.det_pre_transform = det_pre_transform + self.timer = Timers(False) + def __len__(self): + return len(self.pdf_id_and_page_id_pair) + + def get_pdf_by_pdf_id(self,pdf_id): + pdf_path = self.metadata[pdf_id]['path'] + return self.get_pdf_buffer(pdf_path) + + def extract_det_image(self, index): + current_pdf_index, current_page_index = self.pdf_id_and_page_id_pair[index] + pdf_metadata = self.metadata[current_pdf_index] + pdf_path = clean_pdf_path(pdf_metadata['path']) + output_width =1472 #pdf_metadata['width']#1472 + output_height=1920 #pdf_metadata['height']#1920 + detimages = [] + rough_layout_this_batch = [] + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + if page_id != current_page_index:continue + layout_dets = [] + for res in pdf_page_metadata["layout_dets"]: + xmin, ymin = int(res['poly'][0]), int(res['poly'][1]) + xmax, ymax = int(res['poly'][4]), int(res['poly'][5]) + bbox= [xmin, ymin, xmax, ymax] + bbox= convert_boxes([bbox], pdf_metadata['width'], pdf_metadata['height'], output_width, output_height)[0] + res = res.copy() + xmin, ymin, xmax, ymax = bbox + res['poly'] = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax] + res['pdf_path']=clean_pdf_path(pdf_path) + res['page_id'] =page_id + layout_dets.append(res) + if len(layout_dets)>0: + page = self.get_pdf_by_pdf_id(current_pdf_index).load_page(current_page_index) + oimage = process_pdf_page_to_image(page, 200, output_width=output_width,output_height=output_height) + original_image = oimage + det_images = torch.from_numpy(self.det_pre_transform(original_image)[0]) + rough_layout_this_batch.append(layout_dets) + detimages.append(det_images) + + return (detimages,rough_layout_this_batch) + + + def __getitem__(self, index) : + + return self.extract_det_image(index) + +class PageInfoDataset(Dataset,DatasetUtils,ImageTransformersUtils): + error_count=0 + #client = build_client() + def __init__(self, metadata_filepath, aug, input_format, mfd_pre_transform, det_pre_transform=None, + return_original_image=False,timer=Timers(False), + partion_num = 1, + partion_idx = 0, + page_num_for_name=None): + super().__init__() + self.build_pdf_id_and_page_id_pair(metadata_filepath,page_num_for_name,partion_num,partion_idx) + self.dpi = 200 + self.aug = aug + self.input_format = input_format + self.mfd_pre_transform = mfd_pre_transform + self.return_original_image = return_original_image + self.det_pre_transform = det_pre_transform + self.timer = timer + + def build_pdf_id_and_page_id_pair(self,metadata_filepath,page_num_for_name,partion_num,partion_idx): + if page_num_for_name is None: + filename = metadata_filepath.split("/")[-1].replace('.jsonl','.json') + page_num_for_name_path = f"opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/page_num_map/{filename}" + page_num_for_name_list = self.smart_read_json(page_num_for_name_path) + page_num_for_name={} + for pdf_path, page_num in page_num_for_name_list: + if pdf_path.startswith("s3:"): pdf_path = "opendata:"+ pdf_path + page_num_for_name[pdf_path] = page_num + tqdm.write(f"we load page_num_for_name from {page_num_for_name_path}") + metadata= self.smart_read_json(metadata_filepath) + metadata= np.array_split(metadata, partion_num)[partion_idx] + tqdm.write("we filte out good metadata") + self.metadata = [] + 
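+        # keep only pdfs with a known positive page count and flatten them into (pdf_id, page_id) pairs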
self.pdf_id_and_page_id_pair = [] + for row in metadata: + if row['path'].startswith("s3:"): row['path'] = "opendata:"+ row['path'] + if row['path'] not in page_num_for_name:continue + if page_num_for_name[row['path']]<=0:continue + + path = row['path'] + + page_num = page_num_for_name[path] + row['page_num'] = page_num_for_name[path] + for page_id in range(page_num): + self.pdf_id_and_page_id_pair.append((len(self.metadata), page_id)) + self.metadata.append(row) + + def __len__(self): + return len(self.pdf_id_and_page_id_pair) + + + def get_pdf_by_pdf_id(self,pdf_id): + pdf_path = self.metadata[pdf_id]['path'] + try: + return self.get_pdf_buffer(pdf_path) + except Exception as e: + raise(f'page={pdf_id} not in {pdf_path}:', e) + + + def retreive_resource(self,index): + current_pdf_index, current_page_index = self.pdf_id_and_page_id_pair[index] + with self.timer("load_page"): + page = self.get_pdf_by_pdf_id(current_pdf_index).load_page(current_page_index) + with self.timer("from_page_to_pimage"): + oimage = process_pdf_page_to_image(page, self.dpi) + original_image = oimage[:, :, ::-1] if self.input_format == "RGB" else oimage + height, width = original_image.shape[:2] + with self.timer("get_layout_image"): + layout_image = self.aug.get_transform(original_image).apply_image(original_image) + layout_image = torch.as_tensor(layout_image.astype("float32").transpose(2, 0, 1))[:,:1042,:800] ## it will be 1043x800 --> 1042:800 + ## lets make sure the image has correct size + # if layout_image.size(1) < 1042: + # layout_image = torch.nn.functional.pad(layout_image, (0, 0, 0, 1042-layout_image.size(1))) + with self.timer("get_mfd_image"): + mfd_image=self.prepare_for_mfd_model(oimage) + with self.timer("get_det_image"): + det_images = torch.from_numpy(self.det_pre_transform(original_image)[0]) + + output= {"pdf_index":current_pdf_index, "page_index":current_page_index, "mfd_image":mfd_image, "layout_image":layout_image, "det_images":det_images, "height":height, "width":width} + if self.return_original_image: + output['oimage'] = original_image + return output + + def __getitem__(self, index): + assert self.error_count < 10 + try: + out = self.retreive_resource(index) + self.error_count = 0 + except: + random_index = np.random.randint(0,len(self.pdf_id_and_page_id_pair)) + self.error_count +=1 + out = self[random_index] + return out + +class PageInfoWithPairDataset(PageInfoDataset): + def build_pdf_id_and_page_id_pair(self,metadata_filepath,pdf_id_and_page_id_pair,partion_num,partion_idx): + ### this time the page_num_for_name is just the pdf_id_and_page_id_pair + + metadata= self.smart_read_json(metadata_filepath) + metadata= np.array_split(metadata, partion_num)[partion_idx] + self.metadata = metadata + track_id_to_pdf_id = {metadata[i]['track_id']:i for i in range(len(metadata))} + self.pdf_id_and_page_id_pair=[] + for pdf_id, page_id in pdf_id_and_page_id_pair: + if isinstance(pdf_id, str): + pdf_id = track_id_to_pdf_id[pdf_id] + self.pdf_id_and_page_id_pair.append((pdf_id, page_id)) + + +class AddonDataset(Dataset,DatasetUtils,ImageTransformersUtils): + error_count = 0 + dpi = 200 + def __init__(self, metadata_filepath,pdfid_pageid_list, aug, input_format, mfd_pre_transform, det_pre_transform=None, + return_original_image=False,timer=Timers(False), + partion_num = 1, + partion_idx = 0): + super().__init__() + self.metadata_filepath = metadata_filepath + self.metadata= self.smart_read_json(metadata_filepath) + self.pdfid_pageid_list = pdfid_pageid_list + self.aug = aug + self.input_format = 
input_format + self.mfd_pre_transform = mfd_pre_transform + self.return_original_image = return_original_image + self.det_pre_transform = det_pre_transform + self.timer = timer + + def __len__(self): + return len(self.pdfid_pageid_list) + + def get_pdf_by_pdf_id(self,pdf_id): + pdf_path = self.metadata[pdf_id]['path'] + return self.get_pdf_buffer(pdf_path) + + def retreive_resource(self,index): + current_pdf_index, current_page_index = self.pdfid_pageid_list[index] + with self.timer("load_page"): + page = self.get_pdf_by_pdf_id(current_pdf_index).load_page(current_page_index) + with self.timer("from_page_to_pimage"): + oimage = process_pdf_page_to_image(page, self.dpi) + original_image = oimage[:, :, ::-1] if self.input_format == "RGB" else oimage + height, width = original_image.shape[:2] + with self.timer("get_layout_image"): + layout_image = self.aug.get_transform(original_image).apply_image(original_image) + layout_image = torch.as_tensor(layout_image.astype("float32").transpose(2, 0, 1))[:,:1042,:800] ## it will be 1043x800 --> 1042:800 + ## lets make sure the image has correct size + # if layout_image.size(1) < 1042: + # layout_image = torch.nn.functional.pad(layout_image, (0, 0, 0, 1042-layout_image.size(1))) + with self.timer("get_mfd_image"): + mfd_image=self.prepare_for_mfd_model(oimage) + with self.timer("get_det_image"): + det_images = torch.from_numpy(self.det_pre_transform(original_image)[0]) + + output= {"pdf_index":current_pdf_index, "page_index":current_page_index, "mfd_image":mfd_image, "layout_image":layout_image, "det_images":det_images, "height":height, "width":width} + if self.return_original_image: + output['oimage'] = original_image + return output + + def __getitem__(self, index): + assert self.error_count < 10 + try: + out = self.retreive_resource(index) + self.error_count = 0 + except: + current_pdf_index, current_page_index = self.pdfid_pageid_list[index] + pdf_path = self.metadata[current_pdf_index]['path'] + print(f"fail for pdf={pdf_path} and page={current_page_index}") + random_index = np.random.randint(0,len(self.pdfid_pageid_list)) + self.error_count +=1 + out = self[random_index] + return out + + +def get_croped_image(image_pil, bbox): + x_min, y_min, x_max, y_max = bbox + croped_img = image_pil.crop((x_min, y_min, x_max, y_max)) + return croped_img + +from transformers import ImageProcessingMixin,ProcessorMixin + +class MFRImageDataset(Dataset, DatasetUtils,ImageTransformersUtils): + error_count=0 + def __init__(self, metadata_filepath,mfr_transform, + partion_num = 1, + partion_idx = 0): + super().__init__() + self.metadata= self.smart_read_json(metadata_filepath) + self.metadata= np.array_split(self.metadata, partion_num)[partion_idx] + self.dpi = 200 + self.timer = Timers(False) + self.mfr_transform=mfr_transform + self.client = build_client() + def __len__(self): + return len(self.metadata) + + def mfr_preprocessing(self,raw_image): + if isinstance(self.mfr_transform,(ImageProcessingMixin,ProcessorMixin)): + image_tensor = self.mfr_transform(raw_image, return_tensors="pt")['pixel_values'][0] + else: + image_tensor = self.mfr_transform(raw_image) + + return image_tensor + + def extract_mfr_image(self, pdf_metadata): + client = self.client + images_pool = {} + pdf_path = pdf_metadata['path'] + height = pdf_metadata['height'] + width = pdf_metadata['width'] + if pdf_path.startswith('s3'): + pdf_path = "opendata:"+pdf_path + + with read_pdf_from_path(pdf_path, client) as pdf: + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = 
pdf_page_metadata['page_id'] + try: + page = pdf.load_page(page_id) + except: + continue + ori_im = process_pdf_page_to_image(page, 200, output_width=width,output_height=height) + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id'] not in [13, 14]:continue + if bbox_metadata.get('latex',"")!="": + #print("we can skip this one because it has latex") + continue # skip the part that has latex parsed + + [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax] = bbox_metadata['poly'] + bbox_id = tuple(bbox_metadata['poly']) + location= (clean_pdf_path(pdf_path),page_id,bbox_id) + bbox_img = get_croped_image(Image.fromarray(ori_im), [xmin, ymin, xmax, ymax]) + image_tensor = self.mfr_preprocessing(bbox_img) + images_pool[location] = image_tensor + return (pdf_path,images_pool) + # except KeyboardInterrupt: + # raise + # except: + # traceback.print_exc() + # tqdm.write(f"[Error]: {pdf_path}") + # return (pdf_path,{}) + + def __getitem__(self, index) : + pdf_metadata = self.metadata[index] + return self.extract_mfr_image(pdf_metadata) + + +import traceback +def deal_with_one_pdf(pdf_metadata,client): + + images_pool = {} + pdf_path = pdf_metadata['path'] + height = pdf_metadata['height'] + width = pdf_metadata['width'] + if pdf_path.startswith('s3'): + pdf_path = "opendata:"+pdf_path + try: + with read_pdf_from_path(pdf_path, client) as pdf: + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + page = pdf.load_page(page_id) + ori_im = process_pdf_page_to_image(page, 200, output_width=width,output_height=height) + bbox_id = 0 + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id']!=15:continue + location= (clean_pdf_path(pdf_path),page_id,bbox_id) + tmp_box = np.array(bbox_metadata['poly']).reshape(-1, 2) + tmp_box = sorted_boxes(tmp_box[None])[0].astype('float32') + img_crop = get_rotate_crop_image(ori_im, tmp_box, padding=10) + bbox_id+=1 + images_pool[location] = img_crop + + return (pdf_path,images_pool) + except KeyboardInterrupt: + raise + except: + traceback.print_exc() + tqdm.write(f"[Error]: {pdf_path}") + return (pdf_path,{}) + +import cv2 +from tqdm import tqdm +def sorted_boxes(dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + for j in range(i, -1, -1): + if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ + (_boxes[j + 1][0][0] < _boxes[j][0][0]): + tmp = _boxes[j] + _boxes[j] = _boxes[j + 1] + _boxes[j + 1] = tmp + else: + break + return _boxes + +def get_rotate_crop_image(img, points, padding=10)->np.ndarray: + """ + Extracts a rotated and cropped image patch defined by the quadrilateral `points` + with an additional padding. + + Args: + img (numpy.ndarray): The input image. + points (numpy.ndarray): A (4, 2) array containing the coordinates of the quadrilateral. + padding (int): The number of pixels to expand the bounding box on each side. + + Returns: + numpy.ndarray: The cropped and rotated image patch. 
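+    Note: `points` is modified in place (its coordinates are shifted into the cropped frame).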
+ """ + assert len(points) == 4, "shape of points must be 4*2" + + # Calculate the bounding box with padding + img_height, img_width = img.shape[0:2] + left = max(0, int(np.min(points[:, 0])) - padding) + right = min(img_width, int(np.max(points[:, 0])) + padding) + top = max(0, int(np.min(points[:, 1])) - padding) + bottom = min(img_height, int(np.max(points[:, 1])) + padding) + + # Crop the image with padding + img_crop = img[top:bottom, left:right, :].copy() + + # Adjust points to the new cropped region + points[:, 0] -= left + points[:, 1] -= top + + # Calculate the width and height of the rotated crop + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]) + ) + ) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]) + ) + ) + + # Define the destination points for perspective transformation + pts_std = np.float32( + [ + [0, 0], + [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height], + ] + ) + + # Perform the perspective transformation + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img_crop, + M, + (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC, + ) + + # Rotate the image if the height/width ratio is >= 1.5 + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + + return dst_img + + +def custom_collate_fn(batches): + + return_batch = {} + for batch in batches: + for key,val in batch.items(): + if key not in return_batch: + return_batch[key] = [] + return_batch[key].append(val) + + keys = list(return_batch.keys()) + for key in keys: + if key in ["pdf_index", "page_index","height", "width"]: + return_batch[key] = torch.tensor(return_batch[key]) + elif key in ["mfd_image", "layout_image", "det_images"]: + return_batch[key] = torch.stack(return_batch[key]) + elif key in ['oimage']: + return_batch[key] = return_batch[key] + return return_batch + +def rec_collate_fn(batches): + + location_abs = [] + images_list = [] + for images_pool in batches: + for location, image in images_pool.items(): + location_abs.append(location) + images_list.append(image) + return location_abs,images_list + +def none_collate_fn(batches): + return batches + +from typing import List, Tuple +def concat_collate_fn(batches: List[Tuple[torch.Tensor,torch.Tensor]]): + list_1 = [] + list_2 = [] + for tensor1, tensor2 in batches: + if tensor1 is None:continue + list_1.append(tensor1) + list_2.append(tensor2) + if len(list_1)==0: + return [], [] + return torch.cat(list_1), torch.cat(list_2) + +def tuple_list_collate_fn(batches: List[Tuple[List,List]]): + list_1 = [] + list_2 = [] + for tensor1, tensor2 in batches: + if len(tensor1)==0:continue + list_1.extend(tensor1) + list_2.extend(tensor2) + if len(list_1)==0: + return None, [] + return torch.stack(list_1), list_2 \ No newline at end of file diff --git a/batch_running_task/task_det/batch_deal_with_det.py b/batch_running_task/task_det/batch_deal_with_det.py new file mode 100644 index 0000000..ff65d77 --- /dev/null +++ b/batch_running_task/task_det/batch_deal_with_det.py @@ -0,0 +1,144 @@ + +import warnings +warnings.filterwarnings("ignore", category=RuntimeWarning) +from rough_det import * +import yaml +# from rough_layout_with_aync import * ## async is not safe, lets disable it +from get_data_utils import * 
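+# default S3 output root; results for this task are written under RESULT_SAVE_PATH/<task_name>/<version>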
+RESULT_SAVE_PATH="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared" +#RESULT_SAVE_PATH="tianning:s3://temp/debug" +INPUT_LOAD_PATH="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" +CURRENT_END_SIGN=".current_end.sign" +import socket +hostname= socket.gethostname() +LOCKSERVER="http://10.140.52.123:8000" if hostname.startswith('SH') else "http://paraai-n32-h-01-ccs-master-2:32453" +from datetime import datetime,timedelta + +from batch_run_utils import BatchModeConfig, process_files,dataclass,obtain_processed_filelist +from simple_parsing import ArgumentParser +from tqdm.auto import tqdm +import traceback + +@dataclass +class BatchDetConfig(BatchModeConfig): + inner_batch_size: int = 16 + batch_size: int = 16 + num_workers: int = 4 + result_save_path: str=RESULT_SAVE_PATH + check_lock: bool = True + page_num_per_batch: int = 32 +if __name__ == '__main__': + + parser = ArgumentParser() + parser.add_arguments(BatchDetConfig, dest="config") + args = parser.parse_args() + args = args.config + all_file_list = obtain_processed_filelist(args) + + if len(all_file_list)==0: + exit() + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + task_name = "layoutV1" + version = "det_patch_good" + layout_model = None + mfd_model = None + client = None + ocrmodel = None + page_num_map_whole = None #get_page_num_map_whole() + for inputs_path in tqdm(all_file_list, leave=False, position=1): + if os.path.exists(CURRENT_END_SIGN): + break + filename = os.path.basename(inputs_path) + result_save_root = os.path.join(args.result_save_path, task_name, version) + + if inputs_path.startswith('s3'): + inputs_path = "opendata:"+inputs_path + + if client is None: + client = build_client() + if not check_path_exists(inputs_path,client): + tqdm.write(f"[Skip]: no {inputs_path} ") + continue + + POSSIABLE_RESULT_SAVE_DIR_LIST=[ + result_save_root, + os.path.join("opendata:s3://llm-pdf-text/pdf_gpu_output/ebook_index_v4/scihub/v001/scihub/"), + ] + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename) + if check_path_exists(result_old_path,client) and not args.redo: + tqdm.write(f"\n [Skip]: existed {result_old_path} \n ") + skip = True + break + if skip:continue + + + + partion_num = 1 + for partion_idx in range(partion_num): + if partion_num > 1: + filename_with_partion = f"{filename.replace('.jsonl','')}.{partion_idx:02d}_{partion_num:02d}.jsonl" + else: + filename_with_partion = filename + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename_with_partion) + if not args.redo and check_path_exists(result_old_path,client): + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + result_path = os.path.join(result_save_root, filename_with_partion) + if args.check_lock: + lock_path = os.path.join(LOCKSERVER, "checklocktime", filename_with_partion) + last_start_time = check_lock_and_last_start_time(lock_path,client) + if last_start_time and not args.redo: + date_string = last_start_time + date_format = "%Y-%m-%d %H:%M:%S" + date = datetime.strptime(date_string, date_format) + deltatime = datetime.now() 
- date + if deltatime < timedelta(hours=1): + tqdm.write(f"\n [Skip]: {filename_with_partion} is locked by {date_string} created at {last_start_time} [now is {deltatime}]\n ") + continue + + create_last_start_time_lock(os.path.join(LOCKSERVER,"createlocktime", filename_with_partion),client) + + print(f"now we deal with {inputs_path} to {result_path}") + os.makedirs(os.path.dirname(result_path), exist_ok=True) + + + if ocrmodel is None: + ocrmodel = ModifiedPaddleOCR() + + + deal_with_one_dataset(inputs_path, result_path, + ocrmodel,ocrmodel.batch_det_model.prepare_image, + pdf_batch_size =args.page_num_per_batch, + image_batch_size=128, + num_workers = args.num_workers, + partion_num = partion_num, + partion_idx = partion_idx) + print(f""" +========================================= +finish dealing with {result_path} +========================================= + """) + # except: + # traceback.print_exc() + # tqdm.write(f"[Error]: {filename_with_partion} failed") + # finally: + # pass \ No newline at end of file diff --git a/batch_running_task/task_det/filte_out_left_namelist.py b/batch_running_task/task_det/filte_out_left_namelist.py new file mode 100644 index 0000000..72cc56a --- /dev/null +++ b/batch_running_task/task_det/filte_out_left_namelist.py @@ -0,0 +1,25 @@ +import os +Already_Done={} +empty_filelist = [] +with open("finished.det_patch.filelist", "r") as f: + for line in f: + line = line.strip().split() + if len(line)<4:continue + size = int(line[-2]) + filename = line[-1] + abspath = f"opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/layoutV1/result/{filename}" + if size == 0: + empty_filelist.append(filename) + continue + Already_Done[filename]=abspath + +Should_do = [] +with open('scihub_collection/sci_hub.need_det.filelist','r') as f: + for line in f: + name = line.strip().split("/")[-1] + if name in Already_Done:continue + Should_do.append(name) +print(f"write to sci_index_files.remain.filelist") +with open('scihub_collection/sci_hub.need_det.remain.filelist','w') as f: + for name in Should_do: + f.write("opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/layoutV1/result/"+name+'\n') \ No newline at end of file diff --git a/batch_running_task/task_det/rough_det.py b/batch_running_task/task_det/rough_det.py new file mode 100644 index 0000000..0189d01 --- /dev/null +++ b/batch_running_task/task_det/rough_det.py @@ -0,0 +1,144 @@ + +import os,sys,warnings +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ['CUDA_MODULE_LOADING'] = 'LAZY' +warnings.simplefilter(action='ignore', category=FutureWarning) +warnings.simplefilter(action='ignore', category=UserWarning) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from get_data_utils import * +import numpy as np +from tqdm.auto import tqdm +from torch.utils.data import Dataset, TensorDataset, DataLoader +from dataaccelerate import DataPrefetcher ,sendall2gpu, DataSimfetcher +import torch +from task_layout.rough_layout import ModifiedPaddleOCR,inference_det,det_postprocess,save_result +from scihub_pdf_dataset import DetPageInfoImageDataset, DetImageDataset,concat_collate_fn,tuple_list_collate_fn +from utils import collect_paragraph_image_and_its_coordinate +try: + client=build_client() +except: + client=None +eps=1e-7 +import math + + +from typing import List, Dict +import time +def deal_with_one_dataset(pdf_path, result_path, text_detector,det_pre_transform, + pdf_batch_size =32, + image_batch_size=256, + num_workers=8, + partion_num = 1, + partion_idx = 0): + + images_dataset = 
DetPageInfoImageDataset(pdf_path,det_pre_transform,partion_num = partion_num, partion_idx = partion_idx) + data_to_save = fast_deal_with_one_dataset(images_dataset,text_detector, pdf_batch_size =pdf_batch_size, + image_batch_size=image_batch_size,num_workers=num_workers) + save_result(data_to_save, images_dataset,result_path,add_on_mode=True) + + +class DetDataPrefetcher(DataPrefetcher): + def preload(self): + try: + self.batch = next(self.loader) + except StopIteration: + self.batch = None + return + with torch.cuda.stream(self.stream): + tensor, information = self.batch + if tensor is not None: + self.batch = (sendall2gpu(tensor,self.device), information) + else: + self.batch = None, information + +def gpu_inference(canvas_tensor_this_batch,text_detector,det_inner_batch_size,oriheight,oriwidth): + det_model = text_detector.batch_det_model.net + with torch.inference_mode(): + ### do inner_batch_size batch forward + dt_boxes_result_list = [] + for i in tqdm(range(0, len(canvas_tensor_this_batch), det_inner_batch_size), position=3, desc="inner_batch", leave=False): + data = canvas_tensor_this_batch[i:i+det_inner_batch_size] + if isinstance(data, list): data = torch.stack(data) + data=data.cuda() + dt_boxaes_batch = det_model(data) + + dt_boxaes_batch = dt_boxaes_batch['maps'].cpu()[:,0] + dt_boxes_result = det_postprocess(dt_boxaes_batch,text_detector,oriheight,oriwidth) + dt_boxes_result_list.extend(dt_boxes_result) + return dt_boxes_result_list + + +def fast_deal_with_one_dataset(images_dataset:DetPageInfoImageDataset,text_detector:ModifiedPaddleOCR, + pdf_batch_size =32, + image_batch_size=256, + num_workers=8): + + dataloader = DataLoader(images_dataset, batch_size=pdf_batch_size,collate_fn=tuple_list_collate_fn, + num_workers=num_workers,pin_memory=False, + prefetch_factor=2) + featcher = DetDataPrefetcher(dataloader,device='cuda') + batch = featcher.next() + data_loading = [] + model_train = [] + last_record_time = time.time() + data_to_save = {} + pbar = None + oriheight = 1920 # used for correct giving boxing and postprocessing + oriwidth = 1472 # used for correct giving boxing and postprocessing + while batch is not None: + #for batch in dataloader: + ########## format data ################ + detimages, rough_layout_this_batch = batch + update_seq_len = len(detimages) if detimages is not None else 0 + if detimages is not None and len(detimages)>0: + canvas_tensor_this_batch, partition_per_batch,_,_ = collect_paragraph_image_and_its_coordinate(detimages, rough_layout_this_batch,2) + location = [] + for global_page_id in range(len(partition_per_batch)-1): + start_id = partition_per_batch[global_page_id] + end_id = partition_per_batch[global_page_id+1] + pdf_path = rough_layout_this_batch[global_page_id][0]['pdf_path'] + page_id = rough_layout_this_batch[global_page_id][0]['page_id'] + for image_id in range(0, end_id-start_id): + location.append((pdf_path, page_id, image_id)) + assert len(location) == len(canvas_tensor_this_batch) + data_loading.append(time.time() - last_record_time);last_record_time =time.time() + ########## format computing ################ + dt_boxes_list = gpu_inference(canvas_tensor_this_batch,text_detector,image_batch_size,oriheight,oriwidth) + if pbar:pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + for dt_boxes, (pdf_path, page_id, line_image_id) in zip(dt_boxes_list, location): + page_id= int(page_id) + line_image_id=int(line_image_id) + for line_box in dt_boxes: + p1, p2, p3, p4 = 
line_box.tolist() + if pdf_path not in data_to_save: + data_to_save[pdf_path] = {'height':oriheight, 'width':oriwidth} + if page_id not in data_to_save[pdf_path]: + data_to_save[pdf_path][page_id] = [] + data_to_save[pdf_path][page_id].append( + { + 'category_id': 15, + 'poly': p1 + p2 + p3 + p4, + } + ) + model_train.append(time.time() - last_record_time) + + if pbar: + pbar.update(update_seq_len) + pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + last_record_time =time.time() + if pbar is None: + pbar = tqdm(total=len(images_dataset)-update_seq_len,position=2,desc="pages",leave=False, bar_format='{l_bar}{bar}{r_bar}') + batch = featcher.next() + return data_to_save + +if __name__ == "__main__": + + ocr_mode = 'batch' + batch_size = 128 + num_workers= 8 + metadata_filepath = "part-66210c190659-012745.jsonl" + text_detector = ModifiedPaddleOCR() + images_dataset = DetPageInfoImageDataset(metadata_filepath,det_pre_transform=text_detector.batch_det_model.prepare_image) + data_to_save = fast_deal_with_one_dataset(images_dataset,text_detector,pdf_batch_size=2, image_batch_size=128 ,num_workers=num_workers) + #print(data_to_save) + save_result(data_to_save, images_dataset, "test_result/result.det.jsonl") diff --git a/batch_running_task/task_det/run_det.sh b/batch_running_task/task_det/run_det.sh new file mode 100644 index 0000000..79c54e8 --- /dev/null +++ b/batch_running_task/task_det/run_det.sh @@ -0,0 +1,6 @@ +#!/bin/bash +#SBATCH -J ParseSciHUB +#SBATCH -o .log/%j-ParseSciHUB.out +#SBATCH -e .log/%j-ParseSciHUB.out +unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY +python batch_running_task/task_det/batch_deal_with_det.py --root_path $1 --index_part $2 --num_parts $3 --shuffle --num_workers 8 # --accelerated_layout --accelerated_mfd \ No newline at end of file diff --git a/batch_running_task/task_layout/batch_deal_with_addon_layout.py b/batch_running_task/task_layout/batch_deal_with_addon_layout.py new file mode 100644 index 0000000..709fc68 --- /dev/null +++ b/batch_running_task/task_layout/batch_deal_with_addon_layout.py @@ -0,0 +1,189 @@ + +from rough_layout import * +# from rough_layout_with_aync import * ## async is not safe, lets disable it +from batch_running_task.get_data_utils import * +RESULT_SAVE_PATH="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared" +#RESULT_SAVE_PATH="tianning:s3://temp/debug" +INPUT_LOAD_PATH="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" +LOCKSERVER="http://10.140.52.123:8000" +from datetime import datetime,timedelta +import socket +hostname= socket.gethostname() +if __name__ == '__main__': + import argparse, logging, os + import numpy as np + from tqdm.auto import tqdm + import traceback + parser = argparse.ArgumentParser() + parser.add_argument("--root_path", type=str) + parser.add_argument("--index_part", type=int, default=0) + parser.add_argument('--num_parts', type=int, default=1) + + parser.add_argument('--verbose', '-v', action='store_true', help='', default=False) + parser.add_argument('--redo', action='store_true', help='', default=False) + parser.add_argument('--do_not_det', action='store_true', help='', default=False) + parser.add_argument('--do_rec', action='store_true', help='', default=False) + parser.add_argument('--shuffle', action='store_true', help='', default=False) + parser.add_argument('--inner_batch_size', type=int, default=16) + parser.add_argument('--batch_size', type=int, default=16) + parser.add_argument('--num_workers', type=int, default=4) 
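+    # output location and optional TensorRT / async acceleration switches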
+ parser.add_argument('--result_save_path', type=str, default=RESULT_SAVE_PATH) + parser.add_argument('--accelerated_layout', action='store_true', help='', default=False) + parser.add_argument('--accelerated_mfd', action='store_true', help='', default=False) + parser.add_argument('--async_mode', action='store_true', help='', default=False) + + args = parser.parse_args() + + assert not args.async_mode, "async_mode is not safe, please disable it" + root_path = args.root_path + if os.path.isdir(root_path): + ###### do not let the program scan the dir ######## + ##### thus the only dir case is that use a dir path like data/archive_json/quant-ph_0004055 + raise NotImplementedError + all_file_list = [root_path] + elif os.path.isfile(root_path): + if root_path.endswith('.jsonl'): + all_file_list = [root_path] + else: + with open(root_path,'r') as f: + all_file_list = [t.strip() for t in f.readlines()] + else: + raise NotImplementedError + index_part= args.index_part + num_parts = args.num_parts + totally_paper_num = len(all_file_list) + if totally_paper_num > 1: + divided_nums = np.linspace(0, totally_paper_num - 1, num_parts+1) + divided_nums = [int(s) for s in divided_nums] + start_index = divided_nums[index_part] + end_index = divided_nums[index_part + 1] + else: + start_index = 0 + end_index = 1 + verbose = True + if args.shuffle: + np.random.shuffle(all_file_list) + + all_file_list = all_file_list[start_index: end_index] + + if len(all_file_list)==0: + print(f"Index {index_part} has no file to process") + exit() + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + task_name = "layoutV6" + patch_version= "patch" + layout_model = None + mfd_model = None + client = None + ocrmodel = None + page_num_map_whole = None #get_page_num_map_whole() + for inputs_path_tuple in tqdm(all_file_list, leave=False, position=1): + inputs_path, pdfid_and_pageid_list_str = inputs_path_tuple.strip().split() + filename = os.path.basename(inputs_path) + if "layoutV" in inputs_path: + result_save_root = os.path.join(os.path.dirname(os.path.dirname(inputs_path)),patch_version ) + inputs_path = os.path.join(INPUT_LOAD_PATH,filename) + else: + result_save_root = os.path.join(args.result_save_path, task_name, patch_version) + + if inputs_path.startswith('s3'): + inputs_path = "opendata:"+inputs_path + # assert inputs_path.startswith('opendata:s3') + # assert result_path.startswith('opendata:s3') + if client is None: + client = build_client() + if not check_path_exists(inputs_path,client): + tqdm.write(f"[Skip]: no {inputs_path} ") + continue + + POSSIABLE_RESULT_SAVE_DIR_LIST=[ + os.path.join(args.result_save_path, "layoutV9", patch_version), + os.path.join(args.result_save_path, "layoutV8", patch_version), + os.path.join(args.result_save_path, "layoutV7", patch_version), + os.path.join(args.result_save_path, "layoutV6", patch_version), + os.path.join(args.result_save_path, "layoutV5", patch_version), + os.path.join(args.result_save_path, "layoutV3", patch_version), + os.path.join(args.result_save_path, "layoutV2", patch_version), + os.path.join(args.result_save_path, "layoutV1", patch_version), + os.path.join("opendata:s3://llm-pdf-text/pdf_gpu_output/ebook_index_v4/scihub/v001/scihub/"), + ] + + skip = 
False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename) + if check_path_exists(result_old_path,client) and not args.redo: + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + + partion_num = 1 + for partion_idx in range(partion_num): + if partion_num > 1: + filename_with_partion = f"{filename.replace('.jsonl','')}.{partion_idx}_{partion_num}.jsonl" + else: + filename_with_partion = filename + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename_with_partion) + if not args.redo and check_path_exists(result_old_path,client): + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + result_path = os.path.join(result_save_root, filename_with_partion) + + print(f"now we deal with {inputs_path} to {result_path}") + os.makedirs(os.path.dirname(result_path), exist_ok=True) + if not inputs_path.startswith("opendata:"): + page_num_for_name_path = os.path.join(os.path.dirname(os.path.dirname(inputs_path)), + "page_num_map", + os.path.basename(inputs_path).replace(".jsonl",".json") + ) + with open(page_num_for_name_path,'r') as f: + page_num_for_name_list = json.load(f) + page_num_for_name={} + for pdf_path, page_num in page_num_for_name_list: + if pdf_path.startswith("s3:"): pdf_path = "opendata:"+ pdf_path + page_num_for_name[pdf_path] = page_num + page_num_map_whole = page_num_for_name + tqdm.write(f"we load page_num_for_name from {page_num_for_name_path}") + if layout_model is None:layout_model = get_layout_model(model_configs,args.accelerated_layout) + if mfd_model is None:mfd_model = get_batch_YOLO_model(model_configs,batch_size=args.inner_batch_size,use_tensorRT=args.accelerated_mfd) + if ocrmodel is None:ocrmodel = ModifiedPaddleOCR(show_log=True) + + pdfid_and_pageid_list_split = pdfid_and_pageid_list_str.split('|') + pdfid_and_pageid_list=[] + for pdfid_and_pageid in pdfid_and_pageid_list_split: + pdfid, pageid = pdfid_and_pageid.split(',') + pdfid_and_pageid_list.append([int(pdfid), int(pageid)]) + deal_with_page_addon_dataset(inputs_path, pdfid_and_pageid_list, result_path, + layout_model, mfd_model, ocrmodel=ocrmodel, + inner_batch_size=args.inner_batch_size, + batch_size=args.batch_size, + num_workers=args.num_workers, + do_text_det = not args.do_not_det, + do_text_rec = args.do_rec, + partion_num = partion_num, + partion_idx = partion_idx + ) + print(f""" +========================================= +finish dealing with {result_path} +========================================= + """) + \ No newline at end of file diff --git a/batch_running_task/task_layout/batch_deal_with_layout.py b/batch_running_task/task_layout/batch_deal_with_layout.py new file mode 100644 index 0000000..07b25d6 --- /dev/null +++ b/batch_running_task/task_layout/batch_deal_with_layout.py @@ -0,0 +1,178 @@ + +from rough_layout import * +from get_data_utils import * +RESULT_SAVE_PATH="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared" +INPUT_LOAD_PATH="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" +# from rough_layout_with_aync import * ## async is not safe, lets disable it +#RESULT_SAVE_PATH="tianning:s3://temp/debug" +CURRENT_END_SIGN=".current_end.sign" +from datetime import datetime,timedelta +import socket +hostname= socket.gethostname() +LOCKSERVER="http://10.140.52.123:8000" if hostname.startswith('SH') else "http://paraai-n32-h-01-ccs-master-2:32453" +from 
batch_run_utils import BatchModeConfig, process_files,dataclass,obtain_processed_filelist +from simple_parsing import ArgumentParser + +@dataclass +class BatchLayoutConfig(BatchModeConfig): + do_not_det: bool = False + do_rec: bool = False + inner_batch_size: int = 16 + batch_size: int = 16 + num_workers: int = 4 + accelerated_layout: bool = False + accelerated_mfd: bool = False + async_mode: bool = False + result_save_path: str=RESULT_SAVE_PATH + use_lock: bool = True + debug:bool = False + def from_dict(kargs): + return BatchLayoutConfig(**kargs) + def to_dict(self): + return self.__dict__ + +if __name__ == '__main__': + + from tqdm.auto import tqdm + import traceback + parser = ArgumentParser() + parser.add_arguments(BatchLayoutConfig, dest="config") + args = parser.parse_args() + args = args.config + #args.check_lock = hostname.startswith('SH') + assert not args.async_mode, "async_mode is not safe, please disable it" + all_file_list = obtain_processed_filelist(args) + + if len(all_file_list)==0: + exit() + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + task_name = "layoutV6" + layout_model = None + mfd_model = None + client = None + ocrmodel = None + page_num_map_whole = None #get_page_num_map_whole() + for inputs_path in tqdm(all_file_list, leave=False, position=1): + if os.path.exists(CURRENT_END_SIGN): + break + filename = os.path.basename(inputs_path) + if "layoutV" in inputs_path: + result_save_root = os.path.dirname(inputs_path) + inputs_path = os.path.join(INPUT_LOAD_PATH,filename) + else: + result_save_root = os.path.join(args.result_save_path, task_name, "result") + + if inputs_path.startswith('s3'): + inputs_path = "opendata:"+inputs_path + # assert inputs_path.startswith('opendata:s3') + # assert result_path.startswith('opendata:s3') + if client is None: + client = build_client() + if not check_path_exists(inputs_path,client): + tqdm.write(f"[Skip]: no {inputs_path} ") + continue + + POSSIABLE_RESULT_SAVE_DIR_LIST=[ + os.path.join(args.result_save_path, "layoutV9", "result"), + os.path.join(args.result_save_path, "layoutV8", "result"), + os.path.join(args.result_save_path, "layoutV7", "result"), + os.path.join(args.result_save_path, "layoutV6", "result"), + os.path.join(args.result_save_path, "layoutV5", "result"), + os.path.join(args.result_save_path, "layoutV3", "result"), + os.path.join(args.result_save_path, "layoutV2", "result"), + os.path.join(args.result_save_path, "layoutV1", "result"), + os.path.join("opendata:s3://llm-pdf-text/pdf_gpu_output/ebook_index_v4/scihub/v001/scihub/"), + ] + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename) + if check_path_exists(result_old_path,client) and not args.redo: + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + + partion_num = 1 + for partion_idx in range(partion_num): + if partion_num > 1: + filename_with_partion = f"{filename.replace('.jsonl','')}.{partion_idx}_{partion_num}.jsonl" + else: + filename_with_partion = filename + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename_with_partion) + if not args.redo 
and check_path_exists(result_old_path,client): + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + result_path = os.path.join(result_save_root, filename_with_partion) + + lock_path = os.path.join(LOCKSERVER, "checklocktime", filename_with_partion) + last_start_time = check_lock_and_last_start_time(lock_path,client) + if last_start_time and not args.redo: + date_string = last_start_time + date_format = "%Y-%m-%d %H:%M:%S" + date = datetime.strptime(date_string, date_format) + deltatime = datetime.now() - date + if deltatime < timedelta(hours=1): + tqdm.write(f"[Skip]: {filename_with_partion} is locked by {date_string} created at {last_start_time} [now is {deltatime}]") + continue + + create_last_start_time_lock(os.path.join(LOCKSERVER,"createlocktime", filename_with_partion),client) + + print(f"now we deal with {inputs_path} to {result_path}") + os.makedirs(os.path.dirname(result_path), exist_ok=True) + if not inputs_path.startswith("opendata:"): + page_num_for_name_path = os.path.join(os.path.dirname(os.path.dirname(inputs_path)), + "page_num_map", + os.path.basename(inputs_path).replace(".jsonl",".json") + ) + with open(page_num_for_name_path,'r') as f: + page_num_for_name_list = json.load(f) + page_num_for_name={} + for pdf_path, page_num in page_num_for_name_list: + if pdf_path.startswith("s3:"): pdf_path = "opendata:"+ pdf_path + page_num_for_name[pdf_path] = page_num + page_num_map_whole = page_num_for_name + tqdm.write(f"we load page_num_for_name from {page_num_for_name_path}") + if layout_model is None:layout_model = get_layout_model(model_configs,args.accelerated_layout) + if mfd_model is None:mfd_model = get_batch_YOLO_model(model_configs,batch_size=args.inner_batch_size,use_tensorRT=args.accelerated_mfd) + if ocrmodel is None:ocrmodel = ModifiedPaddleOCR(show_log=True) + + try: + deal_with_page_info_dataset(inputs_path, result_path, + layout_model, mfd_model, ocrmodel=ocrmodel, + inner_batch_size=args.inner_batch_size, + batch_size=args.batch_size, + num_workers=args.num_workers, + do_text_det = not args.do_not_det, + do_text_rec = args.do_rec, + partion_num = partion_num, + partion_idx = partion_idx,page_num_for_name=page_num_map_whole + ) + print(f""" +========================================= +finish dealing with {result_path} +========================================= + """) + except: + traceback.print_exc() + tqdm.write(f"[Error]: {filename_with_partion} failed") + finally: + pass \ No newline at end of file diff --git a/batch_running_task/task_layout/batch_deal_with_layout_fixmissing_page.py b/batch_running_task/task_layout/batch_deal_with_layout_fixmissing_page.py new file mode 100644 index 0000000..e6ce5c0 --- /dev/null +++ b/batch_running_task/task_layout/batch_deal_with_layout_fixmissing_page.py @@ -0,0 +1,135 @@ + +from batch_deal_with_layout import * + +if __name__ == '__main__': + + from tqdm.auto import tqdm + import traceback + parser = ArgumentParser() + parser.add_arguments(BatchLayoutConfig, dest="config") + args = parser.parse_args() + args = args.config + #args.check_lock = hostname.startswith('SH') + assert not args.async_mode, "async_mode is not safe, please disable it" + all_file_list = obtain_processed_filelist(args) + + if len(all_file_list)==0: + exit() + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = 
model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + + version = "fix_missing_page_version2" + layout_model = None + mfd_model = None + client = None + ocrmodel = None + page_num_map_whole = None #get_page_num_map_whole() + for inputs_line in tqdm(all_file_list, leave=False, position=1): + + splited_line = inputs_line.split() + inputs_path = splited_line[0] + json_str = " ".join(splited_line[1:]) + page_num_for_name = json.loads(json_str) + + if os.path.exists(CURRENT_END_SIGN): + break + filename = os.path.basename(inputs_path) + #assert "layoutV" in inputs_path + result_save_root = os.path.join(os.path.dirname(os.path.dirname(inputs_path)),version) + #inputs_path = os.path.join(INPUT_LOAD_PATH,filename) + + if inputs_path.startswith('s3'): + inputs_path = "opendata:"+inputs_path + # assert inputs_path.startswith('opendata:s3') + # assert result_path.startswith('opendata:s3') + if client is None: + client = build_client() + + if not check_path_exists(inputs_path,client): + tqdm.write(f"[Skip]: no {inputs_path} ") + continue + + POSSIABLE_RESULT_SAVE_DIR_LIST=[ + result_save_root, + os.path.join("opendata:s3://llm-pdf-text/pdf_gpu_output/ebook_index_v4/scihub/v001/scihub/"), + ] + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename) + if check_path_exists(result_old_path,client) and not args.redo: + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + + partion_num = 1 + for partion_idx in range(partion_num): + if partion_num > 1: + filename_with_partion = f"{filename.replace('.jsonl','')}.{partion_idx}_{partion_num}.jsonl" + else: + filename_with_partion = filename + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename_with_partion) + if not args.redo and check_path_exists(result_old_path,client): + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + result_path = os.path.join(result_save_root, filename_with_partion) + if args.use_lock: + lock_path = os.path.join(LOCKSERVER, "checklocktime", filename_with_partion) + last_start_time = check_lock_and_last_start_time(lock_path,client) + if last_start_time and not args.redo: + date_string = last_start_time + date_format = "%Y-%m-%d %H:%M:%S" + date = datetime.strptime(date_string, date_format) + deltatime = datetime.now() - date + if deltatime < timedelta(hours=0.1): + tqdm.write(f"[Skip]: {filename_with_partion} is locked by {date_string} created at {last_start_time} [now is {deltatime}]") + continue + + create_last_start_time_lock(os.path.join(LOCKSERVER,"createlocktime", filename_with_partion),client) + + print(f"now we deal with {inputs_path} to {result_path}") + os.makedirs(os.path.dirname(result_path), exist_ok=True) + if args.debug:raise + if layout_model is None:layout_model = get_layout_model(model_configs,args.accelerated_layout) + if mfd_model is None:mfd_model = get_batch_YOLO_model(model_configs,batch_size=args.inner_batch_size,use_tensorRT=args.accelerated_mfd) + if ocrmodel is None:ocrmodel = ModifiedPaddleOCR(show_log=True) + + try: + deal_with_page_info_dataset_for_missing_page(inputs_path, result_path, + layout_model, mfd_model, ocrmodel=ocrmodel, + inner_batch_size=args.inner_batch_size, + batch_size=args.batch_size, + num_workers=args.num_workers, + do_text_det = not args.do_not_det, + do_text_rec = 
args.do_rec, + partion_num = partion_num, + partion_idx = partion_idx,page_num_for_name=page_num_for_name + ) + print(f""" +========================================= +finish dealing with {result_path} +========================================= + """) + except: + traceback.print_exc() + tqdm.write(f"[Error]: {filename_with_partion} failed") + finally: + pass \ No newline at end of file diff --git a/batch_running_task/task_layout/batch_text_detector.py b/batch_running_task/task_layout/batch_text_detector.py new file mode 100644 index 0000000..7bb17ba --- /dev/null +++ b/batch_running_task/task_layout/batch_text_detector.py @@ -0,0 +1,699 @@ +import os +import sys +import copy +import cv2 +import numpy as np +import time +import json +import torch +from shapely.geometry import Polygon +import pyclipper +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from pytorchocr.base_ocr_v20 import BaseOCRV20 +from pytorchocr.utils.utility import get_image_file_list, check_and_read_gif +from pytorchocr.data import create_operators, transform +from pytorchocr.postprocess import build_post_process +from pytorchocr import pytorchocr_utility as utility +from dataclasses import dataclass + + +class TextDetector(BaseOCRV20): + def __init__(self, args, **kwargs): + self.args = args + self.det_algorithm = args.det_algorithm + pre_process_list = [{ + 'DetResizeForTest': { + 'limit_side_len': args.det_limit_side_len, + 'limit_type': args.det_limit_type, + } + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image', 'shape'] + } + }] + postprocess_params = {} + if self.det_algorithm == "DB": + postprocess_params['name'] = 'DBPostProcess' + postprocess_params["thresh"] = args.det_db_thresh + postprocess_params["box_thresh"] = args.det_db_box_thresh + postprocess_params["max_candidates"] = 1000 + postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio + postprocess_params["use_dilation"] = args.use_dilation + postprocess_params["score_mode"] = args.det_db_score_mode + elif self.det_algorithm == "DB++": + postprocess_params['name'] = 'DBPostProcess' + postprocess_params["thresh"] = args.det_db_thresh + postprocess_params["box_thresh"] = args.det_db_box_thresh + postprocess_params["max_candidates"] = 1000 + postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio + postprocess_params["use_dilation"] = args.use_dilation + postprocess_params["score_mode"] = args.det_db_score_mode + pre_process_list[1] = { + 'NormalizeImage': { + 'std': [1.0, 1.0, 1.0], + 'mean': + [0.48109378172549, 0.45752457890196, 0.40787054090196], + 'scale': '1./255.', + 'order': 'hwc' + } + } + elif self.det_algorithm == "EAST": + postprocess_params['name'] = 'EASTPostProcess' + postprocess_params["score_thresh"] = args.det_east_score_thresh + postprocess_params["cover_thresh"] = args.det_east_cover_thresh + postprocess_params["nms_thresh"] = args.det_east_nms_thresh + elif self.det_algorithm == "SAST": + pre_process_list[0] = { + 'DetResizeForTest': { + 'resize_long': args.det_limit_side_len + } + } + postprocess_params['name'] = 'SASTPostProcess' + postprocess_params["score_thresh"] = args.det_sast_score_thresh + postprocess_params["nms_thresh"] = args.det_sast_nms_thresh + self.det_sast_polygon = args.det_sast_polygon + if self.det_sast_polygon: + postprocess_params["sample_pts_num"] = 6 + postprocess_params["expand_scale"] = 1.2 + 
postprocess_params["shrink_ratio_of_width"] = 0.2 + else: + postprocess_params["sample_pts_num"] = 2 + postprocess_params["expand_scale"] = 1.0 + postprocess_params["shrink_ratio_of_width"] = 0.3 + elif self.det_algorithm == "PSE": + postprocess_params['name'] = 'PSEPostProcess' + postprocess_params["thresh"] = args.det_pse_thresh + postprocess_params["box_thresh"] = args.det_pse_box_thresh + postprocess_params["min_area"] = args.det_pse_min_area + postprocess_params["box_type"] = args.det_pse_box_type + postprocess_params["scale"] = args.det_pse_scale + self.det_pse_box_type = args.det_pse_box_type + elif self.det_algorithm == "FCE": + pre_process_list[0] = { + 'DetResizeForTest': { + 'rescale_img': [1080, 736] + } + } + postprocess_params['name'] = 'FCEPostProcess' + postprocess_params["scales"] = args.scales + postprocess_params["alpha"] = args.alpha + postprocess_params["beta"] = args.beta + postprocess_params["fourier_degree"] = args.fourier_degree + postprocess_params["box_type"] = args.det_fce_box_type + else: + print("unknown det_algorithm:{}".format(self.det_algorithm)) + sys.exit(0) + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + + use_gpu = args.use_gpu + self.use_gpu = torch.cuda.is_available() and use_gpu + + self.weights_path = args.det_model_path + self.yaml_path = args.det_yaml_path + network_config = utility.AnalysisConfig(self.weights_path, self.yaml_path) + super(TextDetector, self).__init__(network_config, **kwargs) + self.load_pytorch_weights(self.weights_path) + self.net.eval() + if self.use_gpu: + self.net.cuda() + + def order_points_clockwise(self, pts): + """ + reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py + # sort the points based on their x-coordinates + """ + xSorted = pts[np.argsort(pts[:, 0]), :] + + # grab the left-most and right-most points from the sorted + # x-roodinate points + leftMost = xSorted[:2, :] + rightMost = xSorted[2:, :] + + # now, sort the left-most coordinates according to their + # y-coordinates so we can grab the top-left and bottom-left + # points, respectively + leftMost = leftMost[np.argsort(leftMost[:, 1]), :] + (tl, bl) = leftMost + + rightMost = rightMost[np.argsort(rightMost[:, 1]), :] + (tr, br) = rightMost + + rect = np.array([tl, tr, br, bl], dtype="float32") + return rect + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def clip_det_res_batch(self, points, img_height, img_width): + # Clip the points to the image borders + points[:, :, 0] = np.clip(points[:, :, 0], 0, img_width - 1) + points[:, :, 1] = np.clip(points[:, :, 1], 0, img_height - 1) + return points + + def filter_tag_det_res(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.order_points_clockwise(box) + box = self.clip_det_res(box, img_height, img_width) + rect_width = int(np.linalg.norm(box[0] - box[1])) + rect_height = int(np.linalg.norm(box[0] - box[3])) + if rect_width <= 3 or rect_height <= 3: + continue + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def order_points_clockwise_batch(self, pts_batch): + """ + Orders a batch of points in a clockwise manner. 
+ + Args: + pts_batch (numpy.ndarray): Array of shape (N, 4, 2) containing N sets of four points. + + Returns: + numpy.ndarray: Array of shape (N, 4, 2) with points ordered as top-left, top-right, + bottom-right, bottom-left. + """ + # Sort points in each set by x-coordinates + xSorted = np.sort(pts_batch, axis=1, order=['x']) + + # Separate left-most and right-most points + leftMost = xSorted[:, :2, :] + rightMost = xSorted[:, 2:, :] + + # Sort left-most points by y-coordinates + leftMost = leftMost[np.argsort(leftMost[:, :, 1], axis=1)] + tl = leftMost[:, 0, :] + bl = leftMost[:, 1, :] + + # Sort right-most points by y-coordinates + rightMost = rightMost[np.argsort(rightMost[:, :, 1], axis=1)] + tr = rightMost[:, 0, :] + br = rightMost[:, 1, :] + + # Combine the points into the ordered rectangle + rect = np.stack((tl, tr, br, bl), axis=1) + return rect + + def filter_tag_det_res_new(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + + # Order points clockwise and clip them + ordered_boxes = self.order_points_clockwise_batch(dt_boxes) + clipped_boxes = self.clip_det_res_batch(ordered_boxes,img_height, img_width) + + # Calculate widths and heights + widths = np.linalg.norm(clipped_boxes[:, 0] - clipped_boxes[:, 1], axis=1).astype(int) + heights = np.linalg.norm(clipped_boxes[:, 0] - clipped_boxes[:, 3], axis=1).astype(int) + + # Filter out boxes with width or height <= 3 + valid_indices = (widths > 3) & (heights > 3) + dt_boxes_new = clipped_boxes[valid_indices] + + return dt_boxes_new + + def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.clip_det_res(box, img_height, img_width) + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def prepare_image(self, img): + data = {'image': img} + data = transform(data, self.preprocess_op) + img, shape_list = data + return data + def preprocess(self,img): + + img, shape_list = self.prepare_image(img) + if img is None: + return None, 0 + img = np.expand_dims(img, axis=0) + shape_list = np.expand_dims(shape_list, axis=0) + inp = torch.from_numpy(img) + if self.use_gpu: + inp = inp.cuda() + return inp, shape_list + + def postprocess(self, outputs,shape_list,ori_shape): + preds = {} + if self.det_algorithm == "EAST": + preds['f_geo'] = outputs['f_geo'].cpu().numpy() + preds['f_score'] = outputs['f_score'].cpu().numpy() + elif self.det_algorithm == 'SAST': + preds['f_border'] = outputs['f_border'].cpu().numpy() + preds['f_score'] = outputs['f_score'].cpu().numpy() + preds['f_tco'] = outputs['f_tco'].cpu().numpy() + preds['f_tvo'] = outputs['f_tvo'].cpu().numpy() + elif self.det_algorithm in ['DB', 'PSE', 'DB++']: + preds['maps'] = outputs['maps'].cpu().numpy() + elif self.det_algorithm == 'FCE': + for i, (k, output) in enumerate(outputs.items()): + preds['level_{}'.format(i)] = output + else: + raise NotImplementedError + + post_result = self.postprocess_op(preds, shape_list) + dt_boxes = post_result[0]['points'] + if (self.det_algorithm == "SAST" and + self.det_sast_polygon) or (self.det_algorithm in ["PSE", "FCE"] and + self.postprocess_op.box_type == 'poly'): + dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_shape) + else: + dt_boxes = self.filter_tag_det_res(dt_boxes, ori_shape) + return dt_boxes + + def __call__(self, img): + ori_shape = img.shape + inp,shape_list = self.preprocess(img) + starttime = time.time() + with torch.no_grad(): + outputs = self.net(inp) + dt_boxes = 
self.postprocess(outputs,shape_list,ori_shape) + elapse = time.time() - starttime + return dt_boxes, elapse + + +fast_config = {'use_gpu': True, + 'gpu_mem': 500, + 'warmup': False, + 'image_dir': './doc/imgs/1.jpg', + 'det_algorithm': 'DB', + 'det_model_path': 'models/pytorch_paddle_weight/en_ptocr_v3_det_infer.pth', + 'det_limit_side_len': 960, + 'det_limit_type': 'max', + 'det_db_thresh': 0.3, + 'det_db_box_thresh': 0.6, + 'det_db_unclip_ratio': 1.5, + 'max_batch_size': 10, + 'use_dilation': False, + 'det_db_score_mode': 'fast', + 'det_east_score_thresh': 0.8, + 'det_east_cover_thresh': 0.1, + 'det_east_nms_thresh': 0.2, + 'det_sast_score_thresh': 0.5, + 'det_sast_nms_thresh': 0.2, + 'det_sast_polygon': False, + 'det_pse_thresh': 0, + 'det_pse_box_thresh': 0.85, + 'det_pse_min_area': 16, + 'det_pse_box_type': 'box', + 'det_pse_scale': 1, + 'scales': [8, 16, 32], + 'alpha': 1.0, + 'beta': 1.0, + 'fourier_degree': 5, + 'det_fce_box_type': 'poly', + 'rec_algorithm': 'CRNN', + 'rec_model_path': 'weights/en_ptocr_v4_rec_infer.pth', + 'rec_image_inverse': True, + 'rec_image_shape': '3, 32, 320', + 'rec_char_type': 'ch', + 'rec_batch_num': 6, + 'max_text_length': 25, + 'use_space_char': True, + 'drop_score': 0.5, + 'limited_max_width': 1280, + 'limited_min_width': 16, + 'vis_font_path': '/mnt/data/zhangtianning/projects/doc/fonts/simfang.ttf', + 'rec_char_dict_path': './pytorchocr/utils/en_dict.txt', + 'use_angle_cls': False, + 'cls_model_path': None, + 'cls_image_shape': '3, 48, 192', + 'label_list': ['0', '180'], + 'cls_batch_num': 6, + 'cls_thresh': 0.9, + 'enable_mkldnn': False, + 'use_pdserving': False, + 'e2e_algorithm': 'PGNet', + 'e2e_model_path': None, + 'e2e_limit_side_len': 768, + 'e2e_limit_type': 'max', + 'e2e_pgnet_score_thresh': 0.5, + 'e2e_char_dict_path': '/mnt/data/zhangtianning/projects/pytorchocr/utils/ic15_dict.txt', + 'e2e_pgnet_valid_set': 'totaltext', + 'e2e_pgnet_polygon': True, + 'e2e_pgnet_mode': 'fast', + 'sr_model_path': None, + 'sr_image_shape': '3, 32, 128', + 'sr_batch_num': 1, + 'det_yaml_path': 'configs/det/det_ppocr_v3.yml', + 'rec_yaml_path': './configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml', + 'cls_yaml_path': None, + 'e2e_yaml_path': None, + 'sr_yaml_path': None, + 'use_mp': False, + 'total_process_num': 1, + 'process_id': 0, + 'benchmark': False, + 'save_log_path': './log_output/', + 'show_log': True} +from argparse import Namespace +@dataclass +class PostProcessConfig: + thresh:float + unclip_ratio:float + max_candidates:int + min_size:int + box_thresh:float + + +class BatchTextDetector(TextDetector): + def __init__(self, **kwargs): + args = Namespace(**fast_config) + super().__init__(args, **kwargs) + + def batch_forward(self, _input_image_batch,shape_list_batch,ori_shape_list): + with torch.no_grad(): + dt_boxaes_batch = self.net(_input_image_batch) + pred_batch = self.discard_batch(dt_boxaes_batch) + dt_boxes_list=self.batch_postprocess(pred_batch, shape_list_batch,ori_shape_list) + return dt_boxes_list + def batch_process(self, img_batch, ori_shape_list): + _input_image_batch = [] + shape_list_batch = [] + for img in img_batch: + _input_image, shape_list = self.preprocess(img) + _input_image_batch.append(_input_image) + shape_list_batch.append(shape_list) + _input_image_batch = torch.cat(_input_image_batch) + shape_list_batch = np.stack(shape_list_batch) + return self.batch_forward(self, _input_image_batch,shape_list_batch,ori_shape_list) + + def discard_batch(self, outputs): + preds = {} + if self.det_algorithm == "EAST": + raise NotImplementedError + 
preds['f_geo'] = outputs['f_geo'].cpu().numpy() + preds['f_score'] = outputs['f_score'].cpu().numpy() + elif self.det_algorithm == 'SAST': + raise NotImplementedError + preds['f_border'] = outputs['f_border'].cpu().numpy() + preds['f_score'] = outputs['f_score'].cpu().numpy() + preds['f_tco'] = outputs['f_tco'].cpu().numpy() + preds['f_tvo'] = outputs['f_tvo'].cpu().numpy() + elif self.det_algorithm in ['DB', 'PSE', 'DB++']: + preds = [{'maps':outputs['maps'][j:j+1]} for j in range(len(outputs['maps']))] + elif self.det_algorithm == 'FCE': + for i, (k, output) in enumerate(outputs.items()): + preds['level_{}'.format(i)] = output + else: + raise NotImplementedError + return preds + + def fast_postprocess(self,preds, shape_list ): + #return fast_torch_postprocess(self.postprocess_op,preds, shape_list) + config = PostProcessConfig(thresh=self.postprocess_op.thresh, + unclip_ratio=self.postprocess_op.unclip_ratio, + max_candidates=self.postprocess_op.max_candidates, + min_size=self.postprocess_op.min_size, + box_thresh=self.postprocess_op.box_thresh) + + #raise NotImplementedError(f"{config}") + if isinstance(preds, dict): + preds = preds['maps'][:, 0, :, :] + + #return fast_torch_postprocess(preds, shape_list,config) + if len(shape_list) == 1: + return fast_torch_postprocess(preds, shape_list,config) + else: + return fast_torch_postprocess_multiprocess(preds, shape_list,config) + + + def batch_postprocess(self, preds_list, shape_list_list,ori_shape_list): + dt_boxes_list=[] + for preds, shape_list,ori_shape in zip(preds_list, shape_list_list,ori_shape_list): + post_result = self.fast_postprocess(preds, shape_list) + dt_boxes = post_result[0]['points'] + if (self.det_algorithm == "SAST" and self.det_sast_polygon) or (self.det_algorithm in ["PSE", "FCE"] and self.postprocess_op.box_type == 'poly'): + raise NotImplementedError + dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_shape) + else: + dt_boxes = self.filter_tag_det_res(dt_boxes, ori_shape) + dt_boxes_list.append(dt_boxes) + return dt_boxes_list + +def fast_torch_postprocess(pred_batch, shape_list,config): + """ + Accelerate below + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + if isinstance(segmentation, torch.Tensor): + segmentation = segmentation.cpu().numpy() + + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate(np.array(segmentation[batch_index]).astype(np.uint8),self.dilation_kernel) + else: + mask = segmentation[batch_index] + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,src_w, src_h) + + boxes_batch.append({'points': boxes}) + return boxes_batch + """ + + if isinstance(pred_batch, torch.Tensor):pred_batch= pred_batch.cpu().numpy() + segmentation_batch = pred_batch > config.thresh + if isinstance(segmentation_batch, torch.Tensor):segmentation_batch = segmentation_batch.cpu().numpy() + + boxes_batch = [] + for batch_index in range(pred_batch.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + boxes = boxes_from_bitmap(pred_batch[batch_index], segmentation_batch[batch_index],src_w, src_h,config) + #boxes = boxes_from_bitmap_without_score(self,segmentation_batch[batch_index],src_w, src_h) + boxes_batch.append({'points': boxes}) + return boxes_batch + +def get_contours_multiprocess(segmentation_mask): + """Process a single segmentation batch and find contours.""" + outs= 
cv2.findContours((segmentation_mask * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + return contours +from concurrent.futures import ThreadPoolExecutor +def fast_torch_postprocess_multiprocess(pred_batch, shape_list, config): + """ + Accelerate below + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + if isinstance(segmentation, torch.Tensor): + segmentation = segmentation.cpu().numpy() + + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate(np.array(segmentation[batch_index]).astype(np.uint8),self.dilation_kernel) + else: + mask = segmentation[batch_index] + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,src_w, src_h) + + boxes_batch.append({'points': boxes}) + return boxes_batch + """ + if isinstance(pred_batch, torch.Tensor):pred_batch= pred_batch.cpu().numpy() + segmentation_batch = pred_batch > config.thresh + if isinstance(segmentation_batch, torch.Tensor):segmentation_batch = segmentation_batch.cpu().numpy() + + + num_threads = min(8, len(segmentation_batch)) + + with ThreadPoolExecutor(max_workers=num_threads) as executor: + contours_batch = list(executor.map(get_contours_multiprocess, segmentation_batch)) + + boxes_batch = [] + for batch_index in range(pred_batch.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + boxes = boxes_from_contours(pred_batch[batch_index],contours_batch[batch_index],src_w, src_h, config) + boxes_batch.append({'points': boxes}) + + # def boxes_from_bitmap_wrapper(args): + # pred_now,segmentation_now,src_w, src_h, config = args + # boxes = boxes_from_bitmap(pred_now,segmentation_now,src_w, src_h, config) + # return {'points': boxes} + + # with ThreadPoolExecutor(max_workers=num_threads) as executor: + # src_h_list=[src_h for src_h, src_w, ratio_h, ratio_w in shape_list] + # src_w_list=[src_w for src_h, src_w, ratio_h, ratio_w in shape_list] + # configlist=[config]*len(shape_list) + # boxes_batch = list(executor.map(boxes_from_bitmap_wrapper, zip(pred_batch,segmentation_batch,src_w_list,src_h_list,configlist))) + return boxes_batch + +def deal_with_on_contours(contour, score_table, height, width, dest_height, dest_width, config): + points, sside = get_mini_boxes(contour) + if sside < config.min_size:return + points =np.array(points) + score = box_score_fast(score_table, points.reshape(-1, 2)) + if config.box_thresh > score:return + box = unclip(points,config.unclip_ratio).reshape(-1, 1, 2) + box, sside = get_mini_boxes(box) + if sside < config.min_size + 2:return + box = np.array(box) + box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) + return box, score + +def boxes_from_contours(pred, contours, dest_width, dest_height,config): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + height, width = pred.shape + num_contours = min(len(contours), config.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + result = deal_with_on_contours(contour, pred, height, width, dest_height, dest_width, config) + if result is None:continue + box, score = result + boxes.append(box) + 
scores.append(score) + return np.array(boxes), scores + +def boxes_from_bitmap(pred, _bitmap, dest_width, dest_height,config:PostProcessConfig): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + return boxes_from_contours(pred, contours, dest_width, dest_height,config) + +def boxes_from_bitmap_without_score(self, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + for index in range(num_contours): + contour = contours[index] + points, sside = get_mini_boxes(contour) + if sside < self.min_size:continue + points =np.array(points) + box = unclip(points).reshape(-1, 1, 2) + box, sside = get_mini_boxes(box) + if sside < self.min_size + 2:continue + box = np.array(box) + box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype(np.int16)) + return np.array(boxes, dtype=np.int16) + +def obtain_score_mask(_box, h, w): + #h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return xmin, xmax, ymin, ymax, mask + +def box_score_fast(bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + xmin, xmax, ymin, ymax, mask = obtain_score_mask(_box,h, w) + crop = bitmap[ymin:ymax + 1, xmin:xmax + 1] + mask = torch.BoolTensor(mask) + return crop[mask].mean().item() + +def get_mini_boxes(contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + +def unclip(box, unclip_ratio): + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + +def deal_with_on_contours_without_score(contour, height, width, dest_height, dest_width, config): + points, sside = get_mini_boxes(contour) + if sside < config.min_size:return + 
points =np.array(points) + box = unclip(points,config.unclip_ratio).reshape(-1, 1, 2) + box, sside = get_mini_boxes(box) + if sside < config.min_size + 2:return + box = np.array(box) + box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) + return box \ No newline at end of file diff --git a/batch_running_task/task_layout/get_batch_layout_model.py b/batch_running_task/task_layout/get_batch_layout_model.py new file mode 100644 index 0000000..e31a30d --- /dev/null +++ b/batch_running_task/task_layout/get_batch_layout_model.py @@ -0,0 +1,189 @@ +from detectron2.utils.logger import setup_logger +setup_logger() +import sys,os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) +from modules.layoutlmv3.model_init import * +from utils import Timers +def inference( + self, + batched_inputs: List[Dict[str, torch.Tensor]], + detected_instances: Optional[List[Instances]] = None, + do_postprocess: bool = True, + timers = None + ): + """ + Run inference on the given inputs. + + Args: + batched_inputs (list[dict]): same as in :meth:`forward` + detected_instances (None or list[Instances]): if not None, it + contains an `Instances` object per image. The `Instances` + object contains "pred_boxes" and "pred_classes" which are + known boxes in the image. + The inference will then skip the detection of bounding boxes, + and only predict other per-ROI outputs. + do_postprocess (bool): whether to apply post-processing on the outputs. + + Returns: + When do_postprocess=True, same as in :meth:`forward`. + Otherwise, a list[Instances] containing raw network outputs. + """ + assert not self.training + + with timers('inference/preprocess_image'): + images = self.preprocess_image(batched_inputs) + # features = self.backbone(images.tensor) + with timers('inference/get_batch'): + input = self.get_batch(batched_inputs, images) + with timers('inference/get_features'): + features = self.backbone(input) + with timers('inference/merge_proposals'): + if detected_instances is None: + if self.proposal_generator is not None: + with timers('merge_proposals/compute_proposals'): + proposals, _ = self.proposal_generator(images, features, None) + else: + assert "proposals" in batched_inputs[0] + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + with timers('inference/merge_proposals/roi_heads'): + results, _ = self.roi_heads(images, features, proposals, None) + else: + detected_instances = [x.to(self.device) for x in detected_instances] + results = self.roi_heads.forward_with_given_boxes(features, detected_instances) + with timers('inference/postprocess'): + if do_postprocess: + assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." 
+ results = GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) + return results + + + +class Layoutlmv3_BatchPredictor(Layoutlmv3_Predictor): + + timers = Timers(False) + + def batch_predict(self, image_and_height_and_width,timers): + with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 + images,heights, widths = image_and_height_and_width + inputs =[ {"image": image, "height": height, "width": width} for image, height, width in zip(images,heights, widths)] + #inputs = {"image": images, "height": heights, "width": widths} + predictions = inference(self.predictor.model,inputs,timers=timers) + return predictions + + def compile(self): + self.predictor.model.backbone = torch.compile(self.predictor.model.backbone) + # self.predictor.model.proposal_generator = torch.compile(self.predictor.model.proposal_generator) + # self.predictor.model.roi_heads = torch.compile(self.predictor.model.roi_heads) + + def __call__(self, image, ignore_catids=[], dtype=torch.float32): + with self.timers('inference'): + with torch.cuda.amp.autocast(dtype=dtype): + outputslist = self.batch_predict(image, self.timers) + use_old_bbox_collection = False + if use_old_bbox_collection: + with self.timers('wholepost'): + page_layout_result_list = [] + for outputs in outputslist: + page_layout_result = { + "layout_dets": [] + } + + # Convert tensor data to numpy arrays + with self.timers('wholepost/to_numpy'): + boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.numpy() + labels = outputs["instances"].to("cpu")._fields["pred_classes"].numpy() + scores = outputs["instances"].to("cpu")._fields["scores"].numpy() + + with self.timers('wholepost/compute_mask'): + # Create a mask for filtering out the ignored categories + mask = np.isin(labels, ignore_catids, invert=True) + + with self.timers('wholepost/slicing'): + # Apply the mask to filter out the ignored categories + filtered_boxes = boxes[mask] + filtered_labels = labels[mask] + filtered_scores = scores[mask] + + with self.timers('wholepost/stack'): + # Collect the layout details + polys = np.column_stack([ + filtered_boxes[:, 0], filtered_boxes[:, 1], + filtered_boxes[:, 2], filtered_boxes[:, 1], + filtered_boxes[:, 2], filtered_boxes[:, 3], + filtered_boxes[:, 0], filtered_boxes[:, 3] + ]) + + with self.timers('wholepost/restack_layout'): + # Populate the layout_dets + for i in range(len(filtered_labels)): + page_layout_result["layout_dets"].append({ + "category_id": filtered_labels[i], + "poly": polys[i].tolist(), + "score": filtered_scores[i] + }) + + page_layout_result_list.append(page_layout_result) + else: + with self.timers('wholepost'): + page_layout_result_list = [] + for outputs in outputslist: + page_layout_result = { + "layout_dets": [] + } + instances = outputs["instances"] + # Convert tensor data to numpy arrays + with self.timers('wholepost/to_numpy1'): + boxes = instances._fields["pred_boxes"].tensor + labels = instances._fields["pred_classes"] + scores = instances._fields["scores"] + + with self.timers('wholepost/compute_mask'): + # Create a mask for filtering out the ignored categories + ignore_catids_tensor = torch.tensor(ignore_catids, device=labels.device) + mask = ~torch.isin(labels, ignore_catids_tensor) + + with self.timers('wholepost/slicing'): + # Apply the mask to filter out the ignored categories + filtered_boxes = boxes[mask] + filtered_labels = labels[mask] + filtered_scores = scores[mask] + + with self.timers('wholepost/to_numpy2'): + filtered_boxes = filtered_boxes.cpu().numpy() + 
filtered_labels = filtered_labels.cpu().numpy() + filtered_scores = filtered_scores.cpu().numpy() + + with self.timers('wholepost/stack'): + # Collect the layout details + polys = np.column_stack([ + filtered_boxes[:, 0], filtered_boxes[:, 1], + filtered_boxes[:, 2], filtered_boxes[:, 1], + filtered_boxes[:, 2], filtered_boxes[:, 3], + filtered_boxes[:, 0], filtered_boxes[:, 3] + ]) + + with self.timers('wholepost/restack_layout'): + # Populate the layout_dets + for i in range(len(filtered_labels)): + page_layout_result["layout_dets"].append({ + "category_id": filtered_labels[i], + "poly": polys[i].tolist(), + "score": filtered_scores[i] + }) + + page_layout_result_list.append(page_layout_result) + #self.timers.log() + return page_layout_result_list + + + +def get_layout_model(model_configs, accelerated): + model = Layoutlmv3_BatchPredictor(model_configs['model_args']['layout_weight']) + compiled = accelerated + if compiled: + model.compile() + model.iscompiled = True + else: + #model.compile() + model.iscompiled = False + return model \ No newline at end of file diff --git a/batch_running_task/task_layout/get_batch_yolo.py b/batch_running_task/task_layout/get_batch_yolo.py new file mode 100644 index 0000000..4a179d8 --- /dev/null +++ b/batch_running_task/task_layout/get_batch_yolo.py @@ -0,0 +1,88 @@ +from ultralytics.engine.results import Results +from ultralytics.utils import ops +from ultralytics.utils import ARGV +from ultralytics.data.augment import LetterBox +from ultralytics.utils.checks import check_imgsz +def build_mfd_predictor( + self, + stream: bool = False, + predictor=None, + **kwargs, + ): + + + is_cli = (ARGV[0].endswith("yolo") or ARGV[0].endswith("ultralytics")) and any( + x in ARGV for x in ("predict", "track", "mode=predict", "mode=track") + ) + + custom = {"conf": 0.25, "batch": 1, "save": is_cli, "mode": "predict"} # method defaults + args = {**self.overrides, **custom, **kwargs} # highest priority args on the right + prompts = args.pop("prompts", None) # for SAM-type models + + if not self.predictor: + self.predictor = predictor or self._smart_load("predictor")(overrides=args, _callbacks=self.callbacks) + self.predictor.setup_model(model=self.model, verbose=is_cli) +class mfd_process: + def __init__(self, imgsz, stride, pt): + self.imgsz = imgsz + + self.stride = stride + self.pt = pt + def __call__(self, im): + """ + Pre-transform input image before inference. + + Args: + im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list. + + Returns: + (list): A list of transformed images. 
+ """ + imgsz = check_imgsz(self.imgsz, stride=self.stride, min_dim=2) + same_shapes = len({x.shape for x in im}) == 1 + letterbox = LetterBox(imgsz, auto=same_shapes and self.pt, stride=self.stride) + return [letterbox(image=x) for x in im] + +def fastpostprocess(self, preds, img, orig_imgs): + """Post-processes predictions and returns a list of Results objects.""" + + preds = ops.non_max_suppression( + preds, + self.args.conf, + self.args.iou, + agnostic=self.args.agnostic_nms, + max_det=self.args.max_det, + classes=self.args.classes, + ) + # if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list + # orig_imgs = ops.convert_torch2numpy_batch(orig_imgs) ## <-- this step only convert the channel order back and to cpu and to uni8, no need this + results = [] + for i, pred in enumerate(preds): + orig_img = orig_imgs[i] + #pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape) # <-- lets do it outside since now we will feed normlized batch + img_path = self.batch[0][i] + results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred)) + return results +# import ultralytics +# ultralytics.models.yolo.detect.DetectionPredictor.postprocess = fastpostprocess +from ultralytics import YOLO +import os +def get_batch_YOLO_model(model_configs, batch_size,use_tensorRT=True)->YOLO: + weight_path = model_configs['model_args']['mfd_weight'] + + engine_weight= model_configs['model_args']['mfd_weight'][:-3]+f'.b{batch_size}.engine' + if os.path.exists(engine_weight) and use_tensorRT: + mfd_model = YOLO(engine_weight,task='detect') + else: + mfd_model = YOLO(weight_path) + #mfd_model = YOLO(engine_weight,task='detect') + #mfd_model = YOLO(weight_path) + img_size = model_configs['model_args']['img_size'] + img_size = (1888,1472) # <---- please fix use this, in normal YOLO assign it is automatively correct, but when using .engine file, it is not correct + + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + build_mfd_predictor(mfd_model , imgsz=img_size, conf=conf_thres, iou=iou_thres, verbose=False) + return mfd_model + + diff --git a/batch_running_task/task_layout/no_paddle_ocr.py b/batch_running_task/task_layout/no_paddle_ocr.py new file mode 100644 index 0000000..7766ee0 --- /dev/null +++ b/batch_running_task/task_layout/no_paddle_ocr.py @@ -0,0 +1,5 @@ +from .batch_text_detector import BatchTextDetector +class ModifiedPaddleOCR: + def __init__(self, **kwargs): + self.batch_det_model = BatchTextDetector() + \ No newline at end of file diff --git a/batch_running_task/task_layout/rough_layout.py b/batch_running_task/task_layout/rough_layout.py new file mode 100644 index 0000000..c9fbfaf --- /dev/null +++ b/batch_running_task/task_layout/rough_layout.py @@ -0,0 +1,577 @@ + + +import os, warnings +os.environ['CUDA_MODULE_LOADING'] = 'LAZY' +warnings.simplefilter(action='ignore', category=FutureWarning) +warnings.simplefilter(action='ignore', category=UserWarning) +### redirect to the parent folder of this file +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from utils import * +from get_data_utils import * +from scihub_pdf_dataset import PDFImageDataset, PageInfoDataset, PageInfoWithPairDataset, custom_collate_fn,DataLoader,AddonDataset,clean_pdf_path +from task_layout.get_batch_yolo import mfd_process, get_batch_YOLO_model +from task_layout.get_batch_layout_model import get_layout_model +from task_layout.no_paddle_ocr import 
ModifiedPaddleOCR +import numpy as np +import torch +from tqdm.auto import tqdm +import yaml +from dataaccelerate import DataPrefetcher +from ultralytics.utils import ops +import copy +import traceback + +def clean_layout_dets(layout_dets): + rows = [] + for t in layout_dets: + rows.append({ + "category_id":int(t['category_id']), + "poly":[int(t) for t in t['poly']], + "score":float(t['score']) + }) + + return rows + +def inference_layout(layout_pair,layout_model,inner_batch_size): + + layout_images, heights, widths = layout_pair + origin_length = len(layout_images) + + if len(layout_images)0 else None) + + data_to_save = {} + inner_batch_size = inner_batch_size + #pbar = tqdm(total=len(dataset.metadata),position=2,desc="PDF Pages",leave=False) + pbar = None + pdf_passed = set() + featcher = DataPrefetcher(dataloader,device='cuda') + batch = featcher.next() + data_loading = [] + model_train = [] + last_record_time = time.time() + while batch is not None: + #for batch in dataloader: + try: + data_loading.append(time.time() - last_record_time);last_record_time =time.time() + if pbar:pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + pdf_index_batch, page_ids_batch = batch["pdf_index"], batch["page_index"] + mfd_layout_images_batch, layout_images_batch, det_layout_images_batch = batch["mfd_image"], batch["layout_image"], batch["det_images"] + heights_batch, widths_batch = batch["height"], batch["width"] + oimage_list = batch.get('oimage',None) + pdf_index = set([t.item() for t in pdf_index_batch]) + new_pdf_processed = pdf_index - pdf_passed + pdf_passed = pdf_passed|pdf_index + + iterater = tqdm(range(0, len(mfd_layout_images_batch), inner_batch_size),position=3,leave=False,desc="mini-Batch") if len(mfd_layout_images_batch)>inner_batch_size else range(0, len(mfd_layout_images_batch), inner_batch_size) + + for j in iterater: + pdf_index = pdf_index_batch[j:j+inner_batch_size] + page_ids = page_ids_batch[j:j+inner_batch_size] + mfd_images = mfd_layout_images_batch[j:j+inner_batch_size] + layout_images = layout_images_batch[j:j+inner_batch_size] + heights = heights_batch[j:j+inner_batch_size] + widths = widths_batch[j:j+inner_batch_size] + oimages = oimage_list[j:j+inner_batch_size] if oimage_list is not None else None + detimages = det_layout_images_batch[j:j+inner_batch_size] + pdf_paths = [dataset.metadata[pdf_index]['path'] for pdf_index in pdf_index] + with timer('get_layout'): + layout_res = inference_layout((layout_images,heights, widths),layout_model,inner_batch_size) + with timer('get_mfd'): + mfd_res = inference_mfd(mfd_images,mfd_model,inner_batch_size) + with timer('combine_layout_mfd_result'): + rough_layout_this_batch, ori_shape_list = combine_layout_mfd_result(layout_res, mfd_res, heights, widths) + pdf_and_page_id_this_batch=[] + for pdf_path, page_id, layout_dets,ori_shape in zip(pdf_paths, page_ids, rough_layout_this_batch,ori_shape_list): + page_id = int(page_id) + if pdf_path not in data_to_save: + data_to_save[pdf_path] = {'height':ori_shape[0], 'width':ori_shape[1]} + data_to_save[pdf_path][page_id] = layout_dets + pdf_and_page_id_this_batch.append((pdf_path, page_id)) + + + if ocrmodel is not None: + if not do_text_det:continue + with timer('text_detection/collect_for_line_detect'): + det_height, det_width = detimages.shape[2:] + scale_height = int(heights[0])/int(det_height) + scale_width = int(widths[0])/int(det_width) + assert scale_height == scale_width + assert scale_height == 2 + canvas_tensor_this_batch, 
partition_per_batch,_,_ = collect_paragraph_image_and_its_coordinate(detimages, rough_layout_this_batch,scale_height) # 2 is the scale between detiamge and box_images + if len(canvas_tensor_this_batch)==0: + tqdm.write("WARNING: no text line to detect") + continue + with timer('text_detection/stack'): + canvas_tensor_this_batch = torch.stack(canvas_tensor_this_batch) + with timer('text_detection/det_net'): + dt_boxaes_batch = inference_det(canvas_tensor_this_batch,ocrmodel.batch_det_model.net,128) + with timer('text_detection/det_postprocess'): + dt_boxes_list = det_postprocess(dt_boxaes_batch,ocrmodel) + + + if do_text_rec: + raise NotImplementedError("do_text_rec is not implemented") + # with timer('text_detection/collect_for_text_images'): + # text_image_batch, text_image_position,text_line_bbox = collect_text_image_and_its_coordinate(single_page_mfdetrec_res_this_batch, partition_per_batch, oimages,dt_boxes_list) + # with timer('text_detection/get_line_text_rec'): + # rec_res, elapse = ocrmodel.text_recognizer(text_image_batch) + # for line_box, rec_result,(partition_id,text_block_id, text_line_id) in zip(text_line_bbox, rec_res,text_image_position): + # text, score = rec_result + # pdf_id, page_id = pdf_and_page_id_this_batch[partition_id] + # pdf_path = dataset.metadata[pdf_id]['path'] + # p1, p2, p3, p4 = line_box.tolist() + # #print(line_box) + # data_to_save[pdf_path][page_id].append( + # { + # 'category_id': 15, + # 'poly': p1 + p2 + p3 + p4, + # 'score': round(score, 2), + # 'text': text, + # } + + # ) + else: + for partition_id in range(len(partition_per_batch)-1): + pdf_path, page_id = pdf_and_page_id_this_batch[partition_id] + partition_start = partition_per_batch[partition_id] + partition_end = partition_per_batch[partition_id+1] + dt_boxes_this_partition = dt_boxes_list[partition_start:partition_end] + + for dt_boxes in dt_boxes_this_partition: #(1, 4, 2) + for line_box in dt_boxes: + p1, p2, p3, p4 = line_box.tolist() + data_to_save[pdf_path][page_id].append( + { + 'category_id': 15, + 'poly': p1 + p2 + p3 + p4, + } + ) + + + except KeyboardInterrupt: + raise + except: + traceback.print_exc() + print("ERROR: Fail to process batch") + update_seq = len(new_pdf_processed) + if pbar:pbar.update(update_seq) + timer.log() + model_train.append(time.time() - last_record_time);last_record_time =time.time() + if pbar:pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + batch = featcher.next() + if pbar is None: + pbar = tqdm(total=len(dataset.metadata)-update_seq,position=2,desc="PDF Pages",leave=False, bar_format='{l_bar}{bar}{r_bar}') + + ### next, we construct each result for each pdf in pdf wise and remove the page_id by the list position + save_result(data_to_save,dataset,result_path) + + +def deal_with_page_info_dataset(pdf_path, result_path, layout_model, mfd_model, + ocrmodel=None, inner_batch_size=4, + batch_size=32,num_workers=8, + do_text_det=False, + do_text_rec=False, + timer=Timers(False), + partion_num = 1, + partion_idx = 0,page_num_for_name=None): + dataset = PageInfoDataset(pdf_path,layout_model.predictor.aug,layout_model.predictor.input_format, + mfd_pre_transform=mfd_process(mfd_model.predictor.args.imgsz,mfd_model.predictor.model.stride,mfd_model.predictor.model.pt), + det_pre_transform=ocrmodel.batch_det_model.prepare_image, + return_original_image=do_text_rec, + partion_num = partion_num, + partion_idx = partion_idx,page_num_for_name=page_num_for_name + ) + data_to_save = 
fast_dealwith_one_dataset(dataset,layout_model, mfd_model, ocrmodel, + inner_batch_size=inner_batch_size, + batch_size=batch_size, + num_workers=num_workers, + do_text_det=do_text_det, + do_text_rec=do_text_rec, + timer= timer) + save_result(data_to_save,dataset,result_path) + +def deal_with_page_info_dataset_for_missing_page(pdf_path, result_path, layout_model, mfd_model, + ocrmodel=None, inner_batch_size=4, + batch_size=32,num_workers=8, + do_text_det=False, + do_text_rec=False, + timer=Timers(False), + partion_num = 1, + partion_idx = 0,page_num_for_name=None): + dataset = PageInfoWithPairDataset(pdf_path,layout_model.predictor.aug,layout_model.predictor.input_format, + mfd_pre_transform=mfd_process(mfd_model.predictor.args.imgsz,mfd_model.predictor.model.stride,mfd_model.predictor.model.pt), + det_pre_transform=ocrmodel.batch_det_model.prepare_image, + return_original_image=do_text_rec, + partion_num = partion_num, + partion_idx = partion_idx,page_num_for_name=page_num_for_name + ) + data_to_save = fast_dealwith_one_dataset(dataset,layout_model, mfd_model, ocrmodel, + inner_batch_size=inner_batch_size, + batch_size=batch_size, + num_workers=num_workers, + do_text_det=do_text_det, + do_text_rec=do_text_rec, + timer= timer) + save_result(data_to_save,dataset,result_path,add_on_mode=True) + +def deal_with_page_addon_dataset(metadata_path, pdfid_and_pageid_list, result_path, layout_model, mfd_model, + ocrmodel=None, inner_batch_size=4, + batch_size=32,num_workers=8, + do_text_det=False, + do_text_rec=False, + timer=Timers(False), + partion_num = 1, + partion_idx = 0): + dataset = AddonDataset(metadata_path, pdfid_and_pageid_list,layout_model.predictor.aug,layout_model.predictor.input_format, + mfd_pre_transform=mfd_process(mfd_model.predictor.args.imgsz,mfd_model.predictor.model.stride,mfd_model.predictor.model.pt), + det_pre_transform=ocrmodel.batch_det_model.prepare_image, + return_original_image=do_text_rec, + partion_num = partion_num, + partion_idx = partion_idx + ) + data_to_save = fast_dealwith_one_dataset(dataset,layout_model, mfd_model, ocrmodel, + inner_batch_size=inner_batch_size, + batch_size=batch_size, + num_workers=num_workers, + do_text_det=do_text_det, + do_text_rec=do_text_rec, + timer= timer) + save_result(data_to_save,dataset,result_path) + + + +def fast_dealwith_one_dataset(dataset,layout_model, mfd_model, ocrmodel, + inner_batch_size=4, + batch_size=32,num_workers=8, + do_text_det=False, + do_text_rec=False, + timer=Timers(False)) : + print(f"current dataset size={len(dataset)} images") + collate_fn = custom_collate_fn if do_text_rec else None + num_workers=min(num_workers,len(dataset.metadata)) + dataloader = DataLoader(dataset, batch_size=batch_size,collate_fn=collate_fn, + num_workers=num_workers,pin_memory=True, pin_memory_device='cuda', + prefetch_factor=2 if num_workers>0 else None) + + data_to_save = {} + inner_batch_size = inner_batch_size + #pbar = tqdm(total=len(dataset.metadata),position=2,desc="PDF Pages",leave=False) + pbar = None + pdf_passed = set() + featcher = DataPrefetcher(dataloader,device='cuda') + batch = featcher.next() + data_loading = [] + model_train = [] + last_record_time = time.time() + while batch is not None: + data_loading.append(time.time() - last_record_time);last_record_time =time.time() + if pbar:pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + pdf_index_batch, page_ids_batch = batch["pdf_index"], batch["page_index"] + mfd_layout_images_batch, layout_images_batch, 
det_layout_images_batch = batch["mfd_image"], batch["layout_image"], batch["det_images"] + heights_batch, widths_batch = batch["height"], batch["width"] + oimage_list = batch.get('oimage',None) + pdf_index = set([t.item() for t in pdf_index_batch]) + + + iterater = tqdm(range(0, len(mfd_layout_images_batch), inner_batch_size),position=3,leave=False,desc="mini-Batch") if len(mfd_layout_images_batch)>inner_batch_size else range(0, len(mfd_layout_images_batch), inner_batch_size) + + for j in iterater: + pdf_index = pdf_index_batch[j:j+inner_batch_size] + page_ids = page_ids_batch[j:j+inner_batch_size] + mfd_images = mfd_layout_images_batch[j:j+inner_batch_size] + layout_images = layout_images_batch[j:j+inner_batch_size] + heights = heights_batch[j:j+inner_batch_size] + widths = widths_batch[j:j+inner_batch_size] + oimages = oimage_list[j:j+inner_batch_size] if oimage_list is not None else None + detimages = det_layout_images_batch[j:j+inner_batch_size] + pdf_paths = [dataset.metadata[pdf_index]['path'] for pdf_index in pdf_index] + with timer('get_layout'): + layout_res = inference_layout((layout_images,heights, widths),layout_model,inner_batch_size) + with timer('get_mfd'): + mfd_res = inference_mfd(mfd_images,mfd_model,inner_batch_size) + with timer('combine_layout_mfd_result'): + rough_layout_this_batch, ori_shape_list = combine_layout_mfd_result(layout_res, mfd_res, heights, widths) + pdf_and_page_id_this_batch=[] + for pdf_path, page_id, layout_dets,ori_shape in zip(pdf_paths, page_ids, rough_layout_this_batch,ori_shape_list): + page_id = int(page_id) + if pdf_path not in data_to_save: + data_to_save[pdf_path] = {'height':ori_shape[0], 'width':ori_shape[1]} + data_to_save[pdf_path][page_id] = layout_dets + pdf_and_page_id_this_batch.append((pdf_path, page_id)) + + + if ocrmodel is not None: + if not do_text_det:continue + with timer('text_detection/collect_for_line_detect'): + det_height, det_width = detimages.shape[2:] + scale_height = int(heights[0])/int(det_height) + scale_width = int(widths[0])/int(det_width) + assert scale_height == scale_width + assert scale_height == 2 + canvas_tensor_this_batch, partition_per_batch,_,_ = collect_paragraph_image_and_its_coordinate(detimages, rough_layout_this_batch,scale_height) # 2 is the scale between detiamge and box_images + if len(canvas_tensor_this_batch)==0: + tqdm.write("WARNING: no text line to detect") + continue + with timer('text_detection/stack'): + canvas_tensor_this_batch = torch.stack(canvas_tensor_this_batch) + with timer('text_detection/det_net'): + dt_boxaes_batch = inference_det(canvas_tensor_this_batch,ocrmodel.batch_det_model.net,128) + with timer('text_detection/det_postprocess'): + dt_boxes_list = det_postprocess(dt_boxaes_batch,ocrmodel) + + ############### we better split the line box via mfd box #################### + + + if do_text_rec: + with timer('text_detection/collect_for_text_images'): + text_image_batch, text_image_position,text_line_bbox = collect_text_image_and_its_coordinate(single_page_mfdetrec_res_this_batch, partition_per_batch, oimages,dt_boxes_list) + with timer('text_detection/get_line_text_rec'): + rec_res, elapse = ocrmodel.text_recognizer(text_image_batch) + for line_box, rec_result,(partition_id,text_block_id, text_line_id) in zip(text_line_bbox, rec_res,text_image_position): + text, score = rec_result + pdf_id, page_id = pdf_and_page_id_this_batch[partition_id] + pdf_path = dataset.metadata[pdf_id]['path'] + p1, p2, p3, p4 = line_box.tolist() + #print(line_box) + 
data_to_save[pdf_path][page_id].append( + { + 'category_id': 15, + 'poly': p1 + p2 + p3 + p4, + 'score': round(score, 2), + 'text': text, + } + + ) + else: + for partition_id in range(len(partition_per_batch)-1): + pdf_path, page_id = pdf_and_page_id_this_batch[partition_id] + partition_start = partition_per_batch[partition_id] + partition_end = partition_per_batch[partition_id+1] + dt_boxes_this_partition = dt_boxes_list[partition_start:partition_end] + + for dt_boxes in dt_boxes_this_partition: #(1, 4, 2) + for line_box in dt_boxes: + p1, p2, p3, p4 = line_box.tolist() + data_to_save[pdf_path][page_id].append( + { + 'category_id': 15, + 'poly': p1 + p2 + p3 + p4, + } + ) + + + # except KeyboardInterrupt: + # raise + # except: + # traceback.print_exc() + # print("ERROR: Fail to process batch") + update_seq = len(page_ids_batch) + if pbar:pbar.update(update_seq) + timer.log() + model_train.append(time.time() - last_record_time);last_record_time =time.time() + if pbar:pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + batch = featcher.next() + if pbar is None: + pbar = tqdm(total=len(dataset),position=2,desc="PDF Pages",leave=False, bar_format='{l_bar}{bar}{r_bar}') + return data_to_save + ### next, we construct each result for each pdf in pdf wise and remove the page_id by the list position + + +def save_result(data_to_save,dataset,result_path,add_on_mode=False): + pdf_to_metadata = {clean_pdf_path(t['path']):t for t in dataset.metadata} + + new_data_to_save = [] + for pdf_path, layout_dets_per_page in data_to_save.items(): + pdf_path = clean_pdf_path(pdf_path) + new_pdf_dict = copy.deepcopy(pdf_to_metadata[pdf_path]) + new_pdf_dict['height'] = layout_dets_per_page.pop('height') + new_pdf_dict['width'] = layout_dets_per_page.pop('width') + pages = [t for t in layout_dets_per_page.keys()] + pages.sort() + #print(pages) + + new_pdf_dict["doc_layout_result"]=[] + for page_id in range(max(pages)+1): ### those , before, we may lost whole the last page for layoutV1-5 result + if add_on_mode and page_id not in layout_dets_per_page:continue + if page_id not in layout_dets_per_page: + print(f"WARNING: page {page_id} of PDF (availabel keys is {layout_dets_per_page.keys()}): {pdf_path} fail to parser!!! 
") + now_row = {"page_id": page_id, "status": "fail", "layout_dets":[]} + else: + now_row = {"page_id": page_id, "layout_dets":layout_dets_per_page[page_id]} + new_pdf_dict["doc_layout_result"].append(now_row) + new_data_to_save.append(new_pdf_dict) + if "s3:" in new_data_to_save and dataset.client is None:dataset.client=build_client() + if result_path.startswith("s3:"):result_path = "opendata:"+result_path + write_jsonl_to_path(new_data_to_save, result_path, dataset.client) + +def test_dataset(pdf_path, layout_model, mfd_model, ocrmodel): + timer = Timers(True) + dataset = PDFImageDataset(pdf_path,layout_model.predictor.aug,layout_model.predictor.input_format, + + mfd_pre_transform=mfd_process(mfd_model.predictor.args.imgsz, + mfd_model.predictor.model.stride, + mfd_model.predictor.model.pt), + det_pre_transform=ocrmodel.batch_det_model.prepare_image, + return_original_image=True, timer=timer, + ) + print("======================================================") + for _ in dataset: + timer.log() + +if __name__ == "__main__": + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + accelerated = False + layout_model = get_layout_model(model_configs,accelerated) + + total_memory = get_gpu_memory() + if total_memory > 60: + inner_batch_size = 16 + elif total_memory > 30: + inner_batch_size = 8 + else: + inner_batch_size = 2 + print(f"totally gpu memory is {total_memory} we use inner batch size {inner_batch_size}") + mfd_model = get_batch_YOLO_model(model_configs,inner_batch_size,use_tensorRT=True) + ocrmodel = None + ocrmodel = ocr_model = ModifiedPaddleOCR(show_log=True) + timer = Timers(False,warmup=5) + #test_dataset("debug.jsonl", layout_model, mfd_model, ocrmodel) + #page_num_map_whole = get_page_num_map_whole() + page_num_map_whole = None + #filename = "part-66210c190659-012745.jsonl" + # deal_with_page_info_dataset(filename, + # f"{filename}.stage_1.jsonl", + # layout_model, mfd_model, ocrmodel=ocrmodel, + # inner_batch_size=inner_batch_size, batch_size=inner_batch_size,num_workers=8, + # do_text_det = True, + # do_text_rec = False, + # timer=timer,page_num_for_name=page_num_map_whole) + + with open('test.pairlist','r') as f: + for line in f: + splited_line = line.split() + pdf_path = splited_line[0] + json_str = " ".join(splited_line[1:]) + page_num_for_name = json.loads(json_str) + break + do_text_det = True + do_text_rec = False + batch_size = 16 + num_workers= 4 + + dataset = PageInfoWithPairDataset(pdf_path,layout_model.predictor.aug,layout_model.predictor.input_format, + mfd_pre_transform=mfd_process(mfd_model.predictor.args.imgsz,mfd_model.predictor.model.stride,mfd_model.predictor.model.pt), + det_pre_transform=ocrmodel.batch_det_model.prepare_image, + return_original_image=do_text_rec, + partion_num = 1, + partion_idx = 0,page_num_for_name=page_num_for_name + ) + data_to_save = fast_dealwith_one_dataset(dataset,layout_model, mfd_model, ocrmodel, + inner_batch_size=inner_batch_size, + batch_size=batch_size, + num_workers=num_workers, + do_text_det=do_text_det, + do_text_rec=do_text_rec, + timer= timer) + save_result(data_to_save,dataset,"test_result/result.addon.jsonl") + + + \ No newline at end of file diff --git 
a/batch_running_task/task_layout/rough_layout_with_aync.py b/batch_running_task/task_layout/rough_layout_with_aync.py new file mode 100644 index 0000000..f3d9d43 --- /dev/null +++ b/batch_running_task/task_layout/rough_layout_with_aync.py @@ -0,0 +1,192 @@ +from rough_layout import * +import asyncio + + + +async def deal_with_one_dataset_async(pdf_path, result_path, layout_model, mfd_model, + ocrmodel=None, inner_batch_size=4, + batch_size=32,num_workers=8, + do_text_det=False, + do_text_rec=False, + timer=Timers(False), + partion_num = 1, + partion_idx = 0): + dataset = PDFImageDataset(pdf_path,layout_model.predictor.aug,layout_model.predictor.input_format, + + mfd_pre_transform=mfd_process(mfd_model.predictor.args.imgsz,mfd_model.predictor.model.stride,mfd_model.predictor.model.pt), + det_pre_transform=ocrmodel.batch_det_model.prepare_image, + return_original_image=do_text_rec, + partion_num = partion_num, + partion_idx = partion_idx + ) + collate_fn = custom_collate_fn if do_text_rec else None + num_workers=min(num_workers,len(dataset.metadata)) + dataloader = DataLoader(dataset, batch_size=batch_size,collate_fn=collate_fn, + num_workers=num_workers,pin_memory=True, pin_memory_device='cuda', + prefetch_factor=3 if num_workers>0 else None) + queue = asyncio.Queue() + data_to_save = {} + postprocess_task = asyncio.create_task(cpu_postprocess(queue, ocrmodel,data_to_save)) + + featcher = DataPrefetcher(dataloader,device='cuda') + + inner_batch_size = inner_batch_size + pbar = None#tqdm(total=len(dataset.metadata),position=2,desc="PDF Pages",leave=True) + pdf_passed = set() + batch = featcher.next() + data_loading = [] + model_train = [] + last_record_time = time.time() + while batch is not None: + + data_loading.append(time.time() - last_record_time);last_record_time =time.time() + if pbar:pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + pdf_index_batch, page_ids_batch = batch["pdf_index"], batch["page_index"] + mfd_layout_images_batch, layout_images_batch, det_layout_images_batch = batch["mfd_image"], batch["layout_image"], batch["det_images"] + heights_batch, widths_batch = batch["height"], batch["width"] + oimage_list = batch.get('oimage',None) + pdf_index = set([t.item() for t in pdf_index_batch]) + new_pdf_processed = pdf_index - pdf_passed + pdf_passed = pdf_passed|pdf_index + iterater = tqdm(range(0, len(mfd_layout_images_batch), inner_batch_size),position=3,leave=False,desc="mini-Batch") if len(mfd_layout_images_batch)>inner_batch_size else range(0, len(mfd_layout_images_batch), inner_batch_size) + + for j in iterater: + try: + pdf_index = pdf_index_batch[j:j+inner_batch_size] + page_ids = page_ids_batch[j:j+inner_batch_size] + mfd_images = mfd_layout_images_batch[j:j+inner_batch_size] + images = layout_images_batch[j:j+inner_batch_size] + heights = heights_batch[j:j+inner_batch_size] + widths = widths_batch[j:j+inner_batch_size] + oimages = oimage_list[j:j+inner_batch_size] if oimage_list is not None else None + detimages = det_layout_images_batch[j:j+inner_batch_size] + + layout_pair = (images, layout_model) + mdf_pair = (mfd_images, mfd_model) + det_pair = (detimages, ocrmodel.batch_det_model.net) + size_pair = (heights, widths,inner_batch_size) + pdf_paths = [dataset.metadata[pdf_index]['path'] for pdf_index in pdf_index] + location_pair = (pdf_paths, page_ids) + await gpu_inference(queue, layout_pair, mdf_pair, det_pair, size_pair, location_pair, data_to_save, timer) + except KeyboardInterrupt: + raise + except: + 
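+ # a failed mini-batch is logged and skipped so the remaining batches keep running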
traceback.print_exc() + print("ERROR: Fail to process batch") + update_seq = len(new_pdf_processed) + if pbar:pbar.update(update_seq) + timer.log() + model_train.append(time.time() - last_record_time);last_record_time =time.time() + if pbar:pbar.set_description(f"[Data][{np.mean(data_loading[-10:]):.2f}] [Model][{np.mean(model_train[-10:]):.2f}]") + batch = featcher.next() + if pbar is None: + pbar = tqdm(total=len(dataset.metadata)-update_seq,position=2,desc="PDF Pages",leave=False, bar_format='{l_bar}{bar}{r_bar}') + + await queue.join() + await queue.put(None) # Signal the consumer to exit + await postprocess_task # Wait for the consumer to finish + + tqdm.write("we finish generate data, lets collect and save it") + ### next, we construct each result for each pdf in pdf wise and remove the page_id by the list position + save_result(data_to_save,dataset,result_path) + +async def gpu_inference(queue, + layout_pair, + mdf_pair, + det_pair, + size_pair, + location_pair, + data_to_save, + timer): + layout_images, layout_model = layout_pair + mfd_images, mfd_model = mdf_pair + detimages, det_model = det_pair + heights, widths,inner_batch_size = size_pair + pdf_paths, page_ids= location_pair + with timer('get_layout'): + layout_res = inference_layout((layout_images,heights, widths),layout_model,inner_batch_size) + with timer('get_mfd'): + mfd_res = inference_mfd(mfd_images,mfd_model,inner_batch_size) + with timer('combine_layout_mfd_result'): + rough_layout_this_batch, ori_shape_list = combine_layout_mfd_result(layout_res, mfd_res, heights, widths) + + pdf_and_page_id_this_batch=[] + for pdf_path, page_id, layout_dets,ori_shape in zip(pdf_paths, page_ids, rough_layout_this_batch,ori_shape_list): + page_id = int(page_id) + if pdf_path not in data_to_save: + data_to_save[pdf_path] = {'height':ori_shape[0], 'width':ori_shape[1]} + data_to_save[pdf_path][page_id] = layout_dets + pdf_and_page_id_this_batch.append((pdf_path, page_id)) + + + with timer('text_detection/collect_for_line_detect'): + det_height, det_width = detimages.shape[2:] + scale_height = int(heights[0])/int(det_height) + scale_width = int(widths[0])/int(det_width) + assert scale_height == scale_width + assert scale_height == 2 + canvas_tensor_this_batch, partition_per_batch,_,_ = collect_paragraph_image_and_its_coordinate(detimages, rough_layout_this_batch,scale_height) # 2 is the scale between detiamge and box_images + with timer('text_detection/stack'): + canvas_tensor_this_batch = torch.stack(canvas_tensor_this_batch) + with timer('text_detection/det_net'): + dt_boxaes_batch = inference_det(canvas_tensor_this_batch,det_model,128) + + + torch.cuda.synchronize() + result = (dt_boxaes_batch,partition_per_batch,pdf_and_page_id_this_batch) + await queue.put(result) + +async def cpu_postprocess(queue, ocrmodel,data_to_save): + while True: + state = await queue.get() + if state is None: + queue.task_done() + break + dt_boxaes_batch,partition_per_batch,pdf_and_page_id_this_batch = state + dt_boxes_list = det_postprocess(dt_boxaes_batch,ocrmodel) + for partition_id in range(len(partition_per_batch)-1): + pdf_path, page_id = pdf_and_page_id_this_batch[partition_id] + partition_start = partition_per_batch[partition_id] + partition_end = partition_per_batch[partition_id+1] + dt_boxes_this_partition = dt_boxes_list[partition_start:partition_end] + for dt_boxes in dt_boxes_this_partition: #(1, 4, 2) + for line_box in dt_boxes: + p1, p2, p3, p4 = line_box.tolist() + data_to_save[pdf_path][page_id].append( + { + 'category_id': 15, + 'poly': p1 
+ p2 + p3 + p4, + } + ) + queue.task_done() + + +if __name__ == "__main__": + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + layout_model = get_layout_model(model_configs,accelerated=False) + #layout_model.compile() + total_memory = get_gpu_memory() + inner_batch_size = 16 if total_memory > 60 else 2 + print(f"totally gpu memory is {total_memory} we use inner batch size {inner_batch_size}") + mfd_model = get_batch_YOLO_model(model_configs, inner_batch_size) + ocrmodel = None + ocrmodel = ocr_model = ModifiedPaddleOCR(show_log=True) + timer = Timers(False,warmup=5) + + asyncio.run(deal_with_one_dataset_async("debug.jsonl", + "debug.stage_1.jsonl", + layout_model, mfd_model, ocrmodel=ocrmodel, + inner_batch_size=inner_batch_size, batch_size=16,num_workers=4, + timer=timer)) + + + \ No newline at end of file diff --git a/batch_running_task/task_layout/run_layout.sh b/batch_running_task/task_layout/run_layout.sh new file mode 100644 index 0000000..58cd9ef --- /dev/null +++ b/batch_running_task/task_layout/run_layout.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -J ParseSciHUB +#SBATCH -o .log/%j-ParseSciHUB.out +#SBATCH -e .log/%j-ParseSciHUB.out +export LD_LIBRARY_PATH=/mnt/cache/share/gcc/gcc-7.5.0/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +export PATH=/mnt/cache/share/gcc/gcc-7.5.0/bin:$PATH + +GCC_VERSION=$(gcc -v 2>&1 | grep "gcc version" | awk '{print $3}') + +# Required version +REQUIRED_VERSION="7.5.0" + +# Check if the version matches +if [ "$GCC_VERSION" != "$REQUIRED_VERSION" ]; then + echo "[`hostname`] GCC version is not $REQUIRED_VERSION. Exiting." + exit 1 +else + echo "[`hostname`] GCC version is $REQUIRED_VERSION." +fi + +python batch_deal_with_layout.py --root_path $1 --index_part $2 --num_parts $3 --inner_batch_size 16 --batch_size 16 --num_workers 8 --redo # --accelerated_layout --accelerated_mfd \ No newline at end of file diff --git a/batch_running_task/task_layout/run_layout_for_missing_page.sh b/batch_running_task/task_layout/run_layout_for_missing_page.sh new file mode 100644 index 0000000..ccd172c --- /dev/null +++ b/batch_running_task/task_layout/run_layout_for_missing_page.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -J ParseSciHUB +#SBATCH -o .log/%j-ParseSciHUB.out +#SBATCH -e .log/%j-ParseSciHUB.out +export LD_LIBRARY_PATH=/mnt/cache/share/gcc/gcc-7.5.0/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +export PATH=/mnt/cache/share/gcc/gcc-7.5.0/bin:$PATH + +GCC_VERSION=$(gcc -v 2>&1 | grep "gcc version" | awk '{print $3}') + +# Required version +REQUIRED_VERSION="7.5.0" + +# Check if the version matches +if [ "$GCC_VERSION" != "$REQUIRED_VERSION" ]; then + echo "[`hostname`] GCC version is not $REQUIRED_VERSION. Exiting." + exit 1 +else + echo "[`hostname`] GCC version is $REQUIRED_VERSION." 
+fi + +python batch_running_task/task_layout/batch_deal_with_layout_fixmissing_page.py --root_path $1 --index_part $2 --num_parts $3 --inner_batch_size 16 --batch_size 16 --num_workers 8 --accelerated_mfd --shuffle # --accelerated_layout \ No newline at end of file diff --git a/batch_running_task/task_layout/scan_finished.missingpart.py b/batch_running_task/task_layout/scan_finished.missingpart.py new file mode 100644 index 0000000..66a9b2e --- /dev/null +++ b/batch_running_task/task_layout/scan_finished.missingpart.py @@ -0,0 +1,52 @@ + +from batch_deal_with_layout import * +version = "fix_missing_page_version2" +client=None +finished_file_list = [] +if __name__ == '__main__': + + from tqdm.auto import tqdm + import traceback + parser = ArgumentParser() + parser.add_arguments(BatchLayoutConfig, dest="config") + args = parser.parse_args() + args = args.config + #args.check_lock = hostname.startswith('SH') + assert not args.async_mode, "async_mode is not safe, please disable it" + all_file_list = obtain_processed_filelist(args) + + for inputs_line in tqdm(all_file_list, leave=False, position=1): + + splited_line = inputs_line.split() + inputs_path = splited_line[0] + filename = os.path.basename(inputs_path) + #assert "layoutV" in inputs_path + result_save_root = os.path.join(os.path.dirname(os.path.dirname(inputs_path)),version) + #inputs_path = os.path.join(INPUT_LOAD_PATH,filename) + + if inputs_path.startswith('s3'): + inputs_path = "opendata:"+inputs_path + # assert inputs_path.startswith('opendata:s3') + # assert result_path.startswith('opendata:s3') + if client is None: + client = build_client() + if not check_path_exists(inputs_path,client): + tqdm.write(f"[Skip]: no {inputs_path} ") + continue + + POSSIABLE_RESULT_SAVE_DIR_LIST=[ + result_save_root, + os.path.join("opendata:s3://llm-pdf-text/pdf_gpu_output/ebook_index_v4/scihub/v001/scihub/"), + ] + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename) + if check_path_exists(result_old_path,client) and not args.redo: + #tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip: + finished_file_list.append(inputs_path) + with open('scan_finished.missingpart','w') as f: + f.write('\n'.join(finished_file_list)) \ No newline at end of file diff --git a/batch_running_task/task_mfr/batch_deal_with_mfr.py b/batch_running_task/task_mfr/batch_deal_with_mfr.py new file mode 100644 index 0000000..ba4662d --- /dev/null +++ b/batch_running_task/task_mfr/batch_deal_with_mfr.py @@ -0,0 +1,141 @@ + +import warnings +warnings.filterwarnings("ignore", category=RuntimeWarning) +from rough_mfr import * +import yaml +# from rough_layout_with_aync import * ## async is not safe, lets disable it +from batch_running_task.get_data_utils import * +RESULT_SAVE_PATH="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared" +#RESULT_SAVE_PATH="tianning:s3://temp/debug" +INPUT_LOAD_PATH="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" +LOCKSERVER="http://10.140.52.123:8000" +from datetime import datetime,timedelta +import socket +hostname= socket.gethostname() +from batch_run_utils import BatchModeConfig, process_files,dataclass,obtain_processed_filelist +from simple_parsing import ArgumentParser +from tqdm.auto import tqdm +import traceback + +@dataclass +class BatchMFRConfig(BatchModeConfig): + inner_batch_size: int = 16 + batch_size: int = 16 + num_workers: int = 4 + result_save_path: str=RESULT_SAVE_PATH + check_lock: bool = True + 
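+ # update_origin: when True, recognized LaTeX is written back into the source layout records instead of being emitted as a separate patch file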
update_origin: bool = False +if __name__ == '__main__': + task_name = "physics_part" + version = "final2" + + parser = ArgumentParser() + parser.add_arguments(BatchMFRConfig, dest="config") + args = parser.parse_args() + args = args.config + all_file_list = obtain_processed_filelist(args) + + if len(all_file_list)==0: + exit() + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + + layout_model = None + mfd_model = None + client = None + mfr_model = None + page_num_map_whole = None #get_page_num_map_whole() + for inputs_path in tqdm(all_file_list, leave=False, position=1): + filename = os.path.basename(inputs_path) + result_save_root = os.path.join(args.result_save_path, task_name, version) + + if inputs_path.startswith('s3'): + inputs_path = "opendata:"+inputs_path + # assert inputs_path.startswith('opendata:s3') + # assert result_path.startswith('opendata:s3') + if client is None: + client = build_client() + if not check_path_exists(inputs_path,client): + tqdm.write(f"[Skip]: no {inputs_path} ") + continue + + POSSIABLE_RESULT_SAVE_DIR_LIST=[ + #os.path.join(args.result_save_path, task_name, "mfr_patch"), + os.path.join(args.result_save_path, task_name, version), + os.path.join("opendata:s3://llm-pdf-text/pdf_gpu_output/ebook_index_v4/scihub/v001/scihub/"), + ] + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename) + if check_path_exists(result_old_path,client) and not args.redo: + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + + partion_num = 1 + for partion_idx in range(partion_num): + if partion_num > 1: + filename_with_partion = f"{filename.replace('.jsonl','')}.{partion_idx:02d}_{partion_num:02d}.jsonl" + else: + filename_with_partion = filename + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename_with_partion) + if not args.redo and check_path_exists(result_old_path,client): + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + result_path = os.path.join(result_save_root, filename_with_partion) + if args.check_lock: + lock_path = os.path.join(LOCKSERVER, "checklocktime", filename_with_partion) + last_start_time = check_lock_and_last_start_time(lock_path,client) + if last_start_time and not args.redo: + date_string = last_start_time + date_format = "%Y-%m-%d %H:%M:%S" + date = datetime.strptime(date_string, date_format) + deltatime = datetime.now() - date + if deltatime < timedelta(hours=1): + tqdm.write(f"[Skip]: {filename_with_partion} is locked by {date_string} created at {last_start_time} [now is {deltatime}]") + continue + + create_last_start_time_lock(os.path.join(LOCKSERVER,"createlocktime", filename_with_partion),client) + + print(f"now we deal with {inputs_path} to {result_path}") + os.makedirs(os.path.dirname(result_path), exist_ok=True) + + if mfr_model is None: + mfr_model, mfr_transform = mfr_model_init(model_configs['model_args']['mfr_weight'], device=device) + try: + deal_with_one_dataset(inputs_path, result_path, mfr_model, mfr_transform, + #batch_size = args.batch_size, + pdf_batch_size=16, 
image_batch_size=128, + num_workers = args.num_workers, + partion_num = partion_num, + partion_idx = partion_idx,update_origin=args.update_origin) + print(f""" +========================================= +finish dealing with {result_path} +========================================= + """) + except: + raise + traceback.print_exc() + tqdm.write(f"[Error]: {filename_with_partion} failed") + finally: + pass \ No newline at end of file diff --git a/batch_running_task/task_mfr/rough_mfr.py b/batch_running_task/task_mfr/rough_mfr.py new file mode 100644 index 0000000..5ad22fb --- /dev/null +++ b/batch_running_task/task_mfr/rough_mfr.py @@ -0,0 +1,224 @@ +import os,sys,warnings +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ['CUDA_MODULE_LOADING'] = 'LAZY' +warnings.simplefilter(action='ignore', category=FutureWarning) +warnings.simplefilter(action='ignore', category=UserWarning) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from get_data_utils import * +from tqdm.auto import tqdm +from torch.utils.data import Dataset, TensorDataset, DataLoader +from dataaccelerate import DataPrefetcher +from scihub_pdf_dataset import MFRImageDataset,rec_collate_fn,deal_with_one_pdf,none_collate_fn,clean_pdf_path,Timers +import yaml,re +from torchvision import transforms +from task_mfr.unimernet_modeling import DonutTokenizer +try: + client=build_client() +except: + client=None +eps=1e-7 +import argparse +import torch +def latex_rm_whitespace(s: str): + """Remove unnecessary whitespace from LaTeX code. + """ + text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})' + letter = '[a-zA-Z]' + noletter = '[\W_^\d]' + names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)] + s = re.sub(text_reg, lambda match: str(names.pop(0)), s) + news = s + while True: + s = news + news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s) + news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news) + news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news) + if news == s: + break + return s + +def mfr_model_init2(weight_dir, device='cpu',batch_size=128): + args = argparse.Namespace(cfg_path="modules/UniMERNet/configs/demo.yaml", options=None) + import unimernet.tasks as tasks + from unimernet.common.config import Config + from unimernet.processors import load_processor + cfg = Config(args) + cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin") + cfg.config.model.model_config.model_name = weight_dir + cfg.config.model.tokenizer_config.path = weight_dir + task = tasks.setup_task(cfg) + model = task.build_model(cfg) + model = model.to(device) + vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval) + mfr_transform = transforms.Compose([vis_processor, ]) + def gpu_inference(model, imgs): + return model.generate({'image': imgs})['pred_str'] + model.gpu_inference=gpu_inference + return model, mfr_transform + +def mfr_model_init(weight_dir, device='cpu',batch_size=128): + from tensorrt_llm.runtime import MultimodalModelRunner + from transformers import NougatProcessor,NougatImageProcessor + weight_dir ='models/MFR/unimernet' + args = argparse.Namespace(max_new_tokens=30, batch_size=batch_size, log_level='info', + visual_engine_dir=f'{weight_dir}/trt_engines.b{batch_size}/vision_encoder/', + visual_engine_name='model.engine', + llm_engine_dir=f'{weight_dir}/trt_engines.b{batch_size}/unimernet/1-gpu/bfloat16', + hf_model_dir=weight_dir, + input_text=None, 
num_beams=1, top_k=1, top_p=0.0, + temperature=1.0, repetition_penalty=1.0, + run_profiling=False, profiling_iterations=20, + check_accuracy=False, video_path=None, + image_path=None, path_sep=',', + enable_context_fmha_fp32_acc=None) + + tokenizer = DonutTokenizer(weight_dir) + model = MultimodalModelRunner(args) + vis_processor = NougatProcessor.from_pretrained(weight_dir).image_processor + + def gpu_inference(model, processed_image, batch_size=args.batch_size): + assert batch_size>=len(processed_image) + need_padding = batch_size - len(processed_image) + origin_length= len(processed_image) + processed_image = torch.nn.functional.pad(processed_image,(0,0,0,0,0,0,0,need_padding)).contiguous() + pre_prompt = ['Question: which city is this? Answer:']*len(processed_image) + post_prompt= [None]*len(processed_image) + decoder_input_ids = torch.IntTensor([[0]]) + decoder_input_ids = decoder_input_ids.repeat((batch_size, 1)) + max_new_tokens=30 + attention_mask=None + + + output_text = model.generate(pre_prompt, + post_prompt, + processed_image, + decoder_input_ids, + max_new_tokens, + attention_mask=attention_mask, + warmup=False) + output_text = output_text[:origin_length] + output_text = [t[0] for t in output_text] + return output_text + model.gpu_inference=gpu_inference + return model, vis_processor + + +class TensorDataset(Dataset): + def __init__(self, img_list): + self.img_list = img_list + def __len__(self): + return len(self.img_list) + + def __getitem__(self, idx): + return idx, self.img_list[idx] + +def deal_with_one_dataset(pdf_path, result_path, mfr_model, mfr_transform, + pdf_batch_size =32, + image_batch_size=256, + num_workers=8, + partion_num = 1, + partion_idx = 0, update_origin=False): + images_dataset = MFRImageDataset(pdf_path,mfr_transform,partion_num = partion_num, partion_idx = partion_idx) + data_to_save = fast_deal_with_one_dataset(images_dataset,mfr_model, + pdf_batch_size =pdf_batch_size, + image_batch_size=image_batch_size,num_workers=num_workers,update_origin=update_origin) + write_jsonl_to_path(data_to_save,result_path,images_dataset.client) + + +def fast_deal_with_one_dataset(images_dataset:MFRImageDataset, + mfr_model, + pdf_batch_size =32, + image_batch_size=256, + num_workers=8,update_origin=False): + + image_collecter = DataLoader(images_dataset,batch_size=pdf_batch_size,collate_fn=none_collate_fn, + num_workers=num_workers,pin_memory=False, + prefetch_factor=2) + location_to_mfr = {} + + for image_pool_list in tqdm(image_collecter,position=1,leave=True,desc="Images batch"): + no_image_pdf_list = [] + locations = [] + image_tensors = [] + for idx,(pdf_path, image_dict) in enumerate(image_pool_list): + if len(image_dict)==0: + no_image_pdf_list.append(pdf_path) + continue + for key,tensor in image_dict.items(): + locations.append(key) + image_tensors.append(tensor) + if len(image_tensors) == 0: + #tqdm.write("no mfr result, skip") + continue + + + dataset = TensorDataset(image_tensors) + if len(dataset)<=image_batch_size: + adapat_num_workers = 0 + elif len(dataset)<=2*image_batch_size: + adapat_num_workers = 1 + else: + adapat_num_workers = num_workers + dataloader_group = DataLoader(dataset, batch_size=image_batch_size, num_workers=adapat_num_workers, pin_memory=True, pin_memory_device='cuda') + featcher = DataPrefetcher(dataloader_group,device='cuda') + pbar = tqdm(total=len(dataset),position=2,leave=False,desc="GPU batch") + batch = featcher.next() + indexes=[] + mfr_res=[] + while batch is not None: + index, imgs = batch + output = 
mfr_model.gpu_inference(mfr_model, imgs) + mfr_res.extend(output) + indexes.extend([t.item() for t in index]) + pbar.update(len(imgs)) + batch = featcher.next() + assert len(mfr_res) == len(image_tensors) + + for index, latex in zip(indexes, mfr_res): + location = locations[index] + location_to_mfr[location] = latex_rm_whitespace(latex) + + + + patch_metadata_list = [] + for pdf_index, pdf_metadata in enumerate(tqdm(images_dataset.metadata)): + pdf_path = clean_pdf_path(pdf_metadata['path']) + patch_metadata = {'path':pdf_path,'doc_layout_result':[]} + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + #print(pdf_page_metadata) + this_line_pool = {'page_id':page_id, 'layout_dets':[]} + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id'] not in [13,14]:continue + category_id = bbox_metadata['category_id'] + bbox_id = tuple(bbox_metadata['poly']) + location= (pdf_path,page_id,bbox_id) + if location not in location_to_mfr: + if not update_origin:print(f"WARNING: one page {location} is not regitered, usually it is because page load fail") + continue + latex = location_to_mfr[location] + if update_origin: + bbox_metadata.update({'latex':latex}) + else: + this_line_pool['layout_dets'].append({'category_id':category_id, 'latex':latex}) + patch_metadata['doc_layout_result'].append(this_line_pool) + patch_metadata_list.append(patch_metadata) + if update_origin: + return images_dataset.metadata + else: + return patch_metadata_list + +if __name__ == "__main__": + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + device = model_configs['model_args']['device'] + image_batch_size=128 + mfr_model, mfr_transform = mfr_model_init(model_configs['model_args']['mfr_weight'], device=device, batch_size= image_batch_size) + images_dataset = MFRImageDataset("0000000-0000209.01000_00001.jsonl",mfr_transform) + images_dataset[0] + patch_metadata_list = fast_deal_with_one_dataset(images_dataset,mfr_model, + pdf_batch_size =2, + image_batch_size=image_batch_size,num_workers=8) + write_jsonj_to_path(patch_metadata_list, "test_result/result.mfr.test3.jsonl", None) + \ No newline at end of file diff --git a/batch_running_task/task_mfr/run_mfr.sh b/batch_running_task/task_mfr/run_mfr.sh new file mode 100644 index 0000000..60deed7 --- /dev/null +++ b/batch_running_task/task_mfr/run_mfr.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#SBATCH -J ParseSciHUB +#SBATCH -o .log/%j-ParseSciHUB.out +#SBATCH -e .log/%j-ParseSciHUB.out +export LD_LIBRARY_PATH=/mnt/cache/share/gcc/gcc-7.5.0/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +export PATH=/mnt/cache/share/gcc/gcc-7.5.0/bin:$PATH + +GCC_VERSION=$(gcc -v 2>&1 | grep "gcc version" | awk '{print $3}') + +# Required version +REQUIRED_VERSION="7.5.0" + +# Check if the version matches +if [ "$GCC_VERSION" != "$REQUIRED_VERSION" ]; then + echo "[`hostname`] GCC version is not $REQUIRED_VERSION. Exiting." + exit 1 +else + echo "[`hostname`] GCC version is $REQUIRED_VERSION." 
+fi +export OPENMPIPATH=/mnt/petrelfs/share/openmpi-3.1.2-cuda9.0 +export PATH=$OPENMPIPATH/bin:$PATH +export LD_LIBRARY_PATH=$OPENMPIPATH/lib:$LD_LIBRARY_PATH +python batch_running_task/task_mfr/batch_deal_with_mfr.py --root_path $1 --index_part $2 --num_parts $3 --shuffle --num_workers 8 --update_origin \ No newline at end of file diff --git a/batch_running_task/task_mfr/unimernet_modeling.py b/batch_running_task/task_mfr/unimernet_modeling.py new file mode 100644 index 0000000..74c2d14 --- /dev/null +++ b/batch_running_task/task_mfr/unimernet_modeling.py @@ -0,0 +1,813 @@ +import re +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from typing import Optional, Tuple, Union, List +from dataclasses import dataclass +from ftfy import fix_text +from transformers import PreTrainedTokenizerFast +from transformers import DonutSwinConfig, VisionEncoderDecoderConfig +from transformers import AutoModel, VisionEncoderDecoderModel, AutoImageProcessor, MBartForCausalLM +from unimernet_processor import VariableDonutProcessor, VariableDonutImageProcessor +from transformers.models.mbart.modeling_mbart import MBartDecoder +from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import shift_tokens_right +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput, CausalLMOutputWithCrossAttentions, BaseModelOutputWithPastAndCrossAttentions +from transformers.models.donut.modeling_donut_swin import DonutSwinPatchEmbeddings, DonutSwinEmbeddings, DonutSwinModel, DonutSwinEncoder +from transformers.utils import logging, ModelOutput + + +logger = logging.get_logger(__name__) + + +class VariableDonutSwinConfig(DonutSwinConfig): + pass + + +class VariableDonutSwinEmbeddings(DonutSwinEmbeddings): + """ + Construct the patch and position embeddings. Optionally, also the mask token. 
+ """ + + def __init__(self, config, use_mask_token=False): + super().__init__(config, use_mask_token) + + self.patch_embeddings = DonutSwinPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None + self.position_embeddings = None + + if config.use_absolute_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim)) + + self.row_embeddings = None + self.column_embeddings = None + if config.use_2d_embeddings: + self.row_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[0] + 1, config.embed_dim)) + self.column_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[1] + 1, config.embed_dim)) + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None + ) -> Tuple[torch.Tensor]: + + embeddings, output_dimensions = self.patch_embeddings(pixel_values) + # Layernorm across the last dimension (each patch is a single row) + embeddings = self.norm(embeddings) + batch_size, seq_len, embed_dim = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + if self.position_embeddings is not None: + embeddings = embeddings + self.position_embeddings[:, :seq_len, :] + + if self.row_embeddings is not None and self.column_embeddings is not None: + # Repeat the x position embeddings across the y axis like 0, 1, 2, 3, 0, 1, 2, 3, ... + row_embeddings = self.row_embeddings[:, :output_dimensions[0], :].repeat_interleave(output_dimensions[1], + dim=1) + column_embeddings = self.column_embeddings[:, :output_dimensions[1], :].repeat(1, output_dimensions[0], 1) + + embeddings = embeddings + row_embeddings + column_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings, output_dimensions + +class VariableDonutSwinModel(DonutSwinModel): + config_class = VariableDonutSwinConfig + + def __init__(self, config, add_pooling_layer=True, use_mask_token=False): + super().__init__(config) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1)) + + self.embeddings = VariableDonutSwinEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid) + + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + +@dataclass +class CausalLMOutputWithCrossAttentionsAndCounting(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. 
+ """ + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + counting: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class CustomMBartDecoder(MBartDecoder): + def __init__(self, config): + print("CustomMBartDecoder init") + super().__init__(config) + hidden_size = config.d_model + self.counting_context_weight = nn.Sequential( + nn.Linear(config.vocab_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, config.d_model) + ) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + count_pred: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of + shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing + `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more + control over how to convert `input_ids` indices into associated vectors than the model's internal + embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self._use_flash_attention_2: + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, 
src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input, past_key_values_length) + + hidden_states = inputs_embeds + positions.to(inputs_embeds.device) + + # TODO: add counting context weight to hidden_states + if count_pred is not None: + count_context_weight = self.counting_context_weight(count_pred) + hidden_states = hidden_states + 0.5 * count_context_weight.unsqueeze(1) + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != len(self.layers): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {attn_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return 
BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class CustomMBartForCausalLM(MBartForCausalLM): + def __init__(self, config, length_aware=True): + print("CustomMBartForCausalLM init") + super().__init__(config) + # Modify the decoder within MBartDecoderWrapper + self.model.decoder = CustomMBartDecoder(config) + self.counting_decoder = SeqCountingDecoder(config.d_model, config.vocab_size) + self.length_aware = length_aware + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + count_gt: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MBartForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25") + >>> model = MBartForCausalLM.from_pretrained("facebook/mbart-large-cc25", add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." 
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] + >>> list(logits.shape) == expected_shape + True + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.length_aware: + count_pred = self.counting_decoder(encoder_hidden_states) + else: + count_pred = None + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + count_pred=count_pred, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits, count_pred) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentionsAndCounting( + loss=loss, + logits=logits, + counting=count_pred, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +class CustomVisionEncoderDecoderModel(VisionEncoderDecoderModel): + def __init__(self, config, length_aware=True): + print("CustomVisionEncoderDecoderModel init") + super().__init__(config) + # Replace the MBartForCausalLM with your CustomMBartForCausalLM + self.decoder = CustomMBartForCausalLM(self.config.decoder, length_aware=length_aware) + self.length_aware = length_aware + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, VisionEncoderDecoderModel + >>> import requests + >>> from PIL import Image + >>> import torch + + >>> processor = AutoProcessor.from_pretrained("microsoft/trocr-base-handwritten") + >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") + + >>> # load image from the IAM dataset + >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + >>> # training + >>> 
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id + >>> model.config.pad_token_id = processor.tokenizer.pad_token_id + >>> model.config.vocab_size = model.config.decoder.vocab_size + + >>> pixel_values = processor(image, return_tensors="pt").pixel_values + >>> text = "hello world" + >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids + >>> outputs = model(pixel_values=pixel_values, labels=labels) + >>> loss = outputs.loss + + >>> # inference (generation) + >>> generated_ids = model.generate(pixel_values) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + if encoder_outputs is None: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + encoder_outputs = self.encoder( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs_encoder, + ) + elif isinstance(encoder_outputs, tuple): + encoder_outputs = BaseModelOutput(*encoder_outputs) + + encoder_hidden_states = encoder_outputs[0] + + # optionally project encoder_hidden_states + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states) + + # else: + encoder_attention_mask = None + + if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None): + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + past_key_values=past_key_values, + return_dict=return_dict, + **kwargs_decoder, + ) + + # Compute loss independent from decoder (as some shift the logits inside them) + loss = None + if labels is not None: + logits = decoder_outputs.logits if return_dict else decoder_outputs[0] + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.reshape(-1)) + count_gt = kwargs_decoder.get("count_gt", None) + if self.length_aware and count_gt is not None: + count_gt = torch.log(count_gt.float() + 1) + count_pred = decoder_outputs.counting if return_dict else decoder_outputs[1] # dtype: torch.float32 + counting_loss_fct = nn.SmoothL1Loss() + counting_loss = counting_loss_fct(count_pred, count_gt) + # print("counting_loss", counting_loss) + loss += counting_loss + + + if not return_dict: + if loss is not None: + return (loss,) + decoder_outputs + encoder_outputs + else: + return decoder_outputs + encoder_outputs + + return Seq2SeqLMOutput( + loss=loss, + logits=decoder_outputs.logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +class SelfAttentionBlock(nn.Module): + def __init__(self, embed_size, num_heads): + super(SelfAttentionBlock, self).__init__() + self.self_attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=num_heads) + self.norm = nn.LayerNorm(embed_size) + + def forward(self, x): + attn_output, _ = self.self_attention(x, x, x) + x = self.norm(attn_output + x) + return x + +class SeqCountingDecoder(nn.Module): + def __init__(self, in_features, out_features, num_heads=8, num_layers=4): + super(SeqCountingDecoder, self).__init__() + self.attention_blocks = nn.ModuleList([ + SelfAttentionBlock(embed_size=in_features, num_heads=num_heads) + for _ in range(num_layers) + ]) + self.fc1 = nn.Linear(in_features, in_features // 2) + self.relu = nn.ReLU() + self.global_avg_pool = nn.AdaptiveAvgPool1d(1) + self.fc2 = nn.Linear(in_features // 2, out_features) + + def forward(self, x): + # x shape (bs, seq_len, d_model) + for block in self.attention_blocks: + x = block(x) + x = self.fc1(x) # Process each patch embedding + x = self.relu(x) + x = x.transpose(1, 2) # Prepare for global pooling + x = self.global_avg_pool(x) # Global pooling + x = x.squeeze(-1) # Remove the last dimension + x = self.fc2(x) # Predict token counts + return x # (bs, vocab_size) + + +class DonutEncoderDecoder(nn.Module): + + def __init__(self, model_name, num_tokens, pad_token_id, bos_token_id, eos_token_id, length_aware=True): + super().__init__() + config = VisionEncoderDecoderConfig.from_pretrained(model_name) + # encoder_config = vars(config.encoder) + # encoder = DonutSwinConfig(**encoder_config) + # config.encoder = encoder + # self.config = config + + # AutoModel.register(VariableDonutSwinConfig, VariableDonutSwinModel) + # # provider_options = { + # # "trt_engine_cache_enable": True, + # # "trt_engine_cache_path": "weights/unimernet_tensorRT/rt_cache" + # # } + + # # self.model = ORTModelForVision2Seq.from_pretrained("weights/unimernet_tensorRT", config=config, provider="TensorrtExecutionProvider", provider_options=provider_options) + + # self.model = CustomVisionEncoderDecoderModel.from_pretrained(model_name, config=self.config, length_aware=length_aware) + self.model = VisionEncoderDecoderModel.from_pretrained("weights/unimernet_clean") + + self.model.config.decoder_start_token_id = bos_token_id + self.model.config.pad_token_id = pad_token_id + self.model.config.eos_token_id = eos_token_id + self.model.decoder.resize_token_embeddings(num_tokens) + self.pad_token_id = pad_token_id + + def forward(self, pixel_values, decoder_input_ids, decoder_attention_mask, **kwargs): + num_channels = pixel_values.shape[1] + if num_channels == 1: + pixel_values = pixel_values.repeat(1, 3, 1, 1) + + labels = decoder_input_ids * 1 + labels = labels.masked_fill(labels == self.pad_token_id, -100) + + loss = self.model( + pixel_values=pixel_values, + decoder_input_ids=decoder_input_ids[:, :-1], + decoder_attention_mask=decoder_attention_mask[:, :-1], + labels=labels[:, 1:], + **kwargs # for Length-Aware Module + ).loss + return loss + + @torch.no_grad() + def generate(self, pixel_values, temperature, max_new_tokens, decoder_start_token_id, do_sample, top_p, + **kwargs): + + num_channels = pixel_values.shape[1] + if num_channels == 1: + pixel_values = pixel_values.repeat(1, 3, 1, 1) + outputs = self.model.generate( + 
pixel_values=pixel_values, + max_new_tokens=max_new_tokens, + decoder_start_token_id=decoder_start_token_id, + temperature=temperature, + do_sample=do_sample, + top_p=top_p, + ) + return outputs[:, 1:] + + + +class DonutTokenizer: + def __init__(self, path): + AutoImageProcessor.register(VariableDonutSwinConfig, VariableDonutImageProcessor) + processor = VariableDonutProcessor.from_pretrained(path) + processor.train = False + self.tokenizer = processor.tokenizer + self.max_seq_len = 2048 + self.pad_token_id = self.tokenizer.pad_token_id + self.bos_token_id = self.tokenizer.bos_token_id + self.eos_token_id = self.tokenizer.eos_token_id + + def __len__(self): + return len(self.tokenizer) + + def tokenize(self, texts, max_length=None): + if not max_length: + max_length = self.max_seq_len + text_inputs = self.tokenizer( + texts, + return_token_type_ids=False, + return_tensors="pt", + padding="longest", + truncation=True, + max_length=max_length, + ) + return text_inputs + + @staticmethod + def post_process(text): + text = fix_text(text) + return text + + def token2str(self, tokens) -> list: + generated_text = self.tokenizer.batch_decode(tokens, skip_special_tokens=True) + generated_text = [self.post_process(text) for text in generated_text] + return generated_text + + def detokenize(self, tokens): + toks = [self.tokenizer.convert_ids_to_tokens(tok) for tok in tokens] + for b in range(len(toks)): + for i in reversed(range(len(toks[b]))): + if toks[b][i] is None: + toks[b][i] = '' + toks[b][i] = toks[b][i].replace('Ġ', ' ').strip() + if toks[b][i] in ([self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token]): + del toks[b][i] + return toks + +AutoImageProcessor.register(VariableDonutSwinConfig, VariableDonutImageProcessor) \ No newline at end of file diff --git a/batch_running_task/task_mfr/unimernet_processor.py b/batch_running_task/task_mfr/unimernet_processor.py new file mode 100644 index 0000000..9dce532 --- /dev/null +++ b/batch_running_task/task_mfr/unimernet_processor.py @@ -0,0 +1,192 @@ +from typing import Dict, Union, Optional, List + +from torch import TensorType +from transformers import DonutImageProcessor, DonutProcessor +from transformers.image_processing_utils import BatchFeature +from transformers.image_transforms import pad +from transformers.image_utils import PILImageResampling, ImageInput, ChannelDimension, make_list_of_images, \ + valid_images, to_numpy_array, is_scaled_image, get_image_size +import numpy as np +import PIL +import logging + +logger = logging.getLogger() + +IMAGE_STD = [0.229, 0.224, 0.225] +IMAGE_MEAN = [0.485, 0.456, 0.406] + + +class VariableDonutImageProcessor(DonutImageProcessor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def numpy_resize(self, image: np.ndarray, size, resample): + image = PIL.Image.fromarray(image) + resized = self.pil_resize(image, size, resample) + resized = np.array(resized, dtype=np.uint8) + resized_image = resized.transpose(2, 0, 1) + + return resized_image + + def pil_resize(self, image: PIL.Image.Image, size, resample): + width, height = image.size + max_width, max_height = size["width"], size["height"] + if width != max_width or height != max_height: + # Shrink to fit within dimensions + width_scale = max_width / width + height_scale = max_height / height + scale = min(width_scale, height_scale) + + new_width = min(int(width * scale), max_width) + new_height = min(int(height * scale), max_height) + + image = image.resize((new_width, new_height), resample) + + 
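+        # Note (added comment): PIL's Image.thumbnail() only ever shrinks in place and keeps the
+        # aspect ratio, so it is a no-op when the resize above already fits within
+        # (max_width, max_height); it just guarantees the size assertion below holds.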
image.thumbnail((max_width, max_height), resample) + + assert image.width <= max_width and image.height <= max_height + + return image + + def process_inner(self, images: List[List], train=False): + # This will be in list of lists format, with height x width x channel + assert isinstance(images[0], (list, np.ndarray)) + + # convert list of lists format to array + if isinstance(images[0], list): + # numpy unit8 needed for augmentation + np_images = [np.array(img, dtype=np.uint8) for img in images] + else: + np_images = [img.astype(np.uint8) for img in images] + + assert np_images[0].shape[2] == 3 # RGB input images, channel dim last + + # This also applies the right channel dim format, to channel x height x width + np_images = [self.numpy_resize(img, self.max_size, self.resample) for img in np_images] + assert np_images[0].shape[0] == 3 # RGB input images, channel dim first + + # Convert to float32 for rescale/normalize + np_images = [img.astype(np.float32) for img in np_images] + + # Pads with 255 (whitespace) + # Pad to max size to improve performance + max_size = self.max_size + np_images = [ + self.pad_image( + image=image, + size=max_size, + random_padding=train, # Change amount of padding randomly during training + input_data_format=ChannelDimension.FIRST, + pad_value=255.0 + ) + for image in np_images + ] + + # Rescale and normalize + np_images = [ + self.rescale(img, scale=self.rescale_factor, input_data_format=ChannelDimension.FIRST) + for img in np_images + ] + np_images = [ + self.normalize(img, mean=self.image_mean, std=self.image_std, input_data_format=ChannelDimension.FIRST) + for img in np_images + ] + + return np_images + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_thumbnail: bool = None, + do_align_long_axis: bool = None, + do_pad: bool = None, + random_padding: bool = False, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." 
+ ) + + # Convert to numpy for later processing steps + images = [to_numpy_array(image) for image in images] + + images = self.process_inner(images, train=False) + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def pad_image( + self, + image: np.ndarray, + size: Dict[str, int], + random_padding: bool = False, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_value: float = 0.0, + ) -> np.ndarray: + output_height, output_width = size["height"], size["width"] + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + + delta_width = output_width - input_width + delta_height = output_height - input_height + + assert delta_width >= 0 and delta_height >= 0 + + if random_padding: + pad_top = np.random.randint(low=0, high=delta_height + 1) + pad_left = np.random.randint(low=0, high=delta_width + 1) + else: + pad_top = delta_height // 2 + pad_left = delta_width // 2 + + pad_bottom = delta_height - pad_top + pad_right = delta_width - pad_left + + padding = ((pad_top, pad_bottom), (pad_left, pad_right)) + return pad(image, padding, data_format=data_format, input_data_format=input_data_format, + constant_values=pad_value) + + +class VariableDonutProcessor(DonutProcessor): + def __init__(self, image_processor=None, tokenizer=None, train=False, **kwargs): + if image_processor is None: + raise ValueError("You need to specify an `image_processor`.") + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self._in_target_context_manager = False + self.train = train + + def __call__(self, *args, **kwargs): + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + images = kwargs.pop("images", None) + text = kwargs.pop("text", None) + if len(args) > 0: + images = args[0] + args = args[1:] + + if images is None: + raise ValueError("You need to specify images to process.") + + inputs = self.image_processor(images, *args, **kwargs) + return inputs diff --git a/batch_running_task/task_rec/batch_deal_with_rec.py b/batch_running_task/task_rec/batch_deal_with_rec.py new file mode 100644 index 0000000..0f01869 --- /dev/null +++ b/batch_running_task/task_rec/batch_deal_with_rec.py @@ -0,0 +1,156 @@ + +import warnings +warnings.filterwarnings("ignore", category=RuntimeWarning) +from rough_rec import * +import yaml +# from rough_layout_with_aync import * ## async is not safe, lets disable it +from get_data_utils import * +RESULT_SAVE_PATH="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared" +#RESULT_SAVE_PATH="tianning:s3://temp/debug" +INPUT_LOAD_PATH="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" +import socket +hostname= socket.gethostname() +LOCKSERVER="http://10.140.52.123:8000" if hostname.startswith('SH') else "http://paraai-n32-h-01-ccs-master-2:32453" +from datetime import datetime,timedelta +import socket +hostname= socket.gethostname() +from batch_run_utils import BatchModeConfig, process_files,dataclass,obtain_processed_filelist +from simple_parsing import ArgumentParser +from tqdm.auto import tqdm +import traceback + +@dataclass +class BatchRECConfig(BatchModeConfig): + image_batch_size: int = 256 + pdf_batch_size: int = 32 + num_workers: int = 4 + result_save_path: str=RESULT_SAVE_PATH + check_lock: bool = True + update_origin: bool 
= False + compile: bool = False + replace:bool=False +if __name__ == '__main__': + task_name = "physics_part" + version = "mfr_patch_bf16" + + parser = ArgumentParser() + parser.add_arguments(BatchRECConfig, dest="config") + args = parser.parse_args() + args = args.config + all_file_list = obtain_processed_filelist(args) + + if len(all_file_list)==0: + exit() + + with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + img_size = model_configs['model_args']['img_size'] + conf_thres= model_configs['model_args']['conf_thres'] + iou_thres = model_configs['model_args']['iou_thres'] + device = model_configs['model_args']['device'] + dpi = model_configs['model_args']['pdf_dpi'] + + task_name = "physics_part" + version = "rec_fixed_final" + layout_model = None + mfd_model = None + client = None + ocrmodel = None + page_num_map_whole = None #get_page_num_map_whole() + for inputs_path in tqdm(all_file_list, leave=False, position=1): + filename = os.path.basename(inputs_path) + + if args.replace: + origin_root = os.path.dirname(inputs_path).split('/') + task_name = origin_root[-2] + version = origin_root[-1] + args.result_save_path = os.path.dirname(os.path.dirname(os.path.dirname(inputs_path))) + args.redo = True + args.update_origin = True + result_save_root = os.path.join(args.result_save_path, task_name, version) + if inputs_path.startswith('s3'): + inputs_path = "opendata:"+inputs_path + # assert inputs_path.startswith('opendata:s3') + # assert result_path.startswith('opendata:s3') + if client is None: + client = build_client() + if not check_path_exists(inputs_path,client): + tqdm.write(f"[Skip]: no {inputs_path} ") + continue + + POSSIABLE_RESULT_SAVE_DIR_LIST=[ + os.path.join(args.result_save_path, task_name, version), + os.path.join("opendata:s3://llm-pdf-text/pdf_gpu_output/ebook_index_v4/scihub/v001/scihub/"), + ] + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename) + if check_path_exists(result_old_path,client) and not args.redo: + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + + partion_num = 1 + for partion_idx in range(partion_num): + + if partion_num > 1: + filename_with_partion = f"{filename.replace('.jsonl','')}.{partion_idx:02d}_{partion_num:02d}.jsonl" + else: + filename_with_partion = filename + + skip = False + for result_old_dir in POSSIABLE_RESULT_SAVE_DIR_LIST: + result_old_path = os.path.join(result_old_dir, filename_with_partion) + if not args.redo and check_path_exists(result_old_path,client): + tqdm.write(f"[Skip]: existed {result_old_path} ") + skip = True + break + if skip:continue + + + result_path = os.path.join(result_save_root, filename_with_partion) + if args.check_lock: + lock_path = os.path.join(LOCKSERVER, "checklocktime", filename_with_partion) + last_start_time = check_lock_and_last_start_time(lock_path,client) + if last_start_time and not args.redo: + date_string = last_start_time + date_format = "%Y-%m-%d %H:%M:%S" + date = datetime.strptime(date_string, date_format) + deltatime = datetime.now() - date + if deltatime < timedelta(hours=20): + tqdm.write(f"[Skip]: {filename_with_partion} is locked by {date_string} created at {last_start_time} [now is {deltatime}]") + continue + + create_last_start_time_lock(os.path.join(LOCKSERVER,"createlocktime", filename_with_partion),client) + + print(f"now we deal with {inputs_path} to {result_path}") + os.makedirs(os.path.dirname(result_path), 
exist_ok=True) + + if ocrmodel is None: + ocrmodel = TextRecognizer(rec_args) + if args.compile: + ocrmodel.net.backbone = torch.compile(ocrmodel.net.backbone) + + + try: + deal_with_one_dataset(inputs_path, result_path, ocrmodel, + #batch_size = args.batch_size, + pdf_batch_size=args.pdf_batch_size, image_batch_size=args.image_batch_size, + num_workers = args.num_workers, + partion_num = partion_num, + partion_idx = partion_idx,update_origin=args.update_origin) + print(f""" +========================================= +finish dealing with {result_path} +========================================= + """) + except: + traceback.print_exc() + tqdm.write(f"[Error]: {filename_with_partion} failed") + finally: + pass \ No newline at end of file diff --git a/batch_running_task/task_rec/batch_text_rec.py b/batch_running_task/task_rec/batch_text_rec.py new file mode 100644 index 0000000..8693b04 --- /dev/null +++ b/batch_running_task/task_rec/batch_text_rec.py @@ -0,0 +1,373 @@ +import os +import sys +import copy +import cv2 +import numpy as np +import time +import json +import torch +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from pytorchocr.base_ocr_v20 import BaseOCRV20 +from pytorchocr.utils.utility import get_image_file_list, check_and_read_gif +from pytorchocr.data import create_operators, transform +from pytorchocr.postprocess import build_post_process +from pytorchocr import pytorchocr_utility as utility + +class TextRecognizer(BaseOCRV20): + def __init__(self, args, **kwargs): + self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] + self.character_type = args.rec_char_type + self.rec_batch_num = args.rec_batch_num + self.rec_algorithm = args.rec_algorithm + self.max_text_length = args.max_text_length + postprocess_params = { + 'name': 'CTCLabelDecode', + "character_type": args.rec_char_type, + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + if self.rec_algorithm == "SRN": + postprocess_params = { + 'name': 'SRNLabelDecode', + "character_type": args.rec_char_type, + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "RARE": + postprocess_params = { + 'name': 'AttnLabelDecode', + "character_type": args.rec_char_type, + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'NRTR': + postprocess_params = { + 'name': 'NRTRLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "SAR": + postprocess_params = { + 'name': 'SARLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'ViTSTR': + postprocess_params = { + 'name': 'ViTSTRLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "CAN": + self.inverse = args.rec_image_inverse + postprocess_params = { + 'name': 'CANLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'RFL': + postprocess_params = { + 'name': 'RFLLabelDecode', + "character_dict_path": None, + "use_space_char": args.use_space_char + } + self.postprocess_op = build_post_process(postprocess_params) + + use_gpu = args.use_gpu + self.use_gpu = torch.cuda.is_available() and use_gpu + + self.limited_max_width = 
args.limited_max_width + self.limited_min_width = args.limited_min_width + + self.weights_path = args.rec_model_path + self.yaml_path = args.rec_yaml_path + + char_num = len(getattr(self.postprocess_op, 'character')) + network_config = utility.AnalysisConfig(self.weights_path, self.yaml_path, char_num) + weights = self.read_pytorch_weights(self.weights_path) + + self.out_channels = self.get_out_channels(weights) + if self.rec_algorithm == 'NRTR': + self.out_channels = list(weights.values())[-1].numpy().shape[0] + elif self.rec_algorithm == 'SAR': + self.out_channels = list(weights.values())[-3].numpy().shape[0] + + kwargs['out_channels'] = self.out_channels + super(TextRecognizer, self).__init__(network_config, **kwargs) + + self.load_state_dict(weights) + self.net.eval() + if self.use_gpu: + self.net.cuda() + + def resize_norm_img(self, img, max_wh_ratio): + imgC, imgH, imgW = self.rec_image_shape + if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # return padding_im + image_pil = Image.fromarray(np.uint8(img)) + if self.rec_algorithm == 'ViTSTR': + img = image_pil.resize([imgW, imgH], Image.BICUBIC) + else: + img = image_pil.resize([imgW, imgH], Image.ANTIALIAS) + img = np.array(img) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + if self.rec_algorithm == 'ViTSTR': + norm_img = norm_img.astype(np.float32) / 255. + else: + norm_img = norm_img.astype(np.float32) / 128. - 1. + return norm_img + elif self.rec_algorithm == 'RFL': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_CUBIC) + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + assert imgC == img.shape[2] + max_wh_ratio = max(max_wh_ratio, imgW / imgH) + imgW = int((imgH * max_wh_ratio)) + imgW = max(min(imgW, self.limited_max_width), self.limited_min_width) + h, w = img.shape[:2] + ratio = w / float(h) + ratio_imgH = math.ceil(imgH * ratio) + ratio_imgH = max(ratio_imgH, self.limited_min_width) + if ratio_imgH > imgW: + resized_w = imgW + else: + resized_w = int(ratio_imgH) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def resize_norm_img_svtr(self, img, image_shape): + + imgC, imgH, imgW = image_shape + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + + def resize_norm_img_srn(self, img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = 
img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + + def srn_other_inputs(self, image_shape, num_heads, max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape( + (feature_dim, 1)).astype('int64') + gsrm_word_pos = np.array(range(0, max_text_length)).reshape( + (max_text_length, 1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile( + gsrm_slf_attn_bias1, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile( + gsrm_slf_attn_bias2, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + encoder_word_pos = encoder_word_pos[np.newaxis, :] + gsrm_word_pos = gsrm_word_pos[np.newaxis, :] + + return [ + encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + + def process_image_srn(self, img, image_shape, num_heads, max_text_length): + norm_img = self.resize_norm_img_srn(img, image_shape) + norm_img = norm_img[np.newaxis, :] + + [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + self.srn_other_inputs(image_shape, num_heads, max_text_length) + + gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32) + gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32) + encoder_word_pos = encoder_word_pos.astype(np.int64) + gsrm_word_pos = gsrm_word_pos.astype(np.int64) + + return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2) + + def resize_norm_img_sar(self, img, image_shape, + width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape + h = img.shape[0] + w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. 
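+        # Note (added comment): with the default width_downsample_ratio of 0.25 the recognition
+        # backbone downsamples width by 4x, so resize_w below is rounded to a multiple of
+        # width_divisor (= 4); valid_ratio records the fraction of the fixed-width canvas occupied
+        # by real pixels (the remainder is padded with -1 further down).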
+ width_divisor = int(1 / width_downsample_ratio) + # resize + ratio = w / float(h) + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + + def norm_img_can(self, img, image_shape): + + img = cv2.cvtColor( + img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image + + if self.inverse: + img = 255 - img + + if self.rec_image_shape[0] == 1: + h, w = img.shape + _, imgH, imgW = self.rec_image_shape + if h < imgH or w < imgW: + padding_h = max(imgH - h, 0) + padding_w = max(imgW - w, 0) + img_padded = np.pad(img, ((0, padding_h), (0, padding_w)), + 'constant', + constant_values=(255)) + img = img_padded + + img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w + img = img.astype('float32') + + return img + + def preprocessing(self, img_list): + norm_img_batch = [] + max_wh_ratio = 0 + for img_now in img_list: + # h, w = img_list[ino].shape[0:2] + h, w = img_now.shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for img_now in img_list: + norm_img = self.resize_norm_img(img_now,max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + + # norm_img_batch = norm_img_batch.copy() + return norm_img_batch + + def to_tensor(self, img_batch_in_numpy): + inp = torch.from_numpy(img_batch_in_numpy) + if self.use_gpu: + inp = inp.cuda() + return inp + + def __call__(self, img_list): + assert self.rec_algorithm == 'SVTR_LCNet' + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting can speed up the recognition process + indices = np.argsort(np.array(width_list)) + + # rec_res = [] + rec_res = [['', 0.0]] * img_num + batch_num = self.rec_batch_num + elapse = 0 + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + image_batch_now = [img_list[indices[ino]] for ino in range(beg_img_no, end_img_no)] + norm_img_batch = self.preprocessing(image_batch_now) + starttime = time.time() + inp = self.to_tensor(norm_img_batch) + with torch.no_grad(): + prob_out = self.net(inp) + + if isinstance(prob_out, list): + preds = [v.cpu().numpy() for v in prob_out] + else: + preds = prob_out.cpu().numpy() + + rec_result = self.postprocess_op(preds) + for rno in range(len(rec_result)): + rec_res[indices[beg_img_no + rno]] = rec_result[rno] + elapse += time.time() - starttime + return rec_res, elapse + +from argparse import Namespace +rec_args = args=Namespace(use_gpu=True, gpu_mem=500, warmup=False, + image_dir='./doc/imgs_words/en/word_1.png', det_algorithm='DB', det_model_path=None, det_limit_side_len=960, + 
det_limit_type='max', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, + use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, + det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_pse_thresh=0, det_pse_box_thresh=0.85, + det_pse_min_area=16, det_pse_box_type='box', det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, + fourier_degree=5, det_fce_box_type='poly', rec_algorithm='CRNN', rec_model_path='models/pytorch_paddle_weight/en_ptocr_v4_rec_infer.pth', + rec_image_inverse=True, rec_image_shape='3,48,320', rec_char_type='ch', rec_batch_num=6, max_text_length=25, + use_space_char=True, drop_score=0.5, limited_max_width=1280, limited_min_width=16, + vis_font_path=None, + rec_char_dict_path='batch_running_task/pytorchocr/utils/en_dict.txt', use_angle_cls=False, cls_model_path=None, + cls_image_shape='3, 48, 192', label_list=['0', '180'], cls_batch_num=6, cls_thresh=0.9, enable_mkldnn=False, + use_pdserving=False, e2e_algorithm='PGNet', e2e_model_path=None, e2e_limit_side_len=768, e2e_limit_type='max', + e2e_pgnet_score_thresh=0.5, e2e_char_dict_path='batch_running_task/pytorchocr/utils/ic15_dict.txt', + e2e_pgnet_valid_set='totaltext', e2e_pgnet_polygon=True, e2e_pgnet_mode='fast', sr_model_path=None, + sr_image_shape='3, 32, 128', sr_batch_num=1, det_yaml_path=None, rec_yaml_path='./configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml', + cls_yaml_path=None, e2e_yaml_path=None, sr_yaml_path=None, use_mp=False, total_process_num=1, process_id=0, + benchmark=False, save_log_path='./log_output/', show_log=True) \ No newline at end of file diff --git a/batch_running_task/task_rec/rough_rec.py b/batch_running_task/task_rec/rough_rec.py new file mode 100644 index 0000000..d49cb08 --- /dev/null +++ b/batch_running_task/task_rec/rough_rec.py @@ -0,0 +1,543 @@ + +import os,sys,warnings +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ['CUDA_MODULE_LOADING'] = 'LAZY' +warnings.simplefilter(action='ignore', category=FutureWarning) +warnings.simplefilter(action='ignore', category=UserWarning) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from get_data_utils import * +import numpy as np +from tqdm.auto import tqdm +import cv2 +from torch.utils.data import Dataset, TensorDataset, DataLoader +from dataaccelerate import DataPrefetcher +from task_rec.batch_text_rec import TextRecognizer, rec_args +import torch +from scihub_pdf_dataset import RecImageDataset,rec_collate_fn,deal_with_one_pdf,none_collate_fn,clean_pdf_path,Timers +try: + client=build_client() +except: + client=None +eps=1e-7 +import math + +# def rec_preprocessing(text_recognizer, img_list): +# norm_img_batch = [] + +# resize_norm_img_func = partial(resize_norm_img, +# max_wh_ratio=max_wh_ratio, +# rec_image_shape =text_recognizer.rec_image_shape, +# limited_max_width=text_recognizer.limited_max_width, +# limited_min_width=text_recognizer.limited_min_width) +# for img_now in tqdm(img_list, desc="resize and normlized image"): +# norm_img = resize_norm_img_func(img_now) +# norm_img = norm_img[np.newaxis, :] +# norm_img_batch.append(norm_img) +# norm_img_batch = np.concatenate(norm_img_batch) +# # norm_img_batch = norm_img_batch.copy() +# return norm_img_batch + +def resize_norm_img(img, max_wh_ratio=None,rec_image_shape=None,limited_max_width=None,limited_min_width=None): + imgC, imgH, imgW = rec_image_shape + assert imgC == img.shape[2] + max_wh_ratio = max(max_wh_ratio, 
imgW / (imgH+eps)) + imgW = int((imgH * max_wh_ratio)) + imgW = max(min(imgW, limited_max_width), limited_min_width) + h, w = img.shape[:2] + ratio = w / (float(h)+eps) + ratio_imgH = math.ceil(imgH * ratio) + ratio_imgH = max(ratio_imgH, limited_min_width) + if ratio_imgH > imgW: + resized_w = imgW + else: + resized_w = int(ratio_imgH) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + +class UnifiedResizedDataset(Dataset): + def __init__(self, img_list,rec_image_shape,limited_max_width,limited_min_width): + max_wh_ratio = 0 + for img_now in img_list: + # h, w = img_list[ino].shape[0:2] + h, w = img_now.shape[0:2] + wh_ratio = w * 1.0 / (h+eps) + max_wh_ratio = max(max_wh_ratio, wh_ratio) + self.max_wh_ratio = max_wh_ratio + self.image_list = img_list + self.rec_image_shape =rec_image_shape + self.limited_max_width =limited_max_width + self.limited_min_width =limited_min_width + def __len__(self): + return len(self.image_list) + + def __getitem__(self, idx): + return idx, resize_norm_img(self.image_list[idx], self.max_wh_ratio, self.rec_image_shape, self.limited_max_width, self.limited_min_width) + +class UnifiedResizedGroupDataset(Dataset): + def __init__(self, img_list,rec_image_shape,limited_max_width,limited_min_width,max_wh_ratios_list): + + self.image_list = img_list + self.rec_image_shape =rec_image_shape + self.limited_max_width =limited_max_width + self.limited_min_width =limited_min_width + self.max_wh_ratios_list = max_wh_ratios_list + def __len__(self): + return len(self.image_list) + + def __getitem__(self, idx): + return idx, resize_norm_img(self.image_list[idx], self.max_wh_ratios_list[idx], self.rec_image_shape, self.limited_max_width, self.limited_min_width) + + + +def postprocess(self,preds, label=None): + preds_prob,preds_idx = preds.max(axis=2) + text = self.decode(preds_idx.cpu().numpy(), preds_prob.cpu().numpy(), is_remove_duplicate=True) + + if label is None:return text + label = self.decode(label) + return text, label + +def gpu_inference(batch, tex_recognizer): + inp = batch + with torch.no_grad(): + with torch.cuda.amp.autocast(dtype=torch.float16): ### tested, fp16 only influence the result for last end sign like `.` or similar symbol like `0`` and `O` + prob_out = tex_recognizer.net(inp) + rec_result = postprocess(tex_recognizer.postprocess_op,prob_out) + return rec_result + + +def calculate_dimensions(bbox): + x_coords = bbox[::2] + y_coords = bbox[1::2] + width = max(x_coords) - min(x_coords) + height = max(y_coords) - min(y_coords) + return width, height + + +def build_bbox_group(metadatas, dataset): + width_range = 100 + height_range= 100 + grouped_bboxes = {} + location2group = {} + location2boxes = {} + count_how_many_pdf_is_recalculated = {} + count_how_many_page_is_recalculated = {} + for pdf_index, pdf_metadata in enumerate(tqdm(metadatas,desc="building group")): + pdf_path = clean_pdf_path(pdf_metadata['path']) + for pdf_page_metadata in tqdm(pdf_metadata['doc_layout_result'],desc="iter along page", leave=False, position=1): + location_keys = dataset.collect_location_and_dt_box_from_page_metadata(pdf_path, pdf_page_metadata) + for location in location_keys: + pdf_path,page_id,bbox_id,sub_box_id = location + bbox = sub_box_id + width, height = 
calculate_dimensions(bbox) + width_group = int(width / (width_range + eps)) + height_group = int(height / (height_range+ eps)) + group_key = (width_group, height_group) + if group_key not in grouped_bboxes: + grouped_bboxes[group_key] = [] + grouped_bboxes[group_key].append(location) + location2group[location] = group_key + location2boxes[location] = bbox + count_how_many_pdf_is_recalculated[pdf_path] = 1 + count_how_many_page_is_recalculated[(pdf_path,page_id)] = 1 + count_how_many_pdf_is_recalculated = len(count_how_many_pdf_is_recalculated) + count_how_many_page_is_recalculated = len(count_how_many_page_is_recalculated) + count_how_many_box_is_recalculated = len(location2group) + print(f"Processing: pdfs:{count_how_many_pdf_is_recalculated}, pages:{count_how_many_page_is_recalculated}, boxes:{count_how_many_box_is_recalculated}") + return grouped_bboxes, location2group, location2boxes + +from typing import List, Dict +def obtain_data_from_pool_list(pool_list, key): + for pool in pool_list: + if key in pool: + return pool[key] + return None + +def deal_with_one_dataset(pdf_path, result_path, tex_recognizer, + pdf_batch_size =32, + image_batch_size=256, + num_workers=8, + partion_num = 1, + partion_idx = 0,update_origin=False): + images_dataset = RecImageDataset(pdf_path,partion_num = partion_num, partion_idx = partion_idx) + data_to_save = fast_deal_with_one_dataset2(images_dataset,tex_recognizer, + pdf_batch_size =pdf_batch_size, + image_batch_size=image_batch_size,num_workers=num_workers, + update_origin=update_origin) + if data_to_save is not None: + write_jsonl_to_path(data_to_save,result_path,images_dataset.client) + + + +def fast_deal_with_one_dataset(images_dataset:RecImageDataset,tex_recognizer:TextRecognizer, + pdf_batch_size =32, + image_batch_size=256, + num_workers=8, update_origin=False): + + _,location2group,location2boxes = build_bbox_group(images_dataset.metadata,images_dataset) + image_collecter = DataLoader(images_dataset, batch_size=pdf_batch_size,collate_fn=none_collate_fn, + num_workers=num_workers,pin_memory=False, + prefetch_factor=2) + location_to_rec = {} + for image_pool_list in tqdm(image_collecter,position=0,leave=True,desc="Images batch"): + no_image_pdf_list = [] + image_pool = {} + current_group_bboxes = {} + for idx,(pdf_path, image_dict) in enumerate(tqdm(image_pool_list,position=0,leave=False, desc="Partiton current image pool")): + if len(image_dict)==0: + no_image_pdf_list.append(pdf_path) + #print(f"pdf {pdf_path} has no text image") + continue + for key,val in image_dict.items(): + image_pool[key]=val + group = location2group[key] + if group not in current_group_bboxes: + current_group_bboxes[group] = [] + current_group_bboxes[group].append((key,location2boxes[key])) + if len(image_pool) == 0:continue + + + #### next step, lets do normlized the bbox to the same size + + + pbar_whole_images = tqdm(total=len(image_pool),position=1,leave=False,desc=f"Group batch:{len(no_image_pdf_list)} pdfs has no text image and {len(image_pool)} text images") + for group_key, location_and_bbox in current_group_bboxes.items(): + if len(location_and_bbox) == 0:continue + + img_list_group = [image_pool[location] for location, bbox in location_and_bbox] + rec_list_group = [] + dataset = UnifiedResizedDataset(img_list_group, tex_recognizer.rec_image_shape, tex_recognizer.limited_max_width, tex_recognizer.limited_min_width) + if len(dataset)<=image_batch_size: + adapat_num_workers = 0 + elif len(dataset)<=2*image_batch_size: + adapat_num_workers = 1 + else: + 
adapat_num_workers = num_workers + dataloader_group = DataLoader(dataset, batch_size=image_batch_size, num_workers=adapat_num_workers, pin_memory=True, pin_memory_device='cuda') + featcher = DataPrefetcher(dataloader_group,device='cuda') + pbar = tqdm(total=len(dataloader_group),position=2,leave=False,desc="GPU batch") + batch = featcher.next() + indexes=[] + while batch is not None: + index, batch = batch + rec_result = gpu_inference(batch, tex_recognizer) + rec_list_group.extend(rec_result) + indexes.extend([t.item() for t in index]) + pbar.update(1) + batch = featcher.next() + assert len(location_and_bbox) == len(rec_list_group) + + for index, rec_res in zip(indexes, rec_list_group): + (location, bbox) = location_and_bbox[index] + location_to_rec[location] = rec_res + + pbar_whole_images.update(len(img_list_group)) + + location_and_sub_location_map = {} + for abs_location in location_to_rec.keys(): + pdf_path,page_id,bbox_id,sub_box_id = abs_location + location = (pdf_path,page_id,bbox_id) + if location not in location_and_sub_location_map:location_and_sub_location_map[location] = [] + location_and_sub_location_map[location].append(sub_box_id) + + + patch_metadata_list = [] + for pdf_index, pdf_metadata in enumerate(tqdm(images_dataset.metadata)): + pdf_path = clean_pdf_path(pdf_metadata['path']) + + patch_metadata = {'path':pdf_path,'doc_layout_result':[]} + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + + this_line_pool = {'page_id':page_id, 'layout_dets':[]} + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id']!=15:continue + bbox_id = tuple(bbox_metadata['poly']) + location = (pdf_path,page_id,bbox_id) + current_line_box_rec_result = [] + rel_location_list = location_and_sub_location_map[location] + for sub_box_id in rel_location_list: + abs_location = (pdf_path,page_id,bbox_id,sub_box_id) + text, score = location_to_rec[abs_location] + current_line_box_rec_result.append({'poly':sub_box_id, 'text':text, 'score':float(score)}) + if len(current_line_box_rec_result)==0: + continue + if update_origin: + bbox_metadata.update({'sub_boxes':current_line_box_rec_result}) + else: + this_line_pool['layout_dets'].append({'category_id':15, 'sub_boxes':current_line_box_rec_result}) + patch_metadata['doc_layout_result'].append(this_line_pool) + patch_metadata_list.append(patch_metadata) + if update_origin: + return images_dataset.metadata + else: + return patch_metadata_list + + +from torch.utils.data import Sampler + +from torch.utils.data import Sampler + +class GroupBatchSampler(Sampler): + def __init__(self, group_indices, batch_size): + self.group_indices = group_indices + self.batch_size = batch_size + + def __iter__(self): + for indices in self.group_indices: + # Yield full batches within the group + for i in range(0, len(indices), self.batch_size): + yield indices[i:i + self.batch_size] + + def __len__(self): + return sum((len(indices) + self.batch_size - 1) // self.batch_size for indices in self.group_indices) + +def fast_deal_with_one_dataset2(images_dataset:RecImageDataset,tex_recognizer:TextRecognizer, + pdf_batch_size =32, + image_batch_size=256, + num_workers=8,update_origin=False): + + _,location2group,location2boxes = build_bbox_group(images_dataset.metadata,images_dataset) + if len(location2group) == 0:return None + image_collecter = DataLoader(images_dataset, batch_size=pdf_batch_size,collate_fn=none_collate_fn, + num_workers=num_workers,pin_memory=False, + prefetch_factor=2) + 
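+    # Note (added comment): two-level batching scheme used below:
+    #   * image_collecter iterates PDFs (pdf_batch_size at a time) and yields, per PDF, a dict
+    #     mapping (pdf_path, page_id, bbox_id, sub_box_id) -> cropped text-line image;
+    #   * the crops are then flattened, resized per aspect-ratio group (UnifiedResizedGroupDataset)
+    #     and batched by GroupBatchSampler, so every GPU batch shares a single padded width.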
location_to_rec = {} + for image_pool_list in tqdm(image_collecter,position=1,leave=True,desc="Images batch"): + no_image_pdf_list = [] + image_pool = {} + current_group_bboxes = {} + for idx,(pdf_path, image_dict) in enumerate(tqdm(image_pool_list,position=2,leave=False, desc="Partiton current image pool")): + if len(image_dict)==0: + no_image_pdf_list.append(pdf_path) + #print(f"pdf {pdf_path} has no text image") + continue + for key,val in image_dict.items(): + image_pool[key]=val + group = location2group[key] + if group not in current_group_bboxes: + current_group_bboxes[group] = [] + current_group_bboxes[group].append((key,location2boxes[key])) + if len(image_pool) == 0:continue + + + #### next step, lets do normlized the bbox to the same size + + all_images = [] + all_max_wh_ratios = [] + group_indices = [] + location_bbox_map = [] + current_index = 0 + for group_key, location_and_bbox in current_group_bboxes.items(): + if len(location_and_bbox) == 0: + continue + + img_list_group = [image_pool[location] for location, bbox in location_and_bbox] + max_wh_ratio = max((w / (h + 1e-5) for img in img_list_group for h, w in [img.shape[:2]]), default=0) + + all_images.extend(img_list_group) + all_max_wh_ratios.extend([max_wh_ratio] * len(img_list_group)) + location_bbox_map.extend(location_and_bbox) + + group_indices.append(list(range(current_index, current_index + len(img_list_group)))) + current_index += len(img_list_group) + + dataset = UnifiedResizedGroupDataset(all_images, tex_recognizer.rec_image_shape, tex_recognizer.limited_max_width, tex_recognizer.limited_min_width, all_max_wh_ratios) + batch_sampler = GroupBatchSampler(group_indices,image_batch_size) + dataloader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=num_workers, pin_memory=True, pin_memory_device='cuda') + + + + featcher = DataPrefetcher(dataloader ,device='cuda') + pbar = tqdm(total=len(dataset), position=2, leave=False, desc="GPU batch") + batch = featcher.next() + indexes=[] + rec_list = [] + while batch is not None: + index, batch = batch + #tqdm.write(f"This Batch shape is {batch.shape}") + rec_result = gpu_inference(batch, tex_recognizer) + rec_list.extend(rec_result) + indexes.extend([t.item() for t in index]) + pbar.update(len(batch)) + batch = featcher.next() + + assert len(rec_list) == len(location_bbox_map) + for index, rec_res in zip(indexes, rec_list): + (location, bbox) = location_bbox_map[index] + location_to_rec[location] = rec_res + + + location_and_sub_location_map = {} + for abs_location in location_to_rec.keys(): + pdf_path,page_id,bbox_id,sub_box_id = abs_location + location = (pdf_path,page_id,bbox_id) + if location not in location_and_sub_location_map:location_and_sub_location_map[location] = [] + location_and_sub_location_map[location].append(sub_box_id) + + + patch_metadata_list = [] + for pdf_index, pdf_metadata in enumerate(tqdm(images_dataset.metadata)): + pdf_path = clean_pdf_path(pdf_metadata['path']) + + patch_metadata = {'path':pdf_path,'doc_layout_result':[]} + for pdf_page_metadata in pdf_metadata['doc_layout_result']: + page_id = pdf_page_metadata['page_id'] + + this_line_pool = {'page_id':page_id, 'layout_dets':[]} + for bbox_metadata in pdf_page_metadata['layout_dets']: + if bbox_metadata['category_id']!=15:continue + + bbox_id = tuple(bbox_metadata['poly']) + location = (pdf_path,page_id,bbox_id) + if location not in location_and_sub_location_map: + assert update_origin, "you must update the origin metadata if you choose skip some bbox" + continue + 
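+                # Note (added comment): re-attach recognition results, one entry per sub-line box of
+                # this text-line (category_id 15) detection; with update_origin the results are written
+                # back into the loaded metadata in place, otherwise a separate patch record is emitted.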
current_line_box_rec_result = [] + rel_location_list = location_and_sub_location_map[location] + for sub_box_id in rel_location_list: + abs_location = (pdf_path,page_id,bbox_id,sub_box_id) + text, score = location_to_rec[abs_location] + + sub_box_id = tuple([int(t) for t in sub_box_id]) + + current_line_box_rec_result.append({'poly':sub_box_id, 'text':text, 'score':float(score)}) + if len(current_line_box_rec_result)==0: + continue + if update_origin: + bbox_metadata.update({'sub_boxes':current_line_box_rec_result}) + else: + this_line_pool['layout_dets'].append({'category_id':15, 'sub_boxes':current_line_box_rec_result}) + patch_metadata['doc_layout_result'].append(this_line_pool) + patch_metadata_list.append(patch_metadata) + if update_origin: + return images_dataset.metadata + else: + return patch_metadata_list + + +if __name__ == "__main__": + + ocr_mode = 'batch' + batch_size = 128 + num_workers= 8 + metadata_filepath = "0000000-0000209.01000_00001.jsonl" + images_dataset = RecImageDataset(metadata_filepath) + # _,location2group,location2boxes = build_bbox_group(images_dataset.metadata,images_dataset) + # image_collecter = DataLoader(images_dataset, batch_size=2,collate_fn=none_collate_fn, + # num_workers=num_workers,pin_memory=False, + # prefetch_factor=2) + + # for image_pool_list in tqdm(image_collecter,position=1,leave=True,desc="Images batch"): + # no_image_pdf_list = [] + # image_pool = {} + # current_group_bboxes = {} + # for idx,(pdf_path, image_dict) in enumerate(tqdm(image_pool_list,position=2,leave=False, desc="Partiton current image pool")): + # if len(image_dict)==0: + # no_image_pdf_list.append(pdf_path) + # #print(f"pdf {pdf_path} has no text image") + # continue + # for key,val in image_dict.items(): + # image_pool[key]=val + # group = location2group[key] + # if group not in current_group_bboxes: + # current_group_bboxes[group] = [] + # current_group_bboxes[group].append((key,location2boxes[key])) + # if len(image_pool) == 0:continue + # print(len(image_pool)) + # raise + if ocr_mode == 'batch': + tex_recognizer = TextRecognizer(rec_args) + #tex_recognizer.net.backbone = torch.compile(tex_recognizer.net.backbone) + patch_metadata_list = fast_deal_with_one_dataset2(images_dataset,tex_recognizer,pdf_batch_size=32, + image_batch_size=128 , + num_workers=num_workers, + update_origin=True) + #print(patch_metadata_list) + write_jsonl_to_path(patch_metadata_list, "test_result/result.test3.jsonl", None) + # patch_metadata_list = fast_deal_with_one_dataset(images_dataset,tex_recognizer,pdf_batch_size=32, image_batch_size=128 ,num_workers=num_workers) + # write_jsonj_to_path(patch_metadata_list, "test_result/result.test1.jsonl", None) + else: + from modules.self_modify import ModifiedPaddleOCR + + dataset = RecImageDataset(metadata_filepath) + image_collecter = DataLoader(dataset, batch_size=8,collate_fn=rec_collate_fn, + num_workers=num_workers,pin_memory=False, pin_memory_device='cuda', + prefetch_factor=2 if num_workers>0 else None) + + ocr_model = ModifiedPaddleOCR(show_log=True) + tex_recognizer=ocr_model.text_recognizer + # tex_recognizer = TextRecognizer(rec_args) + tex_recognizer.rec_batch_num = batch_size + for location_abs_list, image_list in tqdm(image_collecter,position=0,leave=False,desc="Do Rec"): + if len(image_list) == 0:continue + tqdm.write(f"Now deal with B={len(image_list)}") + rec_result = tex_recognizer(image_list) + + + + + + # #### next step, lets do normlized the bbox to the same size + + # location_to_rec = {} + # pbar_whole_images = 
tqdm(total=len(image_pool),position=1,leave=False) + # for group_key, location_and_bbox in grouped_bboxes.items(): + # if len(location_and_bbox) == 0:continue + + # img_list_group = [image_pool[location] for location, bbox in location_and_bbox] + # rec_list_group = [] + # dataset = UnifiedResizedDataset(img_list_group, tex_recognizer.rec_image_shape, tex_recognizer.limited_max_width, tex_recognizer.limited_min_width) + # dataloader_group = DataLoader(dataset, batch_size=batch_size, num_workers=8, pin_memory=True, pin_memory_device='cuda') + # featcher = DataPrefetcher(dataloader_group,device='cuda') + # pbar = tqdm(total=len(dataloader_group),position=2,leave=False) + # batch = featcher.next() + # while batch is not None: + # rec_result = gpu_inference(batch, tex_recognizer) + # rec_list_group.extend(rec_result) + # pbar.update(1) + # batch = featcher.next() + # assert len(location_and_bbox) == len(rec_list_group) + # for (location, bbox), rec_res in zip(location_and_bbox, rec_list_group): + # location_to_rec[location] = rec_res + # pbar_whole_images.update(len(img_list_group)) + + # patch_metadata_list = [] + # for pdf_index, pdf_metadata in enumerate(tqdm(metadatas)): + # pdf_path = pdf_metadata['path'] + + # patch_metadata = {'path':pdf_path,'doc_layout_result':[]} + # for pdf_page_metadata in pdf_metadata['doc_layout_result']: + # page_id = pdf_page_metadata['page_id'] + # bbox_id = 0 + # this_line_pool = {'page_id':page_id, 'layout_dets':[]} + # for bbox_metadata in pdf_page_metadata['layout_dets']: + # if bbox_metadata['category_id']!=15:continue + + # location= (pdf_path,page_id,bbox_id) + # bbox_id+=1 + # text, score = location_to_rec[location] + # this_line_pool['layout_dets'].append({'category_id':15, 'text':text, 'score':score}) + # patch_metadata['doc_layout_result'].append(this_line_pool) + # patch_metadata_list.append(patch_metadata) + + # write_json_to_path(patch_metadata_list, metadata_filepath.replace('.jsonl','.patch.rec_result.jsonl'), client) + + # deal_with_one_dataset("debug.jsonl", + # "debug.stage_1.jsonl", + # layout_model, mfd_model, ocrmodel=ocrmodel, + # inner_batch_size=2, batch_size=4,num_workers=4, + # do_text_det = True, + # do_text_rec = True, + # timer=timer) + # dataset = PDFImageDataset("part-66210c190659-000035.jsonl",layout_model.predictor.aug,layout_model.predictor.input_format,mfd_pre_transform=None) + # dataloader = DataLoader(dataset, batch_size=8,collate_fn=custom_collate_fn) + + + \ No newline at end of file diff --git a/batch_running_task/task_rec/run_rec.sh b/batch_running_task/task_rec/run_rec.sh new file mode 100644 index 0000000..b0342cb --- /dev/null +++ b/batch_running_task/task_rec/run_rec.sh @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH -J ParseSciHUB +#SBATCH -o .log/%j-ParseSciHUB.out +#SBATCH -e .log/%j-ParseSciHUB.out + + + + +# Check if the version matches + +if [[ $(hostname) == SH* ]]; then + IMAGE_BATCH_SIZE=256 + PDF_BATCH_SIZE=32 + export LD_LIBRARY_PATH=/mnt/cache/share/gcc/gcc-7.5.0/lib64:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + export PATH=/mnt/cache/share/gcc/gcc-7.5.0/bin:$PATH + GCC_VERSION=$(gcc -v 2>&1 | grep "gcc version" | awk '{print $3}') + # Required version + REQUIRED_VERSION="7.5.0" + if [ "$GCC_VERSION" != "$REQUIRED_VERSION" ]; then + echo "[`hostname`] GCC version is not $REQUIRED_VERSION. Exiting." + exit 1 + else + echo "[`hostname`] GCC version is $REQUIRED_VERSION." 
+    fi
+else
+    IMAGE_BATCH_SIZE=128
+    PDF_BATCH_SIZE=16
+
+fi
+
+
+python batch_running_task/task_rec/batch_deal_with_rec.py --image_batch_size $IMAGE_BATCH_SIZE --pdf_batch_size $PDF_BATCH_SIZE --root_path $1 --index_part $2 --num_parts $3 --num_workers 8 --update_origin --replace --shuffle #--compile
diff --git a/batch_running_task/task_schedule.sh b/batch_running_task/task_schedule.sh
new file mode 100644
index 0000000..9ab1b76
--- /dev/null
+++ b/batch_running_task/task_schedule.sh
@@ -0,0 +1,71 @@
+
+#!/bin/bash
+TASKLIMIT=30
+PENDINGLIMIT=2
+user=`whoami`
+if [[ $(hostname) == SH* ]]; then
+    partition='AI4Chem'
+else
+    partition='vip_gpu_ailab_low'
+fi
+# jobscript="batch_running_task/task_layout/run_layout_for_missing_page.sh"
+# filelist='scihub_collection/analysis/not_complete_pdf_page_id.pairlist.filelist'
+jobname='ParseSciHUB'
+# Function to get the count of pending tasks
+get_pending_count() {
+    squeue -u $user -p $partition -n $jobname | grep PD | wc -l
+}
+get_pending_jobids() {
+    squeue -u $user -p $partition -n $jobname | grep PD | awk '{print $1}'
+}
+
+# Function to get the count of running tasks
+get_running_count() {
+    squeue -u $user -p $partition -n $jobname | grep R | wc -l
+}
+
+
+# Function to submit a task
+submit_task() {
+    current_time=$(date +"%Y.%m.%d %H:%M")
+    bash batch_running_task/batch_run.sh 1
+    #sbatch --quotatype=spot -p $partition -N1 -c8 --gres=gpu:1 $jobscript $filelist 0 1
+}
+
+# Function to cancel extra pending tasks
+cancel_extra_pending_tasks() {
+    pending_jobids=($(get_pending_jobids))
+    for (( i=$PENDINGLIMIT; i<${#pending_jobids[@]}; i++ )); do
+        echo "Cancelling extra pending task: ${pending_jobids[$i]}"
+        scancel "${pending_jobids[$i]}"
+    done
+}
+
+# Main loop: check the queue and submit tasks every 30 seconds
+while true; do
+    pending_count=$(get_pending_count)
+    running_count=$(get_running_count)
+
+    # Cancel extra pending tasks if the pending count exceeds PENDINGLIMIT
+    if [ "$pending_count" -gt $PENDINGLIMIT ]; then
+        cancel_extra_pending_tasks
+        sleep 30
+    fi
+
+    pending_count=$(get_pending_count)
+    running_count=$(get_running_count)
+
+
+    # Submit a task only when running tasks < TASKLIMIT and pending tasks < 3
+    if [ "$running_count" -lt $TASKLIMIT ] && [ "$pending_count" -lt 3 ]; then
+        echo "Pending tasks: $pending_count Running tasks: $running_count/$TASKLIMIT Submitting a new task..."
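+        # submit_task (defined above) calls batch_running_task/batch_run.sh, which is expected to
+        # sbatch one more ParseSciHUB job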
+ submit_task + else + echo "Pending tasks: $pending_count Running tasks: $running_count/$TASKLIMIT" + fi + + + sleep 30 +done \ No newline at end of file diff --git a/batch_running_task/utils.py b/batch_running_task/utils.py new file mode 100644 index 0000000..9f18bd9 --- /dev/null +++ b/batch_running_task/utils.py @@ -0,0 +1,324 @@ +import numpy as np +import copy +import torch +import cv2 +import torch +def sorted_boxes(dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + for j in range(i, -1, -1): + if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ + (_boxes[j + 1][0][0] < _boxes[j][0][0]): + tmp = _boxes[j] + _boxes[j] = _boxes[j + 1] + _boxes[j + 1] = tmp + else: + break + return _boxes + +def formula_in_text(mf_bbox, text_bbox): + x1, y1, x2, y2 = mf_bbox + x3, y3 = text_bbox[0] + x4, y4 = text_bbox[2] + left_box, right_box = None, None + same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2 + if not same_line: + return False, left_box, right_box + else: + drop_origin = False + left_x = x1 - 1 + right_x = x2 + 1 + if x3 < x1 and x2 < x4: + drop_origin = True + left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32') + right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32') + if x3 < x1 and x1 <= x4 <= x2: + drop_origin = True + left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32') + if x1 <= x3 <= x2 and x2 < x4: + drop_origin = True + right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32') + if x1 <= x3 < x4 <= x2: + drop_origin = True + return drop_origin, left_box, right_box + + +def update_det_boxes(dt_boxes, mfdetrec_res): + new_dt_boxes = dt_boxes + for mf_box in mfdetrec_res: + flag, left_box, right_box = False, None, None + for idx, text_box in enumerate(new_dt_boxes): + ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box) + if ret: + new_dt_boxes.pop(idx) + if left_box is not None: + new_dt_boxes.append(left_box) + if right_box is not None: + new_dt_boxes.append(right_box) + break + + return new_dt_boxes + +def get_gpu_memory(): + if torch.cuda.is_available(): + device = torch.device('cuda') + total_memory = torch.cuda.get_device_properties(device).total_memory / 1024**3 # Convert bytes to MB + return int(total_memory) + return 0 + + +def get_rotate_crop_image(img, points, padding=10): + """ + Extracts a rotated and cropped image patch defined by the quadrilateral `points` + with an additional padding. + + Args: + img (numpy.ndarray): The input image. + points (numpy.ndarray): A (4, 2) array containing the coordinates of the quadrilateral. + padding (int): The number of pixels to expand the bounding box on each side. + + Returns: + numpy.ndarray: The cropped and rotated image patch. 
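+
+    Example:
+        Illustrative only; the page image and box below are made-up values, not
+        taken from this repo.
+
+        >>> page = np.zeros((64, 200, 3), dtype=np.uint8)
+        >>> box = np.array([[10, 10], [150, 12], [150, 40], [10, 38]], dtype=np.float32)
+        >>> patch = get_rotate_crop_image(page, box, padding=10)  # ~28x140 crop, not rotated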
+ """ + assert len(points) == 4, "shape of points must be 4*2" + + # Calculate the bounding box with padding + img_height, img_width = img.shape[0:2] + left = max(0, int(np.min(points[:, 0])) - padding) + right = min(img_width, int(np.max(points[:, 0])) + padding) + top = max(0, int(np.min(points[:, 1])) - padding) + bottom = min(img_height, int(np.max(points[:, 1])) + padding) + + # Crop the image with padding + img_crop = img[top:bottom, left:right, :].copy() + + # Adjust points to the new cropped region + points[:, 0] -= left + points[:, 1] -= top + + # Calculate the width and height of the rotated crop + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]) + ) + ) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]) + ) + ) + + # Define the destination points for perspective transformation + pts_std = np.float32( + [ + [0, 0], + [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height], + ] + ) + + # Perform the perspective transformation + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img_crop, + M, + (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC, + ) + + # Rotate the image if the height/width ratio is >= 1.5 + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + + return dst_img + +def collect_text_image_and_its_coordinate(single_page_mfdetrec_res_this_batch, partition_per_batch, oimages, dt_boxes_list): + text_image_batch = [] + text_image_position=[] + text_line_bbox = [] + for partition_id, single_page_mfdetrec_res in enumerate(single_page_mfdetrec_res_this_batch): + partition_start = partition_per_batch[partition_id] + partition_end = partition_per_batch[partition_id+1] + #print(partition_start,partition_end) + dt_boxes_per_page = dt_boxes_list[partition_start:partition_end] + for text_box_id, dt_boxes in enumerate(dt_boxes_per_page): + ori_im = oimages[partition_id] + height, width, _ = ori_im.shape + dt_boxes = sorted_boxes(dt_boxes) + dt_boxes = update_det_boxes(dt_boxes, single_page_mfdetrec_res) + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + text_line_bbox.append(tmp_box) + img_crop = get_rotate_crop_image(ori_im, copy.deepcopy(tmp_box), padding=10) + text_image_batch.append(img_crop) + text_image_position.append((partition_id,text_box_id,bno)) + + return text_image_batch, text_image_position,text_line_bbox + + +def collect_mfdetrec_res_per_page(single_page_res): + single_page_mfdetrec_res = [] + for res in single_page_res: + if int(res['category_id']) in [13, 14]: + xmin, ymin = int(res['poly'][0]), int(res['poly'][1]) + xmax, ymax = int(res['poly'][4]), int(res['poly'][5]) + single_page_mfdetrec_res.append({"bbox": [xmin, ymin, xmax, ymax]}) + return single_page_mfdetrec_res + +def collect_image_tensor_cropped(oimage:np.ndarray, single_page_res, scale=1): + image_np = oimage + canvas_list = [] + canvas_idxes= [] + for bbox_id, res in enumerate(single_page_res): + if int(res['category_id']) in [0, 1, 2, 4, 6, 7]: #需要进行ocr的类别 + xmin, ymin = int(res['poly'][0]/scale), int(res['poly'][1]/scale) + xmax, ymax = int(res['poly'][4]/scale), int(res['poly'][5]/scale) + if isinstance(image_np, np.ndarray): + canvas = image_np.ones_like(image_np) * 255 + else: + canvas = torch.ones_like(image_np) * image_np[0,0,0] + if canvas.shape[0]==3: + 
canvas[:,ymin:ymax, xmin:xmax] = image_np[:,ymin:ymax, xmin:xmax] + elif canvas.shape[2]==3: + canvas[ymin:ymax, xmin:xmax,:] = image_np[ymin:ymax, xmin:xmax,:] + else: + raise ValueError("image shape is not 3 or 4") + canvas_list.append(canvas) + canvas_idxes.append(bbox_id) + return canvas_list, canvas_idxes + +def collect_paragraph_image_and_its_coordinate(oimages, rough_layout_this_batch,scale=1): + canvas_tensor_this_batch = [] + canvas_idxes_this_batch = [] + single_page_mfdetrec_res_this_batch = [] + partition_per_batch = [0] + for oimage, single_page_res in zip(oimages, rough_layout_this_batch): + single_page_mfdetrec_res = collect_mfdetrec_res_per_page(single_page_res) + canvas_tensor, canvas_idxes = collect_image_tensor_cropped(oimage, single_page_res,scale=scale) + canvas_tensor_this_batch.extend(canvas_tensor) + canvas_idxes_this_batch.append(canvas_idxes) + single_page_mfdetrec_res_this_batch.append(single_page_mfdetrec_res) + partition_per_batch.append(len(canvas_tensor_this_batch)) + return canvas_tensor_this_batch, partition_per_batch,canvas_idxes_this_batch,single_page_mfdetrec_res_this_batch + +def collect_paragraph_image_and_its_coordinate_from_detection_batch(detection_images, rough_layout_this_batch): + canvas_tensor_this_batch = [] + canvas_idxes_this_batch = [] + single_page_mfdetrec_res_this_batch = [] + partition_per_batch = [0] + for oimage, single_page_res in zip(detection_images, rough_layout_this_batch): + single_page_mfdetrec_res = collect_mfdetrec_res_per_page(single_page_res) + canvas_tensor, canvas_idxes = collect_image_tensor_cropped(oimage, single_page_res) + canvas_tensor_this_batch.extend(canvas_tensor) + canvas_idxes_this_batch.append(canvas_idxes) + single_page_mfdetrec_res_this_batch.append(single_page_mfdetrec_res) + partition_per_batch.append(len(canvas_tensor_this_batch)) + return canvas_tensor_this_batch, partition_per_batch,canvas_idxes_this_batch,single_page_mfdetrec_res_this_batch + +def convert_boxes(boxes, original_width, original_height, target_width, target_height): + """Convert bounding boxes to a new resolution.""" + width_ratio = target_width / original_width + height_ratio = target_height / original_height + + converted_boxes = [] + for box in boxes: + x_min, y_min, x_max, y_max = box + new_x_min = x_min * width_ratio + new_y_min = y_min * height_ratio + new_x_max = x_max * width_ratio + new_y_max = y_max * height_ratio + converted_boxes.append((new_x_min, new_y_min, new_x_max, new_y_max)) + + return converted_boxes + +import time, math +class _DummyTimer: + """A dummy timer that does nothing.""" + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + +class _Timer: + """Timer.""" + + def __init__(self, name): + self.name = name + self.count = 0 + self.mean = 0.0 + self.sum_squares = 0.0 + self.start_time = None + + def __enter__(self): + self.start_time = time.time() + + def __exit__(self, exc_type, exc_value, exc_traceback): + elapsed_time = time.time() - self.start_time + self.update(elapsed_time) + self.start_time = None + + def update(self, elapsed_time): + self.count += 1 + delta = elapsed_time - self.mean + self.mean += delta / self.count + delta2 = elapsed_time - self.mean + self.sum_squares += delta * delta2 + + def mean_elapsed(self): + return self.mean + + def std_elapsed(self): + if self.count > 1: + variance = self.sum_squares / (self.count - 1) + return math.sqrt(variance) + else: + return 0.0 + +class Timers: + """Group of timers.""" + + def __init__(self, activate=False,warmup=0): + self.timers = 
{} + self.activate = activate + self.warmup = warmup + self.count_per_name={} + def __call__(self, name): + if not self.activate:return _DummyTimer() + if name not in self.timers: + self.timers[name] = _Timer(name) + self.count_per_name[name] = -1 + self.count_per_name[name]+=1 + if self.count_per_name[name] < self.warmup: + return _DummyTimer() + + return self.timers[name] + + def log(self, names=None, normalizer=1.0): + if not self.activate:return + """Log a group of timers.""" + assert normalizer > 0.0 + if names is None: + names = self.timers.keys() + print("Timer Results:") + for name in names: + mean_elapsed = self.timers[name].mean_elapsed() * 1000.0 / normalizer + std_elapsed = self.timers[name].std_elapsed() * 1000.0 / normalizer + space_num = " "*name.count('/') + print(f"{space_num}{name}: {mean_elapsed:.2f}±{std_elapsed:.2f} ms") diff --git a/configs/cls/cls_mv3.yml b/configs/cls/cls_mv3.yml new file mode 100644 index 0000000..b165bc4 --- /dev/null +++ b/configs/cls/cls_mv3.yml @@ -0,0 +1,96 @@ +Global: + use_gpu: true + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/cls/mv3/ + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 1000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + label_list: ['0','180'] + +Architecture: + model_type: cls + algorithm: CLS + Transform: + Backbone: + name: MobileNetV3 + scale: 0.35 + model_name: small + Neck: + Head: + name: ClsHead + class_dim: 2 + +Loss: + name: ClsLoss + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: ClsPostProcess + +Metric: + name: ClsMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/cls + label_file_list: + - ./train_data/cls/train.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - ClsLabelEncode: # Class handling label + - RecAug: + use_tia: False + - RandAugment: + - ClsResizeImg: + image_shape: [3, 48, 192] + - KeepKeys: + keep_keys: ['image', 'label'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 512 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/cls + label_file_list: + - ./train_data/cls/test.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - ClsLabelEncode: # Class handling label + - ClsResizeImg: + image_shape: [3, 48, 192] + - KeepKeys: + keep_keys: ['image', 'label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 512 + num_workers: 4 \ No newline at end of file diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det.yml new file mode 100644 index 0000000..0e8af77 --- /dev/null +++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det.yml @@ -0,0 +1,163 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/ch_PP-OCR_V3_det/ + save_epoch_step: 100 + eval_batch_step: + - 0 + - 400 + cal_metric_during_train: false + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + 
infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: L2 + factor: 5.0e-05 +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 +Metric: + name: DetMetric + main_indicator: hmean +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 960 + - 960 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 8 + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: null + - NormalizeImage: + scale: 1./255. 
+ mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 diff --git a/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml new file mode 100644 index 0000000..fe582ba --- /dev/null +++ b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml @@ -0,0 +1,235 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/ch_PP-OCRv4 + save_epoch_step: 50 + eval_batch_step: + - 0 + - 1000 + cal_metric_during_train: true + checkpoints: null + pretrained_model: null + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true +Architecture: + name: DistillationModel + algorithm: Distillation + model_type: det + Models: + Student: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetNew + scale: 0.75 + pretrained: false + Neck: + name: RSEFPN + out_channels: 96 + shortcut: true + Head: + name: DBHead + k: 50 + Student2: + pretrained: null + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetNew + scale: 0.75 + pretrained: true + Neck: + name: RSEFPN + out_channels: 96 + shortcut: true + Head: + name: DBHead + k: 50 + Teacher: + pretrained: https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_cml_teacher_pretrained/teacher.pdparams + freeze_params: true + return_all_feats: false + model_type: det + algorithm: DB + Backbone: + name: ResNet_vd + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: + - 7 + - 2 + - 2 + k: 50 +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDilaDBLoss: + weight: 1.0 + model_name_pairs: + - - Student + - Teacher + - - Student2 + - Teacher + key: maps + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + - DistillationDMLLoss: + model_name_pairs: + - Student + - Student2 + maps_name: thrink_maps + weight: 1.0 + key: maps + - DistillationDBLoss: + weight: 1.0 + model_name_list: + - Student + - Student2 + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: L2 + factor: 5.0e-05 +PostProcess: + name: DistillationDBPostProcess + model_name: + - Student + key: head_out + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: Student +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + total_epoch: 500 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + 
total_epoch: 500 + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 16 + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: null + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml new file mode 100644 index 0000000..39b260c --- /dev/null +++ b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_student.yml @@ -0,0 +1,171 @@ +Global: + debug: false + use_gpu: true + epoch_num: &epoch_num 500 + log_smooth_window: 20 + print_batch_step: 100 + save_model_dir: ./output/ch_PP-OCRv4 + save_epoch_step: 10 + eval_batch_step: + - 0 + - 1500 + cal_metric_during_train: false + checkpoints: + pretrained_model: https://paddleocr.bj.bcebos.com/pretrained/PPLCNetV3_x0_75_ocr_det.pdparams + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true + +Architecture: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.75 + det: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 #(8*8c) + warmup_epoch: 2 + regularizer: + name: L2 + factor: 5.0e-05 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - CopyPaste: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + total_epoch: *epoch_num + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + total_epoch: *epoch_num + - NormalizeImage: + scale: 1./255. 
+ mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 8 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml new file mode 100644 index 0000000..b58af1c --- /dev/null +++ b/configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml @@ -0,0 +1,172 @@ +Global: + debug: false + use_gpu: true + epoch_num: &epoch_num 500 + log_smooth_window: 20 + print_batch_step: 100 + save_model_dir: ./output/ch_PP-OCRv4 + save_epoch_step: 10 + eval_batch_step: + - 0 + - 1500 + cal_metric_during_train: false + checkpoints: + pretrained_model: https://paddleocr.bj.bcebos.com/pretrained/PPHGNet_small_ocr_det.pdparams + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true + +Architecture: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPHGNet_small + det: True + Neck: + name: LKPAN + out_channels: 256 + intracl: true + Head: + name: PFHeadLocal + k: 50 + mode: "large" + + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 #(8*8c) + warmup_epoch: 2 + regularizer: + name: L2 + factor: 1e-6 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - CopyPaste: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + total_epoch: *epoch_num + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + total_epoch: *epoch_num + - NormalizeImage: + scale: 1./255. 
+ mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 8 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml b/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml new file mode 100644 index 0000000..fd88495 --- /dev/null +++ b/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml @@ -0,0 +1,134 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/ch_db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [3000, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml b/configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml new file mode 100644 index 0000000..52d6d4f --- /dev/null +++ b/configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml @@ -0,0 +1,133 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/ch_db_res18/ + save_epoch_step: 1200 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [3000, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet18_vd_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet_vd + layers: 18 + disable_se: True + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/configs/det/det_mv3_db.yml b/configs/det/det_mv3_db.yml new file mode 100644 index 0000000..00a16b5 --- /dev/null +++ b/configs/det/det_mv3_db.yml @@ -0,0 +1,133 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [640, 640] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 16 + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 8 + use_shared_memory: False \ No newline at end of file diff --git a/configs/det/det_mv3_east.yml b/configs/det/det_mv3_east.yml new file mode 100644 index 0000000..05581a7 --- /dev/null +++ b/configs/det/det_mv3_east.yml @@ -0,0 +1,111 @@ +Global: + use_gpu: true + epoch_num: 10000 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/east_mv3/ + save_epoch_step: 1000 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [4000, 5000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + save_res_path: ./output/det_east/predicts_east.txt + +Architecture: + model_type: det + algorithm: EAST + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: EASTFPN + model_name: small + Head: + name: EASTHead + model_name: small + +Loss: + name: EASTLoss + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + # name: Cosine + learning_rate: 0.001 + # warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: EASTPostProcess + score_thresh: 0.8 + cover_thresh: 0.1 + nms_thresh: 0.2 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - EASTProcessTrain: + image_shape: [512, 512] + background_ratio: 0.125 + min_crop_side_ratio: 0.1 + min_text_size: 10 + - KeepKeys: + keep_keys: ['image', 'score_map', 'geo_map', 'training_mask'] # dataloader will return list in this order + loader: + shuffle: True + drop_last: False + batch_size_per_card: 16 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_side_len: 2400 + limit_type: max + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/det/det_mv3_pse.yml b/configs/det/det_mv3_pse.yml new file mode 100644 index 0000000..61ac247 --- /dev/null +++ b/configs/det/det_mv3_pse.yml @@ -0,0 +1,135 @@ +Global: + use_gpu: true + epoch_num: 600 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_mv3_pse/ + save_epoch_step: 600 + # evaluation is run every 63 iterations + eval_batch_step: [ 0,63 ] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: #./output/det_r50_vd_pse_batch8_ColorJitter/best_accuracy + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_pse/predicts_pse.txt + +Architecture: + model_type: det + algorithm: PSE + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: FPN + out_channels: 96 + Head: + name: PSEHead + hidden_dim: 96 + out_channels: 7 + +Loss: + name: PSELoss + alpha: 0.7 + ohem_ratio: 3 + kernel_sample_mask: pred + reduction: none + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Step + learning_rate: 0.001 + step_size: 200 + gamma: 0.1 + regularizer: + name: 'L2' + factor: 0.0005 + +PostProcess: + name: PSEPostProcess + thresh: 0 + box_thresh: 0.85 + min_area: 16 + box_type: box # 'box' or 'poly' + scale: 1 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - ColorJitter: + brightness: 0.12549019607843137 + saturation: 0.5 + - IaaAugment: + augmenter_args: + - { 'type': Resize, 'args': { 'size': [ 0.5, 3 ] } } + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [ -10, 10 ] } } + - MakePseGt: + kernel_num: 7 + min_shrink_ratio: 0.4 + size: 640 + - RandomCropImgMask: + size: [ 640,640 ] + main_key: gt_text + crop_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] + - NormalizeImage: + scale: 1./255. + mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 16 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_side_len: 736 + limit_type: min + - NormalizeImage: + scale: 1./255. 
+ mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'shape', 'polys', 'ignore_tags' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 8 \ No newline at end of file diff --git a/configs/det/det_ppocr_v3.yml b/configs/det/det_ppocr_v3.yml new file mode 100644 index 0000000..0e8af77 --- /dev/null +++ b/configs/det/det_ppocr_v3.yml @@ -0,0 +1,163 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/ch_PP-OCR_V3_det/ + save_epoch_step: 100 + eval_batch_step: + - 0 + - 400 + cal_metric_during_train: false + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt + distributed: true + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: L2 + factor: 5.0e-05 +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 +Metric: + name: DetMetric + main_indicator: hmean +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 960 + - 960 + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 8 + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: null + - NormalizeImage: + scale: 1./255. 
+ mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 diff --git a/configs/det/det_r50_db++_icdar15.yml b/configs/det/det_r50_db++_icdar15.yml new file mode 100644 index 0000000..e0cd601 --- /dev/null +++ b/configs/det/det_r50_db++_icdar15.yml @@ -0,0 +1,163 @@ +Global: + debug: false + use_gpu: true + epoch_num: 1000 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_r50_icdar15/ + save_epoch_step: 200 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: false + pretrained_model: ./pretrain_models/ResNet50_dcn_asf_synthtext_pretrained + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt +Architecture: + model_type: det + algorithm: DB++ + Transform: null + Backbone: + name: ResNet + layers: 50 + dcn_stage: [False, True, True, True] + Neck: + name: DBFPN + out_channels: 256 + use_asf: True + Head: + name: DBHead + k: 50 +Loss: + name: DBLoss + balance_loss: true + main_loss_type: BCELoss + alpha: 5 + beta: 10 + ohem_ratio: 3 +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: DecayLearningRate + learning_rate: 0.007 + epochs: 1000 + factor: 0.9 + end_lr: 0 + weight_decay: 0.0001 +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 +Metric: + name: DetMetric + main_indicator: hmean +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: + - 1.0 + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 10 + keep_ratio: true + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - NormalizeImage: + scale: 1./255. + mean: + - 0.48109378172549 + - 0.45752457890196 + - 0.40787054090196 + std: + - 1.0 + - 1.0 + - 1.0 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 4 + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: + image_shape: + - 1152 + - 2048 + - NormalizeImage: + scale: 1./255. 
+ mean: + - 0.48109378172549 + - 0.45752457890196 + - 0.40787054090196 + std: + - 1.0 + - 1.0 + - 1.0 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/det/det_r50_db++_td_tr.yml b/configs/det/det_r50_db++_td_tr.yml new file mode 100644 index 0000000..65021bb --- /dev/null +++ b/configs/det/det_r50_db++_td_tr.yml @@ -0,0 +1,166 @@ +Global: + debug: false + use_gpu: true + epoch_num: 1000 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_r50_td_tr/ + save_epoch_step: 200 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: false + pretrained_model: ./pretrain_models/ResNet50_dcn_asf_synthtext_pretrained + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./checkpoints/det_db/predicts_db.txt +Architecture: + model_type: det + algorithm: DB++ + Transform: null + Backbone: + name: ResNet + layers: 50 + dcn_stage: [False, True, True, True] + Neck: + name: DBFPN + out_channels: 256 + use_asf: True + Head: + name: DBHead + k: 50 +Loss: + name: DBLoss + balance_loss: true + main_loss_type: BCELoss + alpha: 5 + beta: 10 + ohem_ratio: 3 +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: DecayLearningRate + learning_rate: 0.007 + epochs: 1000 + factor: 0.9 + end_lr: 0 + weight_decay: 0.0001 +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.5 + max_candidates: 1000 + unclip_ratio: 1.5 +Metric: + name: DetMetric + main_indicator: hmean +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/TD_TR/TD500/train_gt_labels.txt + - ./train_data/TD_TR/TR400/gt_labels.txt + ratio_list: + - 1.0 + - 1.0 + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - IaaAugment: + augmenter_args: + - type: Fliplr + args: + p: 0.5 + - type: Affine + args: + rotate: + - -10 + - 10 + - type: Resize + args: + size: + - 0.5 + - 3 + - EastRandomCropData: + size: + - 640 + - 640 + max_tries: 10 + keep_ratio: true + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - NormalizeImage: + scale: 1./255. + mean: + - 0.48109378172549 + - 0.45752457890196 + - 0.40787054090196 + std: + - 1.0 + - 1.0 + - 1.0 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - threshold_map + - threshold_mask + - shrink_map + - shrink_mask + loader: + shuffle: true + drop_last: false + batch_size_per_card: 4 + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/TD_TR/TD500/test_gt_labels.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - DetLabelEncode: null + - DetResizeForTest: + image_shape: + - 736 + - 736 + keep_ratio: True + - NormalizeImage: + scale: 1./255. 
+ mean: + - 0.48109378172549 + - 0.45752457890196 + - 0.40787054090196 + std: + - 1.0 + - 1.0 + - 1.0 + order: hwc + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - shape + - polys + - ignore_tags + loader: + shuffle: false + drop_last: false + batch_size_per_card: 1 + num_workers: 2 +profiler_options: null diff --git a/configs/det/det_r50_vd_db.yml b/configs/det/det_r50_vd_db.yml new file mode 100644 index 0000000..b1c4a04 --- /dev/null +++ b/configs/det/det_r50_vd_db.yml @@ -0,0 +1,130 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_r50_vd/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations + eval_batch_step: [0,2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet_vd + layers: 50 + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.7 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [640, 640] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 16 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 8 \ No newline at end of file diff --git a/configs/det/det_r50_vd_dcn_fce_ctw.yml b/configs/det/det_r50_vd_dcn_fce_ctw.yml new file mode 100644 index 0000000..3a4075b --- /dev/null +++ b/configs/det/det_r50_vd_dcn_fce_ctw.yml @@ -0,0 +1,139 @@ +Global: + use_gpu: true + epoch_num: 1500 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/det_r50_dcn_fce_ctw/ + save_epoch_step: 100 + # evaluation is run every 835 iterations + eval_batch_step: [0, 835] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_fce/predicts_fce.txt + + +Architecture: + model_type: det + algorithm: FCE + Transform: + Backbone: + name: ResNet_vd + layers: 50 + dcn_stage: [False, True, True, True] + out_indices: [1,2,3] + Neck: + name: FCEFPN + out_channels: 256 + has_extra_convs: False + extra_stage: 0 + Head: + name: FCEHead + fourier_degree: 5 +Loss: + name: FCELoss + fourier_degree: 5 + num_sample: 50 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0001 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: FCEPostProcess + scales: [8, 16, 32] + alpha: 1.0 + beta: 1.0 + fourier_degree: 5 + box_type: 'poly' + +Metric: + name: DetFCEMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ctw1500/imgs/ + label_file_list: + - ./train_data/ctw1500/imgs/training.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + ignore_orientation: True + - DetLabelEncode: # Class handling label + - ColorJitter: + brightness: 0.142 + saturation: 0.5 + contrast: 0.5 + - RandomScaling: + - RandomCropFlip: + crop_ratio: 0.5 + - RandomCropPolyInstances: + crop_ratio: 0.8 + min_side_ratio: 0.3 + - RandomRotatePolyInstances: + rotate_ratio: 0.5 + max_angle: 30 + pad_with_fixed_color: False + - SquareResizePad: + target_size: 800 + pad_ratio: 0.6 + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - FCENetTargets: + fourier_degree: 5 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'p3_maps', 'p4_maps', 'p5_maps'] # dataloader will return list in this order + loader: + shuffle: True + drop_last: False + batch_size_per_card: 6 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ctw1500/imgs/ + label_file_list: + - ./train_data/ctw1500/imgs/test.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + ignore_orientation: True + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_type: 'min' + limit_side_len: 736 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - Pad: + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/det/det_r50_vd_east.yml b/configs/det/det_r50_vd_east.yml new file mode 100644 index 0000000..745a266 --- /dev/null +++ b/configs/det/det_r50_vd_east.yml @@ -0,0 +1,110 @@ +Global: + use_gpu: true + epoch_num: 10000 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/east_r50_vd/ + save_epoch_step: 1000 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [4000, 5000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_pretrained/ + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + save_res_path: ./output/det_east/predicts_east.txt + +Architecture: + model_type: det + algorithm: EAST + Transform: + Backbone: + name: ResNet_vd + layers: 50 + Neck: + name: EASTFPN + model_name: large + Head: + name: EASTHead + model_name: large + +Loss: + name: EASTLoss + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + # name: Cosine + learning_rate: 0.001 + # warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: EASTPostProcess + score_thresh: 0.8 + cover_thresh: 0.1 + nms_thresh: 0.2 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - EASTProcessTrain: + image_shape: [512, 512] + background_ratio: 0.125 + min_crop_side_ratio: 0.1 + min_text_size: 10 + - KeepKeys: + keep_keys: ['image', 'score_map', 'geo_map', 'training_mask'] # dataloader will return list in this order + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_side_len: 2400 + limit_type: max + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/det/det_r50_vd_pse.yml b/configs/det/det_r50_vd_pse.yml new file mode 100644 index 0000000..ac00718 --- /dev/null +++ b/configs/det/det_r50_vd_pse.yml @@ -0,0 +1,134 @@ +Global: + use_gpu: true + epoch_num: 600 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_r50_vd_pse/ + save_epoch_step: 600 + # evaluation is run every 125 iterations + eval_batch_step: [ 0,125 ] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained + checkpoints: #./output/det_r50_vd_pse_batch8_ColorJitter/best_accuracy + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_pse/predicts_pse.txt + +Architecture: + model_type: det + algorithm: PSE + Transform: + Backbone: + name: ResNet_vd + layers: 50 + Neck: + name: FPN + out_channels: 256 + Head: + name: PSEHead + hidden_dim: 256 + out_channels: 7 + +Loss: + name: PSELoss + alpha: 0.7 + ohem_ratio: 3 + kernel_sample_mask: pred + reduction: none + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Step + learning_rate: 0.0001 + step_size: 200 + gamma: 0.1 + regularizer: + name: 'L2' + factor: 0.0005 + +PostProcess: + name: PSEPostProcess + thresh: 0 + box_thresh: 0.85 + min_area: 16 + box_type: box # 'box' or 'poly' + scale: 1 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - ColorJitter: + brightness: 0.12549019607843137 + saturation: 0.5 + - IaaAugment: + augmenter_args: + - { 'type': Resize, 'args': { 'size': [ 0.5, 3 ] } } + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [ -10, 10 ] } } + - MakePseGt: + kernel_num: 7 + min_shrink_ratio: 0.4 + size: 640 + - RandomCropImgMask: + size: [ 640,640 ] + main_key: gt_text + crop_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] + - NormalizeImage: + scale: 1./255. + mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_side_len: 736 + limit_type: min + - NormalizeImage: + scale: 1./255. 
+ mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'shape', 'polys', 'ignore_tags' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 8 \ No newline at end of file diff --git a/configs/det/det_r50_vd_sast_icdar15.yml b/configs/det/det_r50_vd_sast_icdar15.yml new file mode 100644 index 0000000..a989bc8 --- /dev/null +++ b/configs/det/det_r50_vd_sast_icdar15.yml @@ -0,0 +1,110 @@ +Global: + use_gpu: true + epoch_num: 5000 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/sast_r50_vd_ic15/ + save_epoch_step: 1000 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [4000, 5000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained/ + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + save_res_path: ./output/sast_r50_vd_ic15/predicts_sast.txt + +Architecture: + model_type: det + algorithm: SAST + Transform: + Backbone: + name: ResNet_SAST + layers: 50 + Neck: + name: SASTFPN + with_cab: True + Head: + name: SASTHead + +Loss: + name: SASTLoss + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + # name: Cosine + learning_rate: 0.001 + # warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: SASTPostProcess + score_thresh: 0.5 + sample_pts_num: 2 + nms_thresh: 0.2 + expand_scale: 1.0 + shrink_ratio_of_width: 0.3 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: [./train_data/icdar2013/train_label_json.txt, ./train_data/icdar2015/train_label_json.txt, ./train_data/icdar17_mlt_latin/train_label_json.txt, ./train_data/coco_text_icdar_4pts/train_label_json.txt] + ratio_list: [0.1, 0.45, 0.3, 0.15] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - SASTProcessTrain: + image_shape: [512, 512] + min_crop_side_ratio: 0.3 + min_crop_size: 24 + min_text_size: 4 + max_text_size: 512 + - KeepKeys: + keep_keys: ['image', 'score_map', 'border_map', 'training_mask', 'tvo_map', 'tco_map'] # dataloader will return list in this order + loader: + shuffle: True + drop_last: False + batch_size_per_card: 4 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + resize_long: 1536 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/det/det_r50_vd_sast_totaltext.yml b/configs/det/det_r50_vd_sast_totaltext.yml new file mode 100644 index 0000000..e040c42 --- /dev/null +++ b/configs/det/det_r50_vd_sast_totaltext.yml @@ -0,0 +1,110 @@ +Global: + use_gpu: true + epoch_num: 5000 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/sast_r50_vd_tt/ + save_epoch_step: 1000 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [4000, 5000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + load_static_weights: True + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained/ + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + save_res_path: ./output/sast_r50_vd_tt/predicts_sast.txt + +Architecture: + model_type: det + algorithm: SAST + Transform: + Backbone: + name: ResNet_SAST + layers: 50 + Neck: + name: SASTFPN + with_cab: True + Head: + name: SASTHead + +Loss: + name: SASTLoss + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + # name: Cosine + learning_rate: 0.001 + # warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: SASTPostProcess + score_thresh: 0.5 + sample_pts_num: 6 + nms_thresh: 0.2 + expand_scale: 1.2 + shrink_ratio_of_width: 0.2 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: [./train_data/art_latin_icdar_14pt/train_no_tt_test/train_label_json.txt, ./train_data/total_text_icdar_14pt/train_label_json.txt] + ratio_list: [0.5, 0.5] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - SASTProcessTrain: + image_shape: [512, 512] + min_crop_side_ratio: 0.3 + min_crop_size: 24 + min_text_size: 4 + max_text_size: 512 + - KeepKeys: + keep_keys: ['image', 'score_map', 'border_map', 'training_mask', 'tvo_map', 'tco_map'] # dataloader will return list in this order + loader: + shuffle: True + drop_last: False + batch_size_per_card: 4 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/total_text_icdar_14pt/test_label_json.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + resize_long: 768 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/e2e/e2e_r50_vd_pg.yml b/configs/e2e/e2e_r50_vd_pg.yml new file mode 100644 index 0000000..0a232f7 --- /dev/null +++ b/configs/e2e/e2e_r50_vd_pg.yml @@ -0,0 +1,114 @@ +Global: + use_gpu: True + epoch_num: 600 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/pgnet_r50_vd_totaltext/ + save_epoch_step: 10 + # evaluation is run every 0 iterationss after the 1000th iteration + eval_batch_step: [ 0, 1000 ] + # 1. 
If pretrained_model is saved in static mode, such as classification pretrained model + # from static branch, load_static_weights must be set as True. + # 2. If you want to finetune the pretrained models we provide in the docs, + # you should set load_static_weights as False. + load_static_weights: False + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + valid_set: totaltext # two mode: totaltext valid curved words, partvgg valid non-curved words + save_res_path: ./output/pgnet_r50_vd_totaltext/predicts_pgnet.txt + character_dict_path: ppocr/utils/ic15_dict.txt + character_type: EN + max_text_length: 50 # the max length in seq + max_text_nums: 30 # the max seq nums in a pic + tcl_len: 64 + +Architecture: + model_type: e2e + algorithm: PGNet + Transform: + Backbone: + name: ResNet + layers: 50 + Neck: + name: PGFPN + Head: + name: PGHead + +Loss: + name: PGLoss + tcl_bs: 64 + max_text_length: 50 # the same as Global: max_text_length + max_text_nums: 30 # the same as Global:max_text_nums + pad_num: 36 # the length of dict for pad + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 + + +PostProcess: + name: PGPostProcess + score_thresh: 0.5 +Metric: + name: E2EMetric + character_dict_path: ppocr/utils/ic15_dict.txt + main_indicator: f_score_e2e + +Train: + dataset: + name: PGDataSet + label_file_list: [.././train_data/total_text/train/] + ratio_list: [1.0] + data_format: icdar #two data format: icdar/textnet + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - PGProcessTrain: + batch_size: 14 # same as loader: batch_size_per_card + min_crop_size: 24 + min_text_size: 4 + max_text_size: 512 + - KeepKeys: + keep_keys: [ 'images', 'tcl_maps', 'tcl_label_maps', 'border_maps','direction_maps', 'training_masks', 'label_list', 'pos_list', 'pos_mask' ] # dataloader will return list in this order + loader: + shuffle: True + drop_last: True + batch_size_per_card: 14 + num_workers: 16 + +Eval: + dataset: + name: PGDataSet + data_dir: ./train_data/ + label_file_list: [./train_data/total_text/test/] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - E2ELabelEncode: + - E2EResizeForTest: + max_side_len: 768 + - NormalizeImage: + scale: 1./255. 
+ mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'shape', 'polys', 'strs', 'tags' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml new file mode 100644 index 0000000..2d4261f --- /dev/null +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v3 + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml new file mode 100644 index 0000000..e7cbae5 --- /dev/null +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml @@ -0,0 +1,205 @@ +Global: + debug: false + use_gpu: true + epoch_num: 800 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v3_distillation + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: 
./output/rec/predicts_ppocrv3_distillation.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.0005, 0.00005] + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: &model_type "rec" + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + Student: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDMLLoss: + weight: 1.0 + act: "softmax" + use_log: true + model_name_pairs: + - ["Student", "Teacher"] + key: head_out + multi_head: True + dis_head: ctc + name: dml_ctc + - DistillationDMLLoss: + weight: 0.5 + act: "softmax" + use_log: true + model_name_pairs: + - ["Student", "Teacher"] + key: head_out + multi_head: True + dis_head: sar + name: dml_sar + - DistillationDistanceLoss: + weight: 1.0 + mode: "l2" + model_name_pairs: + - ["Student", "Teacher"] + key: backbone_out + - DistillationCTCLoss: + weight: 1.0 + model_name_list: ["Student", "Teacher"] + key: head_out + multi_head: True + - DistillationSARLoss: + weight: 1.0 + model_name_list: ["Student", "Teacher"] + key: head_out + multi_head: True + +PostProcess: + name: DistillationCTCLabelDecode + model_name: ["Student", "Teacher"] + key: head_out + multi_head: True + +Metric: + name: DistillationMetric + base_metric_name: RecMetric + main_indicator: acc + key: "Student" + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml new file mode 100644 index 0000000..af8b7ba --- /dev/null +++ b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 
500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_en_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/en_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_en.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/.gitkeep b/configs/rec/PP-OCRv3/multi_language/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml new file mode 100644 index 0000000..55cee5b --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_arabic_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/arabic_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_arabic.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + 
hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml new file mode 100644 index 0000000..135a360 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_chinese_cht_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/chinese_cht_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_chinese_cht.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - 
RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml new file mode 100644 index 0000000..5365152 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_cyrillic_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/cyrillic_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_cyrillic.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml new file mode 100644 index 0000000..023b4af --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_devanagari_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/devanagari_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + 
save_res_path: ./output/rec/predicts_ppocrv3_devanagari.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml new file mode 100644 index 0000000..065b327 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_japan_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/japan_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_japan.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - 
RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml new file mode 100644 index 0000000..5edb298 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_ka_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/ka_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_ka.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml new file mode 100644 index 0000000..eebeb09 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: 
./output/v3_korean_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/korean_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_korean.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml new file mode 100644 index 0000000..4f8e25b --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_latin_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/latin_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_latin.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + 
name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml new file mode 100644 index 0000000..15b8624 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_ta_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/ta_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_ta.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git 
a/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml new file mode 100644 index 0000000..9074c8d --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml @@ -0,0 +1,126 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_te_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/te_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_te.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml new file mode 100644 index 0000000..158815e --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec.yml @@ -0,0 +1,138 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: pytorchocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: true + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + 
head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml new file mode 100644 index 0000000..475c551 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ampO2_ultra.yml @@ -0,0 +1,140 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + use_amp: True + amp_level: O2 + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: 
NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 384 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ctc.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ctc.yml new file mode 100644 index 0000000..28901ed --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_ctc.yml @@ -0,0 +1,132 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: true + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + name: CTCHead + fc_decay: 0.00004 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml new 
file mode 100644 index 0000000..f613ee5 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml @@ -0,0 +1,231 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_dkd_400w_svtr_ctc_lcnet_blank_dkd0.1/ + save_epoch_step: 40 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: ./output/rec_dkd_400w_svtr_ctc_lcnet_blank_dkd0.1/latest + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: L2 + factor: 3.0e-05 +Architecture: + model_type: rec + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: true + return_all_feats: true + model_type: rec + algorithm: SVTR + Transform: null + Backbone: + name: SVTRNet + img_size: + - 48 + - 320 + out_char_num: 40 + out_channels: 192 + patch_merging: Conv + embed_dim: + - 64 + - 128 + - 256 + depth: + - 3 + - 6 + - 3 + num_heads: + - 2 + - 4 + - 8 + mixer: + - Conv + - Conv + - Conv + - Conv + - Conv + - Conv + - Global + - Global + - Global + - Global + - Global + - Global + local_mixer: + - - 5 + - 5 + - - 5 + - 5 + - - 5 + - 5 + last_stage: false + prenorm: true + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + Student: + pretrained: + freeze_params: false + return_all_feats: true + model_type: rec + algorithm: SVTR + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDKDLoss: + weight: 0.1 + model_name_pairs: + - - Student + - Teacher + key: head_out + multi_head: true + alpha: 1.0 + beta: 2.0 + dis_head: gtc + name: dkd + - DistillationCTCLoss: + weight: 1.0 + model_name_list: + - Student + key: head_out + multi_head: true + - DistillationNRTRLoss: + weight: 1.0 + smoothing: false + model_name_list: + - Student + key: head_out + multi_head: true + - DistillCTCLogits: + weight: 1.0 + reduction: mean + model_name_pairs: + - - Student + - Teacher + key: head_out +PostProcess: + name: DistillationCTCLabelDecode + model_name: + - Student + key: head_out + multi_head: true +Metric: + name: DistillationMetric + base_metric_name: RecMetric + main_indicator: acc + key: Student + ignore_space: false +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list.txt + ratio_list: + - 1.0 + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 8 + use_shared_memory: true +Eval: + dataset: + name: 
SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 +profiler_options: null diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_fp32_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_fp32_ultra.yml new file mode 100644 index 0000000..8c26730 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_fp32_ultra.yml @@ -0,0 +1,138 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml new file mode 100644 index 0000000..c181da6 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml @@ -0,0 +1,137 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: 
./output/rec_ppocr_v4_hgnet + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: pytorchocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 128 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_ampO2_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_ampO2_ultra.yml new file mode 100644 index 0000000..4303521 --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_ampO2_ultra.yml @@ -0,0 +1,139 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4_hgnet + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + use_amp: True + amp_level: O2 + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + 
name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 256 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_fp32_ultra.yml b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_fp32_ultra.yml new file mode 100644 index 0000000..ee9ebca --- /dev/null +++ b/configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet_fp32_ultra.yml @@ -0,0 +1,137 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4_hgnet + save_epoch_step: 10 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + max_text_length: *max_text_length + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc 
+ - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 256 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 16 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 16 diff --git a/configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml b/configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml new file mode 100644 index 0000000..b355fb4 --- /dev/null +++ b/configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml @@ -0,0 +1,150 @@ +Global: + debug: false + use_gpu: true + epoch_num: 50 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_ppocr_v4 + save_epoch_step: 10 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: refactor + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: pytorchocr/utils/en_dict.txt + max_text_length: 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3.txt +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 +Architecture: + model_type: rec + algorithm: SVTR_LCNet + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: + - 1 + - 3 + use_guide: true + Head: + fc_decay: 1.0e-05 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: null + - NRTRLoss: null +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc + ignore_space: false +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: + - 48 + - 320 + - 3 + max_text_length: 25 + - RecAug: null + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: + - - 320 + - 32 + - - 320 + - 48 + - - 320 + - 64 + first_bs: 96 + fix_bs: false + divided_factor: + - 8 + - 16 + is_training: true + loader: + shuffle: true + batch_size_per_card: 96 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: + - 3 + - 48 + - 320 + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 +profiler_options: null diff --git 
a/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml b/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml new file mode 100644 index 0000000..1db3e1c --- /dev/null +++ b/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml @@ -0,0 +1,100 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_chinese_common_v2.0 + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + character_type: ch + max_text_length: 25 + infer_mode: False + use_space_char: True + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00004 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: ResNet + layers: 34 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 256 + Head: + name: CTCHead + fc_decay: 0.00004 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/val_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml b/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml new file mode 100644 index 0000000..dc9d650 --- /dev/null +++ b/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_chinese_lite_v2.0 + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + character_type: ch + max_text_length: 25 + infer_mode: False + use_space_char: True + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + 
name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: [1, 2, 2, 2] + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: ["./train_data/val_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/generate_multi_language_configs.py b/configs/rec/multi_language/generate_multi_language_configs.py new file mode 100644 index 0000000..7a65401 --- /dev/null +++ b/configs/rec/multi_language/generate_multi_language_configs.py @@ -0,0 +1,200 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
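+# This script builds a language-specific recognition config, rec_{language}_lite_train.yml,
+# by filling rec_multi_language_lite_train.yml with the chosen language's dictionary,
+# label-file paths and output directory (see support_list below for the -l/--language options).
+# Example invocation (run from configs/rec/multi_language/; the paths are illustrative):
+#   python3 generate_multi_language_configs.py -l it \
+#     --train train_data/it_train.txt --val train_data/it_val.txt \
+#     --dict ppocr/utils/dict/it_dict.txt --data_dir train_data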
+ +import yaml +from argparse import ArgumentParser, RawDescriptionHelpFormatter +import os.path +import logging +logging.basicConfig(level=logging.INFO) + +support_list = { + 'it': 'italian', + 'es': 'spanish', + 'pt': 'portuguese', + 'ru': 'russian', + 'ar': 'arabic', + 'ta': 'tamil', + 'ug': 'uyghur', + 'fa': 'persian', + 'ur': 'urdu', + 'rs_latin': 'serbian latin', + 'oc': 'occitan', + 'rs_cyrillic': 'serbian cyrillic', + 'bg': 'bulgarian', + 'uk': 'ukranian', + 'be': 'belarusian', + 'te': 'telugu', + 'kn': 'kannada', + 'ch_tra': 'chinese tradition', + 'hi': 'hindi', + 'mr': 'marathi', + 'ne': 'nepali', +} +assert (os.path.isfile("./rec_multi_language_lite_train.yml") + ), "Loss basic configuration file rec_multi_language_lite_train.yml.\ +You can download it from \ +https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph/configs/rec/multi_language/" + +global_config = yaml.load( + open("./rec_multi_language_lite_train.yml", 'rb'), Loader=yaml.Loader) +project_path = os.path.abspath(os.path.join(os.getcwd(), "../../../")) + + +class ArgsParser(ArgumentParser): + def __init__(self): + super(ArgsParser, self).__init__( + formatter_class=RawDescriptionHelpFormatter) + self.add_argument( + "-o", "--opt", nargs='+', help="set configuration options") + self.add_argument( + "-l", + "--language", + nargs='+', + help="set language type, support {}".format(support_list)) + self.add_argument( + "--train", + type=str, + help="you can use this command to change the train dataset default path" + ) + self.add_argument( + "--val", + type=str, + help="you can use this command to change the eval dataset default path" + ) + self.add_argument( + "--dict", + type=str, + help="you can use this command to change the dictionary default path" + ) + self.add_argument( + "--data_dir", + type=str, + help="you can use this command to change the dataset default root path" + ) + + def parse_args(self, argv=None): + args = super(ArgsParser, self).parse_args(argv) + args.opt = self._parse_opt(args.opt) + args.language = self._set_language(args.language) + return args + + def _parse_opt(self, opts): + config = {} + if not opts: + return config + for s in opts: + s = s.strip() + k, v = s.split('=') + config[k] = yaml.load(v, Loader=yaml.Loader) + return config + + def _set_language(self, type): + assert (type), "please use -l or --language to choose language type" + assert( + type[0] in support_list.keys() + ),"the sub_keys(-l or --language) can only be one of support list: \n{},\nbut get: {}, " \ + "please check your running command".format(support_list, type) + global_config['Global'][ + 'character_dict_path'] = 'ppocr/utils/dict/{}_dict.txt'.format(type[ + 0]) + global_config['Global'][ + 'save_model_dir'] = './output/rec_{}_lite'.format(type[0]) + global_config['Train']['dataset'][ + 'label_file_list'] = ["train_data/{}_train.txt".format(type[0])] + global_config['Eval']['dataset'][ + 'label_file_list'] = ["train_data/{}_val.txt".format(type[0])] + global_config['Global']['character_type'] = type[0] + assert ( + os.path.isfile( + os.path.join(project_path, global_config['Global'][ + 'character_dict_path'])) + ), "Loss default dictionary file {}_dict.txt.You can download it from \ +https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph/ppocr/utils/dict/".format( + type[0]) + return type[0] + + +def merge_config(config): + """ + Merge config into global config. + Args: + config (dict): Config to be merged. + Returns: global config + """ + for key, value in config.items(): + if "." 
not in key: + if isinstance(value, dict) and key in global_config: + global_config[key].update(value) + else: + global_config[key] = value + else: + sub_keys = key.split('.') + assert ( + sub_keys[0] in global_config + ), "the sub_keys can only be one of global_config: {}, but get: {}, please check your running command".format( + global_config.keys(), sub_keys[0]) + cur = global_config[sub_keys[0]] + for idx, sub_key in enumerate(sub_keys[1:]): + if idx == len(sub_keys) - 2: + cur[sub_key] = value + else: + cur = cur[sub_key] + + +def loss_file(path): + assert ( + os.path.exists(path) + ), "There is no such file:{},Please do not forget to put in the specified file".format( + path) + + +if __name__ == '__main__': + FLAGS = ArgsParser().parse_args() + merge_config(FLAGS.opt) + save_file_path = 'rec_{}_lite_train.yml'.format(FLAGS.language) + if os.path.isfile(save_file_path): + os.remove(save_file_path) + + if FLAGS.train: + global_config['Train']['dataset']['label_file_list'] = [FLAGS.train] + train_label_path = os.path.join(project_path, FLAGS.train) + loss_file(train_label_path) + if FLAGS.val: + global_config['Eval']['dataset']['label_file_list'] = [FLAGS.val] + eval_label_path = os.path.join(project_path, FLAGS.val) + loss_file(eval_label_path) + if FLAGS.dict: + global_config['Global']['character_dict_path'] = FLAGS.dict + dict_path = os.path.join(project_path, FLAGS.dict) + loss_file(dict_path) + if FLAGS.data_dir: + global_config['Eval']['dataset']['data_dir'] = FLAGS.data_dir + global_config['Train']['dataset']['data_dir'] = FLAGS.data_dir + data_dir = os.path.join(project_path, FLAGS.data_dir) + loss_file(data_dir) + + with open(save_file_path, 'w') as f: + yaml.dump( + dict(global_config), f, default_flow_style=False, sort_keys=False) + logging.info("Project path is :{}".format(project_path)) + logging.info("Train list path set to :{}".format(global_config['Train'][ + 'dataset']['label_file_list'][0])) + logging.info("Eval list path set to :{}".format(global_config['Eval'][ + 'dataset']['label_file_list'][0])) + logging.info("Dataset root path set to :{}".format(global_config['Eval'][ + 'dataset']['data_dir'])) + logging.info("Dict path set to :{}".format(global_config['Global'][ + 'character_dict_path'])) + logging.info("Config file set to :configs/rec/multi_language/{}". 
+ format(save_file_path)) diff --git a/configs/rec/multi_language/rec_arabic_lite_train.yml b/configs/rec/multi_language/rec_arabic_lite_train.yml new file mode 100644 index 0000000..6dcfd1b --- /dev/null +++ b/configs/rec/multi_language/rec_arabic_lite_train.yml @@ -0,0 +1,111 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_arabic_lite + save_epoch_step: 3 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: null + character_dict_path: ppocr/utils/dict/arabic_dict.txt + character_type: arabic + max_text_length: 25 + infer_mode: false + use_space_char: true +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: L2 + factor: 1.0e-05 +Architecture: + model_type: rec + algorithm: CRNN + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: + - 1 + - 2 + - 2 + - 2 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 1.0e-05 +Loss: + name: CTCLoss +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/arabic_train.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: null + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 256 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/arabic_val.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_cyrillic_lite_train.yml b/configs/rec/multi_language/rec_cyrillic_lite_train.yml new file mode 100644 index 0000000..52527c1 --- /dev/null +++ b/configs/rec/multi_language/rec_cyrillic_lite_train.yml @@ -0,0 +1,111 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_cyrillic_lite + save_epoch_step: 3 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: null + character_dict_path: ppocr/utils/dict/cyrillic_dict.txt + character_type: cyrillic + max_text_length: 25 + infer_mode: false + use_space_char: true +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: L2 + factor: 1.0e-05 +Architecture: + model_type: rec + algorithm: CRNN + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: + - 1 + - 2 + - 2 + - 2 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 1.0e-05 +Loss: + name: CTCLoss +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/cyrillic_train.txt + transforms: + - 
DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: null + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 256 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/cyrillic_val.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_devanagari_lite_train.yml b/configs/rec/multi_language/rec_devanagari_lite_train.yml new file mode 100644 index 0000000..e1a7c82 --- /dev/null +++ b/configs/rec/multi_language/rec_devanagari_lite_train.yml @@ -0,0 +1,111 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_devanagari_lite + save_epoch_step: 3 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: null + character_dict_path: ppocr/utils/dict/devanagari_dict.txt + character_type: devanagari + max_text_length: 25 + infer_mode: false + use_space_char: true +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: L2 + factor: 1.0e-05 +Architecture: + model_type: rec + algorithm: CRNN + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: + - 1 + - 2 + - 2 + - 2 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 1.0e-05 +Loss: + name: CTCLoss +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/devanagari_train.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: null + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 256 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/devanagari_val.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_en_number_lite_train.yml b/configs/rec/multi_language/rec_en_number_lite_train.yml new file mode 100644 index 0000000..13eda84 --- /dev/null +++ b/configs/rec/multi_language/rec_en_number_lite_train.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_en_number_lite + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + 
use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict/en_dict.txt + character_type: EN + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: [1, 2, 2, 2] + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/eval_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_french_lite_train.yml b/configs/rec/multi_language/rec_french_lite_train.yml new file mode 100644 index 0000000..63378d3 --- /dev/null +++ b/configs/rec/multi_language/rec_french_lite_train.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_french_lite + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict/french_dict.txt + character_type: french + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: [1, 2, 2, 2] + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + 
loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/eval_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_german_lite_train.yml b/configs/rec/multi_language/rec_german_lite_train.yml new file mode 100644 index 0000000..1651510 --- /dev/null +++ b/configs/rec/multi_language/rec_german_lite_train.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_german_lite + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict/german_dict.txt + character_type: german + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: [1, 2, 2, 2] + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/eval_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_japan_lite_train.yml b/configs/rec/multi_language/rec_japan_lite_train.yml new file mode 100644 index 0000000..bb47584 --- /dev/null +++ b/configs/rec/multi_language/rec_japan_lite_train.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_japan_lite + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set 
to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict/japan_dict.txt + character_type: japan + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: [1, 2, 2, 2] + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/eval_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_korean_lite_train.yml b/configs/rec/multi_language/rec_korean_lite_train.yml new file mode 100644 index 0000000..77f1552 --- /dev/null +++ b/configs/rec/multi_language/rec_korean_lite_train.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_korean_lite + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict/korean_dict.txt + character_type: korean + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: [1, 2, 2, 2] + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - 
KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/eval_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_latin_lite_train.yml b/configs/rec/multi_language/rec_latin_lite_train.yml new file mode 100644 index 0000000..e71112b --- /dev/null +++ b/configs/rec/multi_language/rec_latin_lite_train.yml @@ -0,0 +1,111 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_latin_lite + save_epoch_step: 3 + eval_batch_step: + - 0 + - 2000 + cal_metric_during_train: true + pretrained_model: null + checkpoints: null + save_inference_dir: null + use_visualdl: false + infer_img: null + character_dict_path: ppocr/utils/dict/latin_dict.txt + character_type: latin + max_text_length: 25 + infer_mode: false + use_space_char: true +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: L2 + factor: 1.0e-05 +Architecture: + model_type: rec + algorithm: CRNN + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: + - 1 + - 2 + - 2 + - 2 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 1.0e-05 +Loss: + name: CTCLoss +PostProcess: + name: CTCLabelDecode +Metric: + name: RecMetric + main_indicator: acc +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/latin_train.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: null + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 256 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: + - train_data/latin_val.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: null + - RecResizeImg: + image_shape: + - 3 + - 32 + - 320 + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/multi_language/rec_multi_language_lite_train.yml b/configs/rec/multi_language/rec_multi_language_lite_train.yml new file mode 100644 index 0000000..c42a3d1 --- /dev/null +++ b/configs/rec/multi_language/rec_multi_language_lite_train.yml @@ -0,0 +1,103 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_multi_language_lite + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + 
character_dict_path: + # Set the language of training, if set, select the default dictionary file + character_type: + max_text_length: 25 + infer_mode: False + use_space_char: True + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: small + small_stride: [1, 2, 2, 2] + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 48 + Head: + name: CTCHead + fc_decay: 0.00001 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ + label_file_list: ["./train_data/val_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/rec_d28_can.yml b/configs/rec/rec_d28_can.yml new file mode 100644 index 0000000..b6aa26b --- /dev/null +++ b/configs/rec/rec_d28_can.yml @@ -0,0 +1,122 @@ +Global: + use_gpu: True + epoch_num: 240 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/can/ + save_epoch_step: 1 + # evaluation is run every 1105 iterations (1 epoch)(batch_size = 8) + eval_batch_step: [0, 1105] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/datasets/crohme_demo/hme_00.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/latex_symbol_dict.txt + max_text_length: 36 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_can.txt + +Optimizer: + name: Momentum + momentum: 0.9 + clip_norm_global: 100.0 + lr: + name: TwoStepCosine + learning_rate: 0.01 + warmup_epoch: 1 + weight_decay: 0.0001 + +Architecture: + model_type: rec + algorithm: CAN + in_channels: 1 + Transform: + Backbone: + name: DenseNet + growthRate: 24 + reduction: 0.5 + bottleneck: True + use_dropout: True + input_channel: 1 + Head: + name: CANHead + in_channel: 684 + out_channel: 111 + max_text_length: 36 + ratio: 16 + attdecoder: + is_train: False + input_size: 256 + hidden_size: 256 + encoder_out_channel: 684 + dropout: True + dropout_ratio: 0.5 + word_num: 111 + counting_decoder_out_channel: 111 + attention: + attention_dim: 512 + word_conv_kernel: 1 + +Loss: + name: CANLoss + +PostProcess: + name: CANLabelDecode + +Metric: + name: CANMetric + main_indicator: exp_rate + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/CROHME/training/images/ + label_file_list: ["./train_data/CROHME/training/labels.txt"] + transforms: + - DecodeImage: + channel_first: False + - NormalizeImage: + mean: [0,0,0] + std: 
[1,1,1] + order: 'hwc' + - GrayImageChannelFormat: + inverse: True + - CANLabelEncode: + lower: False + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: True + batch_size_per_card: 8 + drop_last: False + num_workers: 4 + collate_fn: DyMaskCollator + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/CROHME/evaluation/images/ + label_file_list: ["./train_data/CROHME/evaluation/labels.txt"] + transforms: + - DecodeImage: + channel_first: False + - NormalizeImage: + mean: [0,0,0] + std: [1,1,1] + order: 'hwc' + - GrayImageChannelFormat: + inverse: True + - CANLabelEncode: + lower: False + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 + num_workers: 4 + collate_fn: DyMaskCollator diff --git a/configs/rec/rec_efficientb3_fpn_pren.yml b/configs/rec/rec_efficientb3_fpn_pren.yml new file mode 100644 index 0000000..0fac6a7 --- /dev/null +++ b/configs/rec/rec_efficientb3_fpn_pren.yml @@ -0,0 +1,92 @@ +Global: + use_gpu: True + epoch_num: 8 + log_smooth_window: 20 + print_batch_step: 5 + save_model_dir: ./output/rec/pren_new + save_epoch_step: 3 + # evaluation is run every 2000 iterations after the 4000th iteration + eval_batch_step: [4000, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: + max_text_length: &max_text_length 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_pren.txt + +Optimizer: + name: Adadelta + lr: + name: Piecewise + decay_epochs: [2, 5, 7] + values: [0.5, 0.1, 0.01, 0.001] + +Architecture: + model_type: rec + algorithm: PREN + in_channels: 3 + Backbone: + name: EfficientNetb3_PREN + Neck: + name: PRENFPN + n_r: 5 + d_model: 384 + max_len: *max_text_length + dropout: 0.1 + Head: + name: PRENHead + +Loss: + name: PRENLoss + +PostProcess: + name: PRENLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: + img_mode: BGR + channel_first: False + - PRENLabelEncode: + - RecAug: + - PRENResizeImg: + image_shape: [64, 256] # h,w + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: True + batch_size_per_card: 128 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: + img_mode: BGR + channel_first: False + - PRENLabelEncode: + - PRENResizeImg: + image_shape: [64, 256] # h,w + - KeepKeys: + keep_keys: ['image', 'label'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 64 + num_workers: 8 diff --git a/configs/rec/rec_icdar15_train.yml b/configs/rec/rec_icdar15_train.yml new file mode 100644 index 0000000..8a743b5 --- /dev/null +++ b/configs/rec/rec_icdar15_train.yml @@ -0,0 +1,100 @@ +Global: + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/ic15/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: ppocr/utils/ic15_dict.txt + character_type: 
ch + max_text_length: 25 + infer_mode: False + use_space_char: False + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 96 + Head: + name: CTCHead + fc_decay: 0 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: ["./train_data/train_list.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 + use_shared_memory: False diff --git a/configs/rec/rec_mtb_nrtr.yml b/configs/rec/rec_mtb_nrtr.yml new file mode 100644 index 0000000..765e282 --- /dev/null +++ b/configs/rec/rec_mtb_nrtr.yml @@ -0,0 +1,101 @@ +Global: + use_gpu: True + epoch_num: 21 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/nrtr/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: pytorchocr/utils/EN_symbol_dict.txt + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_nrtr.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.99 + clip_norm: 5.0 + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0. + +Architecture: + model_type: rec + algorithm: NRTR + in_channels: 1 + Transform: + Backbone: + name: MTB + cnn_num: 2 + Head: + name: Transformer + d_model: 512 + num_encoder_layers: 6 + beam_size: -1 # When Beam size is greater than 0, it means to use beam search when evaluation. 
+ + +Loss: + name: NRTRLoss + smoothing: True + +PostProcess: + name: NRTRLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - NRTRLabelEncode: # Class handling label + - NRTRRecResizeImg: + image_shape: [100, 32] + resize_type: PIL # PIL or OpenCV + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 512 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - NRTRLabelEncode: # Class handling label + - NRTRRecResizeImg: + image_shape: [100, 32] + resize_type: PIL # PIL or OpenCV + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 1 + use_shared_memory: False diff --git a/configs/rec/rec_mv3_none_bilstm_ctc.yml b/configs/rec/rec_mv3_none_bilstm_ctc.yml new file mode 100644 index 0000000..00c1db8 --- /dev/null +++ b/configs/rec/rec_mv3_none_bilstm_ctc.yml @@ -0,0 +1,96 @@ +Global: + use_gpu: True + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/mv3_none_bilstm_ctc/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 96 + Head: + name: CTCHead + fc_decay: 0 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 diff --git a/configs/rec/rec_mv3_none_none_ctc.yml b/configs/rec/rec_mv3_none_none_ctc.yml new file mode 100644 index 0000000..6711b1d --- /dev/null +++ 
b/configs/rec/rec_mv3_none_none_ctc.yml @@ -0,0 +1,95 @@ +Global: + use_gpu: True + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/mv3_none_none_ctc/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: Rosetta + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + fc_decay: 0.0004 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/rec_mv3_tps_bilstm_att.yml b/configs/rec/rec_mv3_tps_bilstm_att.yml new file mode 100644 index 0000000..3cf1f7a --- /dev/null +++ b/configs/rec/rec_mv3_tps_bilstm_att.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: True + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_mv3_tps_bilstm_att/ + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: RARE + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 0.1 + model_name: small + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 96 + Head: + name: AttentionHead + hidden_size: 96 + + +Loss: + name: AttentionLoss + +PostProcess: + name: AttnLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: 
./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 1 diff --git a/configs/rec/rec_mv3_tps_bilstm_ctc.yml b/configs/rec/rec_mv3_tps_bilstm_ctc.yml new file mode 100644 index 0000000..4e86709 --- /dev/null +++ b/configs/rec/rec_mv3_tps_bilstm_ctc.yml @@ -0,0 +1,100 @@ +Global: + use_gpu: True + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/mv3_tps_bilstm_ctc/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: STARNet + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 0.1 + model_name: small + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 96 + Head: + name: CTCHead + fc_decay: 0.0004 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml new file mode 100644 index 0000000..65e7877 --- /dev/null +++ b/configs/rec/rec_r31_sar.yml @@ -0,0 +1,98 @@ +Global: + use_gpu: true + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./sar_rec + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + 
cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict90.txt + max_text_length: 30 + infer_mode: False + use_space_char: False + rm_symbol: True + save_res_path: ./output/rec/predicts_sar.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs: [3, 4] + values: [0.001, 0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: SAR + Transform: + Backbone: + name: ResNet31 + Head: + name: SARHead + +Loss: + name: SARLoss + +PostProcess: + name: SARLabelDecode + +Metric: + name: RecMetric + + +Train: + dataset: + name: SimpleDataSet + label_file_list: ['./train_data/train_list.txt'] + data_dir: ./train_data/ + ratio_list: 1.0 + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - SARRecResizeImg: + image_shape: [3, 48, 48, 160] # h:48 w:[48,160] + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 64 + drop_last: True + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - SARRecResizeImg: + image_shape: [3, 48, 48, 160] + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 64 + num_workers: 4 + use_shared_memory: False + diff --git a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml new file mode 100644 index 0000000..e4d301a --- /dev/null +++ b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml @@ -0,0 +1,95 @@ +Global: + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/r34_vd_none_bilstm_ctc/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: ResNet + layers: 34 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 256 + Head: + name: CTCHead + fc_decay: 0 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + 
drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 diff --git a/configs/rec/rec_r34_vd_none_none_ctc.yml b/configs/rec/rec_r34_vd_none_none_ctc.yml new file mode 100644 index 0000000..4a17a00 --- /dev/null +++ b/configs/rec/rec_r34_vd_none_none_ctc.yml @@ -0,0 +1,93 @@ +Global: + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/r34_vd_none_none_ctc/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: Rosetta + Backbone: + name: ResNet + layers: 34 + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + fc_decay: 0.0004 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 diff --git a/configs/rec/rec_r34_vd_tps_bilstm_att.yml b/configs/rec/rec_r34_vd_tps_bilstm_att.yml new file mode 100644 index 0000000..659a172 --- /dev/null +++ b/configs/rec/rec_r34_vd_tps_bilstm_att.yml @@ -0,0 +1,101 @@ +Global: + use_gpu: True + epoch_num: 400 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/b3_rare_r34_none_gru/ + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + 
name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: rec + algorithm: RARE + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 0.1 + model_name: large + Backbone: + name: ResNet + layers: 34 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 256 #96 + Head: + name: AttentionHead # AttentionHead + hidden_size: 256 # + l2_decay: 0.00001 + +Loss: + name: AttentionLoss + +PostProcess: + name: AttnLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml new file mode 100644 index 0000000..7e9d13a --- /dev/null +++ b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml @@ -0,0 +1,99 @@ +Global: + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/r34_vd_tps_bilstm_ctc/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: STARNet + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 0.1 + model_name: large + Backbone: + name: ResNet + layers: 34 + Neck: + name: SequenceEncoder + encoder_type: reshape + hidden_size: 256 + Head: + name: CTCHead + fc_decay: 0 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + 
image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 diff --git a/configs/rec/rec_r50_fpn_srn.yml b/configs/rec/rec_r50_fpn_srn.yml new file mode 100644 index 0000000..6b38616 --- /dev/null +++ b/configs/rec/rec_r50_fpn_srn.yml @@ -0,0 +1,107 @@ +Global: + use_gpu: True + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 5 + save_model_dir: ./output/rec/srn_new + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 5000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + num_heads: 8 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 10.0 + lr: + learning_rate: 0.0001 + +Architecture: + model_type: rec + algorithm: SRN + in_channels: 1 + Transform: + Backbone: + name: ResNetFPN + Head: + name: SRNHead + max_text_length: 25 + num_heads: 8 + num_encoder_TUs: 2 + num_decoder_TUs: 4 + hidden_dims: 512 + +Loss: + name: SRNLoss + +PostProcess: + name: SRNLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SRNLabelEncode: # Class handling label + - SRNRecResizeImg: + image_shape: [1, 64, 256] + - KeepKeys: + keep_keys: ['image', + 'label', + 'length', + 'encoder_word_pos', + 'gsrm_word_pos', + 'gsrm_slf_attn_bias1', + 'gsrm_slf_attn_bias2'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 64 + drop_last: False + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SRNLabelEncode: # Class handling label + - SRNRecResizeImg: + image_shape: [1, 64, 256] + - KeepKeys: + keep_keys: ['image', + 'label', + 'length', + 'encoder_word_pos', + 'gsrm_word_pos', + 'gsrm_slf_attn_bias1', + 'gsrm_slf_attn_bias2'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 32 + num_workers: 4 diff --git a/configs/rec/rec_svtr/rec_svtr_base_8local_10global_stn_en.yml b/configs/rec/rec_svtr/rec_svtr_base_8local_10global_stn_en.yml new file mode 100644 index 0000000..9127838 --- /dev/null +++ b/configs/rec/rec_svtr/rec_svtr_base_8local_10global_stn_en.yml @@ -0,0 +1,117 @@ +Global: + use_gpu: True + epoch_num: 20 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_svtr_base_stn_en/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_svtr_base.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 
0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.00025 + warmup_epoch: 2 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [48, 160] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: SVTRNet + img_size: [48, 160] + out_char_num: 40 # output char patch + out_channels: 256 # char patch dim + patch_merging: 'Conv' + embed_dim: [128, 256, 384] + depth: [3, 6, 9] + num_heads: [4, 8, 12] + mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + last_stage: True + prenorm: False + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: SVTRLabelDecode # SVTRLabelDecode is used for eval after train, please change to CTCLabelDecode when training + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval after train, please change to RecResizeImg when training + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 128 + num_workers: 2 diff --git a/configs/rec/rec_svtr/rec_svtr_large_10local_11global_stn_ch.yml b/configs/rec/rec_svtr/rec_svtr_large_10local_11global_stn_ch.yml new file mode 100644 index 0000000..ed393ad --- /dev/null +++ b/configs/rec/rec_svtr/rec_svtr_large_10local_11global_stn_ch.yml @@ -0,0 +1,113 @@ +Global: + use_gpu: True + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_svtr_large_ch/ + save_epoch_step: 10 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: 40 + infer_mode: False + use_space_char: True + save_res_path: ./output/rec/predicts_svtr_large_ch.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.0003 + warmup_epoch: 5 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [32, 320] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none 
+ Backbone: + name: SVTRNet + img_size: [32, 320] + out_char_num: 40 + out_channels: 384 + patch_merging: 'Conv' + embed_dim: [192, 256, 512] + depth: [3, 9, 9] + num_heads: [6, 8, 16] + mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + prenorm: False + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/scene_ch/ch_scene + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 128 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/scene_ch/scene_test + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 2 diff --git a/configs/rec/rec_svtr/rec_svtr_large_10local_11global_stn_en.yml b/configs/rec/rec_svtr/rec_svtr_large_10local_11global_stn_en.yml new file mode 100644 index 0000000..aac7516 --- /dev/null +++ b/configs/rec/rec_svtr/rec_svtr_large_10local_11global_stn_en.yml @@ -0,0 +1,117 @@ +Global: + use_gpu: True + epoch_num: 20 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_svtr_large_en/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_svtr_large.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.000125 + warmup_epoch: 2 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [48, 160] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: SVTRNet + img_size: [48, 160] + out_char_num: 40 + out_channels: 384 + patch_merging: 'Conv' + embed_dim: [192, 256, 512] + depth: [3, 9, 9] + num_heads: [6, 8, 16] + mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + prenorm: false + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: SVTRLabelDecode # SVTRLabelDecode is used for eval after train, please change to 
CTCLabelDecode when training + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - RecAug: + - CTCLabelEncode: # Class handling label + - RecResizeImg: + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 128 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval after train, please change to RecResizeImg when training + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 128 + num_workers: 2 diff --git a/configs/rec/rec_svtr/rec_svtr_small_8local_7global_stn_ch.yml b/configs/rec/rec_svtr/rec_svtr_small_8local_7global_stn_ch.yml new file mode 100644 index 0000000..985f0d3 --- /dev/null +++ b/configs/rec/rec_svtr/rec_svtr_small_8local_7global_stn_ch.yml @@ -0,0 +1,114 @@ +Global: + use_gpu: True + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_svtr_small_ch/ + save_epoch_step: 10 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: 40 + infer_mode: False + use_space_char: True + save_res_path: ./output/rec/predicts_svtr_small_ch.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.0003 + warmup_epoch: 5 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [32, 320] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: SVTRNet + img_size: [32, 320] + out_char_num: 40 + out_channels: 192 + patch_merging: 'Conv' + embed_dim: [96, 192, 256] + depth: [3, 6, 6] + num_heads: [3, 6, 8] + mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + last_stage: True + prenorm: False + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/scene_ch/ch_scene + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 128 + 
drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/scene_ch/scene_test + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 2 diff --git a/configs/rec/rec_svtr/rec_svtr_small_8local_7global_stn_en.yml b/configs/rec/rec_svtr/rec_svtr_small_8local_7global_stn_en.yml new file mode 100644 index 0000000..2405d3e --- /dev/null +++ b/configs/rec/rec_svtr/rec_svtr_small_8local_7global_stn_en.yml @@ -0,0 +1,117 @@ +Global: + use_gpu: True + epoch_num: 20 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_svtr_small_stn_en/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_svtr_small.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 2 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [32, 100] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: SVTRNet + img_size: [32, 100] + out_char_num: 25 + out_channels: 192 + patch_merging: 'Conv' + embed_dim: [96, 192, 256] + depth: [3, 6, 6] + num_heads: [3, 6, 8] + mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + last_stage: True + prenorm: False + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: SVTRLabelDecode # SVTRLabelDecode is used for eval after train, please change to CTCLabelDecode when training + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 512 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval after train, please change to RecResizeImg when training + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + 
drop_last: False + batch_size_per_card: 256 + num_workers: 2 diff --git a/configs/rec/rec_svtr/rec_svtr_tiny_6local_6global_stn_ch.yml b/configs/rec/rec_svtr/rec_svtr_tiny_6local_6global_stn_ch.yml new file mode 100644 index 0000000..42fe93b --- /dev/null +++ b/configs/rec/rec_svtr/rec_svtr_tiny_6local_6global_stn_ch.yml @@ -0,0 +1,114 @@ +Global: + use_gpu: True + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_svtr_tiny_ch/ + save_epoch_step: 10 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: 40 + infer_mode: False + use_space_char: True + save_res_path: ./output/rec/predicts_svtr_tiny_ch.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.0003 + warmup_epoch: 5 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [32, 320] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: SVTRNet + img_size: [32, 320] + out_char_num: 40 + out_channels: 192 + patch_merging: 'Conv' + embed_dim: [64, 128, 256] + depth: [3, 6, 3] + num_heads: [2, 4, 8] + mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + last_stage: True + prenorm: false + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/scene_ch/ch_scene + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 128 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/scene_ch/scene_test + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 2 diff --git a/configs/rec/rec_svtr/rec_svtr_tiny_6local_6global_stn_en.yml b/configs/rec/rec_svtr/rec_svtr_tiny_6local_6global_stn_en.yml new file mode 100644 index 0000000..2a41390 --- /dev/null +++ b/configs/rec/rec_svtr/rec_svtr_tiny_6local_6global_stn_en.yml @@ -0,0 +1,117 @@ +Global: + use_gpu: True + epoch_num: 20 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_svtr_tiny_en/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + 
infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_svtr_tiny.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 2 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [32, 100] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: SVTRNet + img_size: [32, 100] + out_char_num: 25 + out_channels: 192 + patch_merging: 'Conv' + embed_dim: [64, 128, 256] + depth: [3, 6, 3] + num_heads: [2, 4, 8] + mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + last_stage: True + prenorm: false + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 512 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 2 diff --git a/configs/rec/rec_svtrnet.yml b/configs/rec/rec_svtrnet.yml new file mode 100644 index 0000000..233d5e2 --- /dev/null +++ b/configs/rec/rec_svtrnet.yml @@ -0,0 +1,117 @@ +Global: + use_gpu: True + epoch_num: 20 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/svtr/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_svtr_tiny.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 0.00000008 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed + one_dim_param_no_weight_decay: true + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 2 + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + name: STN_ON + tps_inputsize: [32, 64] + tps_outputsize: [32, 100] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: SVTRNet + img_size: [32, 100] + out_char_num: 25 + out_channels: 192 
+ patch_merging: 'Conv' + embed_dim: [64, 128, 256] + depth: [3, 6, 3] + num_heads: [2, 4, 8] + mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + last_stage: True + prenorm: false + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 512 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + character_dict_path: + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 2 diff --git a/configs/rec/rec_vitstr_none_ce.yml b/configs/rec/rec_vitstr_none_ce.yml new file mode 100644 index 0000000..ebe304f --- /dev/null +++ b/configs/rec/rec_vitstr_none_ce.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: True + epoch_num: 20 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/vitstr_none_ce/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations after the 0th iteration# + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: ppocr/utils/EN_symbol_dict.txt + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_vitstr.txt + + +Optimizer: + name: Adadelta + epsilon: 1.e-8 + rho: 0.95 + clip_norm: 5.0 + lr: + learning_rate: 1.0 + +Architecture: + model_type: rec + algorithm: ViTSTR + in_channels: 1 + Transform: + Backbone: + name: ViTSTR + scale: tiny + Neck: + name: SequenceEncoder + encoder_type: reshape + Head: + name: CTCHead + +Loss: + name: CELoss + with_all: True + ignore_index: &ignore_index 0 # Must be zero or greater than the number of character classes + +PostProcess: + name: ViTSTRLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - ViTSTRLabelEncode: # Class handling label + ignore_index: *ignore_index + - GrayRecResizeImg: + image_shape: [224, 224] # W H + resize_type: PIL # PIL or OpenCV + inter_type: 'Image.BICUBIC' + scale: false + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 48 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + 
channel_first: False + - ViTSTRLabelEncode: # Class handling label + ignore_index: *ignore_index + - GrayRecResizeImg: + image_shape: [224, 224] # W H + resize_type: PIL # PIL or OpenCV + inter_type: 'Image.BICUBIC' + scale: false + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 2 diff --git a/configs/sr/sr_telescope.yml b/configs/sr/sr_telescope.yml new file mode 100644 index 0000000..1152596 --- /dev/null +++ b/configs/sr/sr_telescope.yml @@ -0,0 +1,84 @@ +Global: + use_gpu: true + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/sr/sr_telescope/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: ./output/sr/sr_telescope/infer + use_visualdl: False + infer_img: doc/imgs_words_en/word_52.png + # for data or label process + character_dict_path: + max_text_length: 100 + infer_mode: False + use_space_char: False + save_res_path: ./output/sr/predicts_telescope.txt + +Optimizer: + name: Adam + beta1: 0.5 + beta2: 0.999 + clip_norm: 0.25 + lr: + learning_rate: 0.0001 + +Architecture: + model_type: sr + algorithm: Telescope + Transform: + name: TBSRN + STN: True + infer_mode: True + +Loss: + name: TelescopeLoss + confuse_dict_path: ./ppocr/utils/dict/confuse.pkl + + +PostProcess: + name: None + +Metric: + name: SRMetric + main_indicator: all + +Train: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/TextZoom/train + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'label'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 16 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/TextZoom/test + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 16 + num_workers: 4 + diff --git a/configs/sr/sr_tsrn_transformer_strock.yml b/configs/sr/sr_tsrn_transformer_strock.yml new file mode 100644 index 0000000..2e50948 --- /dev/null +++ b/configs/sr/sr_tsrn_transformer_strock.yml @@ -0,0 +1,85 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/sr/sr_tsrn_transformer_strock/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: sr_output + use_visualdl: False + infer_img: doc/imgs_words_en/word_52.png + # for data or label process + character_dict_path: ./train_data/srdata/english_decomposition.txt + max_text_length: 100 + infer_mode: False + use_space_char: False + save_res_path: ./output/sr/predicts_gestalt.txt + +Optimizer: + name: Adam + beta1: 0.5 + beta2: 0.999 + clip_norm: 0.25 + lr: + learning_rate: 0.0001 + +Architecture: + model_type: sr + algorithm: Gestalt + Transform: + name: TSRN + STN: True + infer_mode: True + +Loss: + name: StrokeFocusLoss + character_dict_path: ./train_data/srdata/english_decomposition.txt + +PostProcess: + name: None + +Metric: + name: SRMetric + main_indicator: all + +Train: + dataset: + name: LMDBDataSetSR + 
data_dir: ./train_data/srdata/train + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - SRLabelEncode: # Class handling label + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'length', 'input_tensor', 'label'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 16 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/srdata/test + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - SRLabelEncode: # Class handling label + - KeepKeys: + keep_keys: ['img_lr', 'img_hr','length', 'input_tensor', 'label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 16 + num_workers: 4 + diff --git a/configs/table/table_mv3.yml b/configs/table/table_mv3.yml new file mode 100644 index 0000000..a74e18d --- /dev/null +++ b/configs/table/table_mv3.yml @@ -0,0 +1,116 @@ +Global: + use_gpu: true + epoch_num: 50 + log_smooth_window: 20 + print_batch_step: 5 + save_model_dir: ./output/table_mv3/ + save_epoch_step: 5 + # evaluation is run every 400 iterations after the 0th iteration + eval_batch_step: [0, 400] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/table_structure_dict.txt + character_type: en + max_text_length: 100 + max_elem_length: 500 + max_cell_num: 500 + infer_mode: False + process_total_num: 0 + process_cut_num: 0 + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 5.0 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: table + algorithm: TableAttn + Backbone: + name: MobileNetV3 + scale: 1.0 + model_name: small + disable_se: True + Head: + name: TableAttentionHead + hidden_size: 256 + l2_decay: 0.00001 + loc_type: 2 + +Loss: + name: TableAttentionLoss + structure_weight: 100.0 + loc_weight: 10000.0 + +PostProcess: + name: TableLabelDecode + +Metric: + name: TableMetric + main_indicator: acc + +Train: + dataset: + name: PubTabDataSet + data_dir: train_data/table/pubtabnet/train/ + label_file_path: train_data/table/pubtabnet/PubTabNet_2.0.0_train.jsonl + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - ResizeTableImage: + max_len: 488 + - TableLabelEncode: + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'structure', 'bbox_list', 'sp_tokens', 'bbox_list_mask'] + loader: + shuffle: True + batch_size_per_card: 32 + drop_last: True + num_workers: 1 + +Eval: + dataset: + name: PubTabDataSet + data_dir: train_data/table/pubtabnet/val/ + label_file_path: train_data/table/pubtabnet/PubTabNet_2.0.0_val.jsonl + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - ResizeTableImage: + max_len: 488 + - TableLabelEncode: + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'structure', 'bbox_list', 'sp_tokens', 'bbox_list_mask'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 16 + num_workers: 1 diff --git a/configs/table/table_mv3_det.yml b/configs/table/table_mv3_det.yml new file mode 100644 index 0000000..f9bb4ba --- /dev/null +++ b/configs/table/table_mv3_det.yml @@ -0,0 +1,15 @@ +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: true + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 \ No newline at end of file diff --git a/configs/table/table_mv3_rec.yml b/configs/table/table_mv3_rec.yml new file mode 100644 index 0000000..8510ca1 --- /dev/null +++ b/configs/table/table_mv3_rec.yml @@ -0,0 +1,14 @@ +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV3 + model_name: large + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 96 + Head: + name: CTCHead + fc_decay: 0 diff --git a/configs/table/table_mv3_table_structure.yml b/configs/table/table_mv3_table_structure.yml new file mode 100644 index 0000000..6398e25 --- /dev/null +++ b/configs/table/table_mv3_table_structure.yml @@ -0,0 +1,15 @@ +Architecture: + model_type: table + algorithm: TableAttn + Backbone: + name: MobileNetV3 + scale: 1.0 + model_name: large + disable_se: True + Head: + name: TableAttentionHead + hidden_size: 256 + l2_decay: 0.00001 + loc_type: 2 + max_elem_length: 800 + in_max_len: 512 diff --git a/convension/MDF/convert.py b/convension/MDF/convert.py new file mode 100644 index 0000000..41458c3 --- /dev/null +++ b/convension/MDF/convert.py @@ -0,0 +1,24 @@ +import os, sys +os.environ['CUDA_MODULE_LOADING'] = 'LAZY' +__dir__ = os.path.dirname(__file__) +sys.path.append(os.path.dirname(os.path.dirname(__dir__))) +from batch_running_task.task_layout.get_batch_yolo import mfd_process, get_batch_YOLO_model +import yaml +with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + +img_size = model_configs['model_args']['img_size'] +conf_thres= model_configs['model_args']['conf_thres'] +iou_thres = model_configs['model_args']['iou_thres'] +device = model_configs['model_args']['device'] +dpi = model_configs['model_args']['pdf_dpi'] + +inner_batch_size = 16 +mfd_model = get_batch_YOLO_model(model_configs,inner_batch_size,use_tensorRT=False) +mfd_model.export(format="engine",half=True,imgsz=(1888,1472), batch=inner_batch_size, simplify=True) # creates 'yolov8n.engine' +oldname = model_configs['model_args']['mfd_weight'] +os.rename( + oldname[:-3]+f'.engine', + oldname[:-3]+f'.b{inner_batch_size}.engine' + +) \ No newline at end of file diff --git a/convension/detectron2/convert_to_tensorRT.py b/convension/detectron2/convert_to_tensorRT.py new file mode 100644 index 0000000..22e7fff --- /dev/null +++ b/convension/detectron2/convert_to_tensorRT.py @@ -0,0 +1,21 @@ +import tensorrt as trt +onnx_model_path = "model.sim.onnx" +engine_file_path="model.egine.fp16.trt" +# Load the ONNX model +with open(onnx_model_path, "rb") as f: + onnx_model = f.read() + +TRT_LOGGER = trt.Logger(trt.Logger.WARNING) +builder = trt.Builder(TRT_LOGGER) +network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + +parser = trt.OnnxParser(network, TRT_LOGGER) +parser.parse(onnx_model) + +config = 
builder.create_builder_config() +config.set_flag(trt.BuilderFlag.FP16) +#config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20) # 1 MiB +engine_bytes = builder.build_serialized_network(network, config) + +with open(engine_file_path, "wb") as f: + f.write(engine_bytes) \ No newline at end of file diff --git a/convension/detectron2/create_backbone_tensorRT.py b/convension/detectron2/create_backbone_tensorRT.py new file mode 100644 index 0000000..c1c61ce --- /dev/null +++ b/convension/detectron2/create_backbone_tensorRT.py @@ -0,0 +1,47 @@ +import torch +from rough_layout import * +with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + + +img_size = model_configs['model_args']['img_size'] +conf_thres= model_configs['model_args']['conf_thres'] +iou_thres = model_configs['model_args']['iou_thres'] +device = model_configs['model_args']['device'] +dpi = model_configs['model_args']['pdf_dpi'] +layout_model = get_layout_model(model_configs) + +batched_inputs = torch.load("convension/detectron2/batched_inputs.pt") + +import torch_tensorrt + +model = layout_model.predictor.model.eval() +x = batched_inputs[0]['image'][None] +self = layout_model.predictor.model +with torch.no_grad(): + images = self.preprocess_image(batched_inputs) + _input = self.get_batch(batched_inputs, images) + +class ModelWrapper(torch.nn.Module): + def __init__(self, model): + super(ModelWrapper, self).__init__() + self.model = model + + def forward(self, images): + outputs = self.model({'images':images}) + return ( + outputs['p2'], + outputs['p3'], + outputs['p4'], + outputs['p5'], + outputs['p6'], + ) + +# Wrap the model +wrapped_model = ModelWrapper(layout_model.predictor.model.backbone) +inputs =_input['images'][:1] +trt_gm = torch_tensorrt.compile(wrapped_model, ir="dynamo", inputs=inputs) +import os +os.makedirs("models/layout/",exist_ok=True) +torch_tensorrt.save(trt_gm, "models/layout/trt.ep", inputs=inputs) # PyTorch only supports Python runtime for an ExportedProgram. For C++ deployment, use a TorchScript file +torch_tensorrt.save(trt_gm, "models/layout/trt.ts", output_format="torchscript", inputs=inputs) \ No newline at end of file diff --git a/convension/detectron2/create_onnx.py b/convension/detectron2/create_onnx.py new file mode 100644 index 0000000..1370bbf --- /dev/null +++ b/convension/detectron2/create_onnx.py @@ -0,0 +1,550 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import re +import sys +import argparse +import logging +import cv2 +import onnx_graphsurgeon as gs +import numpy as np +import onnx +from onnx import shape_inference +import torch + +try: + from detectron2.engine.defaults import DefaultPredictor + from detectron2.modeling import build_model + from detectron2.config import get_cfg + from detectron2.structures import ImageList +except ImportError: + print("Could not import Detectron 2 modules. Maybe you did not install Detectron 2") + print("Please install Detectron 2, check https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md") + sys.exit(1) + +import onnx_utils + +logging.basicConfig(level=logging.INFO) +logging.getLogger("ModelHelper").setLevel(logging.INFO) +log = logging.getLogger("ModelHelper") + +from modules.layoutlmv3.model_init import * +class DET2GraphSurgeon: + def __init__(self, saved_model_path, config_file, weights): + """ + Constructor of the Model Graph Surgeon object, to do the conversion of a Detectron 2 Mask R-CNN exported model + to an ONNX-TensorRT parsable model. + :param saved_model_path: The path pointing to the exported Detectron 2 Mask R-CNN ONNX model. + :param config_file: The path pointing to the Detectron 2 yaml file which describes the model. + :param config_file: Weights to load for the Detectron 2 model. + """ + + # Import exported Detectron 2 Mask R-CNN ONNX model as GraphSurgeon object. + self.graph = gs.import_onnx(onnx.load(saved_model_path)) + assert self.graph + log.info("ONNX graph loaded successfully") + + # Fold constants via ONNX-GS that exported script might've missed. + self.graph.fold_constants() + + layout_args = { + "config_file": "modules/layoutlmv3/layoutlmv3_base_inference.yaml", + "resume": False, + "eval_only": False, + "num_gpus": 1, + "num_machines": 1, + "machine_rank": 0, + "dist_url": "tcp://127.0.0.1:57823", + "opts": ["MODEL.WEIGHTS", weights], + } + layout_args = DotDict(layout_args) + cfg = setup(layout_args) + self.det2_cfg = cfg + + # Getting model characteristics. + self.fpn_out_channels = self.det2_cfg.MODEL.FPN.OUT_CHANNELS + self.num_classes = self.det2_cfg.MODEL.ROI_HEADS.NUM_CLASSES + self.first_NMS_max_proposals = self.det2_cfg.MODEL.RPN.POST_NMS_TOPK_TEST + self.first_NMS_iou_threshold = self.det2_cfg.MODEL.RPN.NMS_THRESH + self.first_NMS_score_threshold = 0.01 + self.first_ROIAlign_pooled_size = self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + self.first_ROIAlign_sampling_ratio = self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + self.first_ROIAlign_type = self.det2_cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE + self.second_NMS_max_proposals = self.det2_cfg.TEST.DETECTIONS_PER_IMAGE + self.second_NMS_iou_threshold = self.det2_cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST + self.second_NMS_score_threshold = self.det2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST + self.second_ROIAlign_pooled_size = self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION + self.second_ROIAlign_sampling_ratio = self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO + self.second_ROIAlign_type = self.det2_cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE + self.mask_out_res = 28 + + # Model characteristics. 
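+    # The characteristics gathered above come mostly from the Detectron 2 config (a few, such as the first NMS
+    # score threshold and the 28x28 mask resolution, are hard-coded defaults); they are logged below so the
+    # NMS/ROIAlign plugin parameters used during graph surgery can be checked against the source model.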
+ log.info("Number of FPN output channels is {}".format(self.fpn_out_channels)) + log.info("Number of classes is {}".format(self.num_classes)) + log.info("First NMS max proposals is {}".format(self.first_NMS_max_proposals)) + log.info("First NMS iou threshold is {}".format(self.first_NMS_iou_threshold)) + log.info("First NMS score threshold is {}".format(self.first_NMS_score_threshold)) + log.info("First ROIAlign type is {}".format(self.first_ROIAlign_type)) + log.info("First ROIAlign pooled size is {}".format(self.first_ROIAlign_pooled_size)) + log.info("First ROIAlign sampling ratio is {}".format(self.first_ROIAlign_sampling_ratio)) + log.info("Second NMS max proposals is {}".format(self.second_NMS_max_proposals)) + log.info("Second NMS iou threshold is {}".format(self.second_NMS_iou_threshold)) + log.info("Second NMS score threshold is {}".format(self.second_NMS_score_threshold)) + log.info("Second ROIAlign type is {}".format(self.second_ROIAlign_type)) + log.info("Second ROIAlign pooled size is {}".format(self.second_ROIAlign_pooled_size)) + log.info("Second ROIAlign sampling ratio is {}".format(self.second_ROIAlign_sampling_ratio)) + log.info("Individual mask output resolution is {}x{}".format(self.mask_out_res, self.mask_out_res)) + + self.batch_size = None + + def sanitize(self): + """ + Sanitize the graph by cleaning any unconnected nodes, do a topological resort, and fold constant inputs values. + When possible, run shape inference on the ONNX graph to determine tensor shapes. + """ + + for i in range(3): + count_before = len(self.graph.nodes) + self.graph.cleanup().toposort() + try: + for node in self.graph.nodes: + for o in node.outputs: + o.shape = None + model = gs.export_onnx(self.graph) + model = shape_inference.infer_shapes(model) + self.graph = gs.import_onnx(model) + except Exception as e: + log.info("Shape inference could not be performed at this time:\n{}".format(e)) + try: + self.graph.fold_constants(fold_shapes=True) + except TypeError as e: + log.error("This version of ONNX GraphSurgeon does not support folding shapes, please upgrade your " + "onnx_graphsurgeon module. Error:\n{}".format(e)) + raise + + count_after = len(self.graph.nodes) + if count_before == count_after: + # No new folding occurred in this iteration, so we can stop for now. + break + + def get_anchors(self, sample_image): + """ + Detectron 2 exported ONNX does not contain anchors required for efficientNMS plug-in, so they must be generated + "offline" by calling actual Detectron 2 model and getting anchors from it. + :param sample_image: Sample image required to run through the model and obtain anchors. + Can be any image from a dataset. Make sure listed here Detectron 2 preprocessing steps + actually match your preprocessing steps. Otherwise, behavior can be unpredictable. + Additionally, anchors have to be generated for a fixed input dimensions, + meaning as soon as image leaves a preprocessor and enters predictor.model.backbone() it must have + a fixed dimension (1344x1344 in my case) that every single image in dataset must follow, since currently + TensorRT plug-ins do not support dynamic shapes. + """ + # Get Detectron 2 model config and build it. + predictor = DefaultPredictor(self.det2_cfg) + model = build_model(self.det2_cfg) + + # Image preprocessing. 
+ input_im = cv2.imread(sample_image) + raw_height, raw_width = input_im.shape[:2] + image = predictor.aug.get_transform(input_im).apply_image(input_im) + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + + # Model preprocessing. + inputs = [{"image": image, "height": raw_height, "width": raw_width}] + images = [x["image"].to(model.device) for x in inputs] + images = [(x - model.pixel_mean) / model.pixel_std for x in images] + imagelist_images = ImageList.from_tensors(images, 1344) + + # Get feature maps from backbone. + features = predictor.model.backbone(imagelist_images.tensor) + + # Get proposals from Region Proposal Network and obtain anchors from anchor generator. + features = [features[f] for f in predictor.model.proposal_generator.in_features] + det2_anchors = predictor.model.proposal_generator.anchor_generator(features) + + # Extract anchors based on feature maps in ascending order (P2->P6). + p2_anchors = det2_anchors[0].tensor.detach().cpu().numpy() + p3_anchors = det2_anchors[1].tensor.detach().cpu().numpy() + p4_anchors = det2_anchors[2].tensor.detach().cpu().numpy() + p5_anchors = det2_anchors[3].tensor.detach().cpu().numpy() + p6_anchors = det2_anchors[4].tensor.detach().cpu().numpy() + final_anchors = np.concatenate((p2_anchors,p3_anchors,p4_anchors,p5_anchors,p6_anchors)) + + return final_anchors + + def save(self, output_path): + """ + Save the ONNX model to the given location. + :param output_path: Path pointing to the location where to write out the updated ONNX model. + """ + self.graph.cleanup().toposort() + model = gs.export_onnx(self.graph) + output_path = os.path.realpath(output_path) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + onnx.save(model, output_path) + log.info("Saved ONNX model to {}".format(output_path)) + + def update_preprocessor(self, batch_size): + """ + Remove all the pre-processing nodes in the ONNX graph and leave only the image normalization essentials. + :param batch_size: The batch size to use for the ONNX graph. + """ + # Set graph inputs. + self.batch_size = batch_size + self.height = self.graph.inputs[0].shape[1] + self.width = self.graph.inputs[0].shape[2] + + input_shape = [self.batch_size, 3, self.height, self.width] + self.graph.inputs[0].shape = input_shape + self.graph.inputs[0].dtype = np.float32 + self.graph.inputs[0].name = "input_tensor" + + self.sanitize() + log.info("ONNX graph input shape: {} [NCHW format set]".format(self.graph.inputs[0].shape)) + + # Find the initial nodes of the graph, whatever the input is first connected to, and disconnect them. + for node in [node for node in self.graph.nodes if self.graph.inputs[0] in node.inputs]: + node.inputs.clear() + + # Get input tensor. + input_tensor = self.graph.inputs[0] + + # Create preprocessing Sub node and connect input tensor to it. + sub_const = np.expand_dims(np.asarray([255 * 0.406, 255 * 0.456, 255 * 0.485], dtype=np.float32), axis=(1, 2)) + sub_out = self.graph.op_with_const("Sub", "preprocessor/mean", input_tensor, sub_const) + + # Find first Div node and connect to output of Sub node. + div_node = self.graph.find_node_by_op("Div") + log.info("Found {} node".format(div_node.op)) + div_node.inputs[0] = sub_out[0] + + # Find first Conv and connect preprocessor directly to it. + conv_node = self.graph.find_node_by_op("Conv") + log.info("Found {} node".format(conv_node.op)) + conv_node.inputs[0] = div_node.outputs[0] + + # Reshape nodes tend to update the batch dimension to a fixed value of 1, they should use the batch size instead. 
+ for node in [node for node in self.graph.nodes if node.op == "Reshape"]: + if type(node.inputs[1]) == gs.Constant and node.inputs[1].values[0] == 1: + node.inputs[1].values[0] = self.batch_size + + def NMS(self, boxes, scores, anchors, background_class, score_activation, max_proposals, iou_threshold, nms_score_threshold, user_threshold, nms_name=None): + # Helper function to create the NMS Plugin node with the selected inputs. + # EfficientNMS_TRT TensorRT Plugin is suitable for our use case. + # :param boxes: The box predictions from the Box Net. + # :param scores: The class predictions from the Class Net. + # :param anchors: The default anchor coordinates. + # :param background_class: The label ID for the background class. + # :param max_proposals: Number of proposals made by NMS. + # :param score_activation: If set to True - apply sigmoid activation to the confidence scores during NMS operation, + # if false - no activation. + # :param iou_threshold: NMS intersection over union threshold, given by self.det2_cfg. + # :param nms_score_threshold: NMS score threshold, given by self.det2_cfg. + # :param user_threshold: User's given threshold to overwrite default NMS score threshold. + # :param nms_name: Name of NMS node in a graph, renames NMS elements accordingly in order to eliminate cycles. + + if nms_name is None: + nms_name = "" + else: + nms_name = "_" + nms_name + + # Set score threshold. + score_threshold = nms_score_threshold if user_threshold is None else user_threshold + + # NMS Outputs. + nms_output_num_detections = gs.Variable(name="num_detections"+nms_name, dtype=np.int32, shape=[self.batch_size, 1]) + nms_output_boxes = gs.Variable(name="detection_boxes"+nms_name, dtype=np.float32, + shape=[self.batch_size, max_proposals, 4]) + nms_output_scores = gs.Variable(name="detection_scores"+nms_name, dtype=np.float32, + shape=[self.batch_size, max_proposals]) + nms_output_classes = gs.Variable(name="detection_classes"+nms_name, dtype=np.int32, + shape=[self.batch_size, max_proposals]) + + nms_outputs = [nms_output_num_detections, nms_output_boxes, nms_output_scores, nms_output_classes] + + # Plugin. + self.graph.plugin( + op="EfficientNMS_TRT", + name="nms"+nms_name, + inputs=[boxes, scores, anchors], + outputs=nms_outputs, + attrs={ + 'plugin_version': "1", + 'background_class': background_class, + 'max_output_boxes': max_proposals, + 'score_threshold': max(0.01, score_threshold), + 'iou_threshold': iou_threshold, + 'score_activation': score_activation, + 'class_agnostic': False, + 'box_coding': 1, + } + ) + log.info("Created nms{} with EfficientNMS_TRT plugin".format(nms_name)) + + return nms_outputs + + def ROIAlign(self, rois, p2, p3, p4, p5, pooled_size, sampling_ratio, roi_align_type, num_rois, ra_name): + # Helper function to create the ROIAlign Plugin node with the selected inputs. + # PyramidROIAlign_TRT TensorRT Plugin is suitable for our use case. + # :param rois: Regions of interest/detection boxes outputs from preceding NMS node. + # :param p2: Output of p2 feature map. + # :param p3: Output of p3 feature map. + # :param p4: Output of p4 feature map. + # :param p5: Output of p5 feature map. + # :param pooled_size: Pooled output dimensions. + # :param sampling_ratio: Number of sampling points in the interpolation grid used to compute the output value of each pooled output bin. + # :param roi_align_type: Type of Detectron 2 ROIAlign op, either ROIAlign (vanilla) or ROIAlignV2 (0.5 coordinate offset). + # :param num_rois: Number of ROIs resulting from ROIAlign operation. 
+ # :param ra_name: Name of ROIAlign node in a graph, renames ROIAlign elements accordingly in order to eliminate cycles. + + # Different types of Detectron 2's ROIAlign ops require coordinate offset that is supported by PyramidROIAlign_TRT. + if roi_align_type == "ROIAlignV2": + roi_coords_transform = 2 + elif roi_align_type == "ROIAlign": + roi_coords_transform = 0 + + # ROIAlign outputs. + roi_align_output = gs.Variable(name="roi_align/output_"+ra_name, dtype=np.float32, + shape=[self.batch_size, num_rois, self.fpn_out_channels, pooled_size, pooled_size]) + + # Plugin. + self.graph.plugin( + op="PyramidROIAlign_TRT", + name="roi_align_"+ra_name, + inputs=[rois, p2, p3, p4, p5], + outputs=[roi_align_output], + attrs={ + 'plugin_version': "1", + 'fpn_scale': 224, + 'pooled_size': pooled_size, + 'image_size': [self.height, self.width], + 'roi_coords_absolute': 0, + 'roi_coords_swap': 0, + 'roi_coords_transform': roi_coords_transform, + 'sampling_ratio': sampling_ratio, + } + ) + log.info("Created {} with PyramidROIAlign_TRT plugin".format(ra_name)) + + return roi_align_output + + def process_graph(self, anchors, first_nms_threshold=None, second_nms_threshold=None): + """ + Processes the graph to replace the GenerateProposals and BoxWithNMSLimit operations with EfficientNMS_TRT + TensorRT plugin nodes and ROIAlign operations with PyramidROIAlign_TRT plugin nodes. + :param anchors: Anchors generated from sample image "offline" by Detectron 2, since anchors are not provided + inside the graph. + :param first_nms_threshold: Override the 1st NMS score threshold value. If set to None, use the value in the graph. + :param second_nms_threshold: Override the 2nd NMS score threshold value. If set to None, use the value in the graph. + """ + def backbone(): + """ + Updates the graph to replace all ResizeNearest ops with ResizeNearest plugins in backbone. + """ + # Get final backbone outputs. + p2 = self.graph.find_node_by_op_name("Conv", "/backbone/fpn_output2/Conv") + p3 = self.graph.find_node_by_op_name("Conv", "/backbone/fpn_output3/Conv") + p4 = self.graph.find_node_by_op_name("Conv", "/backbone/fpn_output4/Conv") + p5 = self.graph.find_node_by_op_name("Conv", "/backbone/fpn_output5/Conv") + + + return p2.outputs[0], p3.outputs[0], p4.outputs[0], p5.outputs[0] + + def proposal_generator(anchors, first_nms_threshold): + """ + Updates the graph to replace all GenerateProposals Caffe ops with one single NMS for proposals generation. + :param anchors: Anchors generated from sample image "offline" by Detectron 2, since anchors are not provided + inside the graph + :param first_nms_threshold: Override the 1st NMS score threshold value. If set to None, use the value in the graph. + """ + # Get nodes containing final objectness logits. + p2_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten") + p3_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_1") + p4_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_2") + p5_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_3") + p6_logits = self.graph.find_node_by_op_name("Flatten", "/proposal_generator/Flatten_4") + + # Get nodes containing final anchor_deltas. 
+ p2_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_1") + p3_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_3") + p4_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_5") + p5_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_7") + p6_anchors = self.graph.find_node_by_op_name("Reshape", "/proposal_generator/Reshape_9") + + # Concatenate all objectness logits/scores data. + scores_inputs = [p2_logits.outputs[0], p3_logits.outputs[0], p4_logits.outputs[0], p5_logits.outputs[0], p6_logits.outputs[0]] + scores_tensor = self.graph.layer(name="scores", op="Concat", inputs=scores_inputs, outputs=['scores'], attrs={'axis': 1})[0] + # Unsqueeze to add 3rd dimension of 1 to match tensor dimensions of boxes tensor. + scores = self.graph.unsqueeze("scores_unsqueeze", scores_tensor, [2])[0] + + # Concatenate all boxes/anchor_delta data. + boxes_inputs = [p2_anchors.outputs[0], p3_anchors.outputs[0], p4_anchors.outputs[0], p5_anchors.outputs[0], p6_anchors.outputs[0]] + boxes = self.graph.layer(name="boxes", op="Concat", inputs=boxes_inputs, outputs=['anchors'], attrs={'axis': 1})[0] + + # Convert the anchors from Corners to CenterSize encoding. + anchors = np.matmul(anchors, [[0.5, 0, -1, 0], [0, 0.5, 0, -1], [0.5, 0, 1, 0], [0, 0.5, 0, 1]]) + anchors = anchors / [self.width, self.height, self.width, self.height] # Normalize anchors to [0-1] range + anchors = np.expand_dims(anchors, axis=0) + anchors = anchors.astype(np.float32) + anchors = gs.Constant(name="default_anchors", values=anchors) + + # Create NMS node. + nms_outputs = self.NMS(boxes, scores, anchors, -1, False, self.first_NMS_max_proposals, self.first_NMS_iou_threshold, self.first_NMS_score_threshold, first_nms_threshold, 'rpn') + + return nms_outputs + + def roi_heads(rpn_outputs, p2, p3, p4, p5, second_nms_threshold): + """ + Updates the graph to replace all ROIAlign Caffe ops with one single pyramid ROIAlign. Eliminates CollectRpnProposals + DistributeFpnProposals and BatchPermutation nodes that are not supported by TensorRT. Connects pyramid ROIAlign to box_head + and connects box_head to final box head outputs in a form of second NMS. In order to implement mask head outputs, + similar steps as in box_pooler are performed to replace mask_pooler. Finally, reimplemented mask_pooler is connected to + mask_head and mask head outputs are produced. + :param rpn_outputs: Outputs of the first NMS/proposal generator. + :param p2: Output of p2 feature map, required for ROIAlign operation. + :param p3: Output of p3 feature map, required for ROIAlign operation. + :param p4: Output of p4 feature map, required for ROIAlign operation. + :param p5: Output of p5 feature map, required for ROIAlign operation. + :param second_nms_threshold: Override the 2nd NMS score threshold value. If set to None, use the value in the graph. + """ + # Create ROIAlign node. + box_pooler_output = self.ROIAlign(rpn_outputs[1], p2, p3, p4, p5, self.first_ROIAlign_pooled_size, self.first_ROIAlign_sampling_ratio, self.first_ROIAlign_type, self.first_NMS_max_proposals, 'box_pooler') + + # Reshape node that prepares ROIAlign/box pooler output for Gemm node that comes next. 
+ box_pooler_shape = np.asarray([-1, self.fpn_out_channels*self.first_ROIAlign_pooled_size*self.first_ROIAlign_pooled_size], dtype=np.int64) + box_pooler_reshape = self.graph.op_with_const("Reshape", "box_pooler/reshape", box_pooler_output, box_pooler_shape) + + # Get first Gemm op of box head and connect box pooler to it. + first_box_head_gemm = self.graph.find_node_by_op_name("Gemm", "/roi_heads/box_head.0/fc1/Gemm") + first_box_head_gemm.inputs[0] = box_pooler_reshape[0] + + # Get final two nodes of box predictor. Softmax op for cls_score, Gemm op for bbox_pred. + cls_score = self.graph.find_node_by_op_name("Softmax", "/roi_heads/Softmax") + bbox_pred = self.graph.find_node_by_op_name("Gemm", "/roi_heads/box_predictor.0/bbox_pred/Gemm") + + # Linear transformation to convert box coordinates from (TopLeft, BottomRight) Corner encoding + # to CenterSize encoding. 1st NMS boxes are multiplied by transformation matrix in order to + # encode it into CenterSize format. + matmul_const = np.matrix('0.5 0 -1 0; 0 0.5 0 -1; 0.5 0 1 0; 0 0.5 0 1', dtype=np.float32) + matmul_out = self.graph.matmul("RPN_NMS/detection_boxes_conversion", rpn_outputs[1], matmul_const) + + # Reshape node that prepares bbox_pred for scaling and second NMS. + bbox_pred_shape = np.asarray([self.batch_size, self.first_NMS_max_proposals, self.num_classes, 4], dtype=np.int64) + bbox_pred_reshape = self.graph.op_with_const("Reshape", "bbox_pred/reshape", bbox_pred.outputs[0], bbox_pred_shape) + + # 0.1, 0.1, 0.2, 0.2 are localization head variance numbers, they scale bbox_pred_reshape, in order to get accurate coordinates. + scale_adj = np.expand_dims(np.asarray([0.1, 0.1, 0.2, 0.2], dtype=np.float32), axis=(0, 1)) + final_bbox_pred = self.graph.op_with_const("Mul", "bbox_pred/scale", bbox_pred_reshape[0], scale_adj) + + # Reshape node that prepares cls_score for slicing and second NMS. + cls_score_shape = np.array([self.batch_size, self.first_NMS_max_proposals, self.num_classes+1], dtype=np.int64) + cls_score_reshape = self.graph.op_with_const("Reshape", "cls_score/reshape", cls_score.outputs[0], cls_score_shape) + + # Slice operation to adjust third dimension of cls_score tensor, deletion of background class (81 in Detectron 2). + final_cls_score = self.graph.slice("cls_score/slicer", cls_score_reshape[0], 0, self.num_classes, 2) + + # Create NMS node. + nms_outputs = self.NMS(final_bbox_pred[0], final_cls_score[0], matmul_out[0], -1, False, self.second_NMS_max_proposals, self.second_NMS_iou_threshold, self.second_NMS_score_threshold, second_nms_threshold, 'box_outputs') + + # Create ROIAlign node. + mask_pooler_output = self.ROIAlign(nms_outputs[1], p2, p3, p4, p5, self.second_ROIAlign_pooled_size, self.second_ROIAlign_sampling_ratio, self.second_ROIAlign_type, self.second_NMS_max_proposals, 'mask_pooler') + + # Reshape mask pooler output. + mask_pooler_shape = np.asarray([self.second_NMS_max_proposals*self.batch_size, self.fpn_out_channels, self.second_ROIAlign_pooled_size, self.second_ROIAlign_pooled_size], dtype=np.int64) + mask_pooler_reshape_node = self.graph.op_with_const("Reshape", "mask_pooler/reshape", mask_pooler_output, mask_pooler_shape) + + # Get first Conv op in mask head and connect ROIAlign's squeezed output to it. + mask_head_conv = self.graph.find_node_by_op_name("Conv", "/roi_heads/mask_head/mask_fcn1/Conv") + mask_head_conv.inputs[0] = mask_pooler_reshape_node[0] + + # Reshape node that is preparing 2nd NMS class outputs for Add node that comes next. 
+ classes_reshape_shape = np.asarray([self.second_NMS_max_proposals*self.batch_size], dtype=np.int64) + classes_reshape_node = self.graph.op_with_const("Reshape", "box_outputs/reshape_classes", nms_outputs[3], classes_reshape_shape) + + # This loop will generate an array used in Add node, which eventually will help Gather node to pick the single + # class of interest per bounding box, instead of creating 80 masks for every single bounding box. + add_array = [] + for i in range(self.second_NMS_max_proposals*self.batch_size): + if i == 0: + start_pos = 0 + else: + start_pos = i * self.num_classes + add_array.append(start_pos) + + # This Add node is one of the Gather node inputs, Gather node performs gather on 0th axis of data tensor + # and requires indices that set tensors to be withing bounds, this Add node provides the bounds for Gather. + add_array = np.asarray(add_array, dtype=np.int32) + classes_add_node = self.graph.op_with_const("Add", "box_outputs/add", classes_reshape_node[0], add_array) + + # Get the last Conv op in mask head and reshape it to correctly gather class of interest's masks. + last_conv = self.graph.find_node_by_op_name("Conv", "/roi_heads/mask_head/predictor/Conv") + last_conv_reshape_shape = np.asarray([self.second_NMS_max_proposals*self.num_classes*self.batch_size, self.mask_out_res, self.mask_out_res], dtype=np.int64) + last_conv_reshape_node = self.graph.op_with_const("Reshape", "mask_head/reshape_all_masks", last_conv.outputs[0], last_conv_reshape_shape) + + # Gather node that selects only masks belonging to detected class, 79 other masks are discarded. + final_gather = self.graph.gather("mask_head/final_gather", last_conv_reshape_node[0], classes_add_node[0], 0) + + # Get last Sigmoid node and connect Gather node to it. + mask_head_sigmoid = self.graph.find_node_by_op_name("Sigmoid", "/roi_heads/mask_head/Sigmoid") + mask_head_sigmoid.inputs[0] = final_gather[0] + + # Final Reshape node, reshapes output of Sigmoid, important for various batch_size support (not tested yet). + final_graph_reshape_shape = np.asarray([self.batch_size, self.second_NMS_max_proposals, self.mask_out_res, self.mask_out_res], dtype=np.int64) + final_graph_reshape_node = self.graph.op_with_const("Reshape", "mask_head/final_reshape", mask_head_sigmoid.outputs[0], final_graph_reshape_shape) + final_graph_reshape_node[0].dtype = np.float32 + final_graph_reshape_node[0].name = "detection_masks" + + return nms_outputs, final_graph_reshape_node[0] + + # Only Detectron 2's Mask-RCNN R50-FPN 3x is supported currently. + p2, p3, p4, p5 = backbone() + rpn_outputs = proposal_generator(anchors, first_nms_threshold) + box_head_outputs, mask_head_output = roi_heads(rpn_outputs, p2, p3, p4, p5, second_nms_threshold) + print(box_head_outputs) + # Append segmentation head output. + box_head_outputs.append(mask_head_output) + # Set graph outputs, both bbox and segmentation heads. 
+ self.graph.outputs = box_head_outputs + self.sanitize() + + +def main(args): + det2_gs = DET2GraphSurgeon(args.exported_onnx, args.det2_config, args.det2_weights) + det2_gs.update_preprocessor(args.batch_size) + anchors = np.load("final_anchors.npy") + #anchors = det2_gs.get_anchors(args.sample_image) + det2_gs.process_graph(anchors, args.first_nms_threshold, args.second_nms_threshold) + det2_gs.save(args.onnx) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--exported_onnx", help="The exported to ONNX Detectron 2 Mask R-CNN", type=str) + parser.add_argument("-o", "--onnx", help="The output ONNX model file to write", type=str) + parser.add_argument("-c", "--det2_config", help="The Detectron 2 config file (.yaml) for the model", type=str) + parser.add_argument("-w", "--det2_weights", help="The Detectron 2 model weights (.pkl)", type=str) + parser.add_argument("-s", "--sample_image", help="Sample image for anchors generation", type=str) + parser.add_argument("-b", "--batch_size", help="Batch size for the model", type=int, default=1) + parser.add_argument("-t1", "--first_nms_threshold", help="Override the score threshold for the 1st NMS operation", type=float) + parser.add_argument("-t2", "--second_nms_threshold", help="Override the score threshold for the 2nd NMS operation", type=float) + args = parser.parse_args() + if not all([args.exported_onnx, args.onnx, args.det2_config, args.det2_weights, args.sample_image]): + parser.print_help() + print("\nThese arguments are required: --exported_onnx --onnx --det2_config --det2_weights and --sample_image") + sys.exit(1) + main(args) diff --git a/convension/detectron2/export_model.py b/convension/detectron2/export_model.py new file mode 100644 index 0000000..8b700e4 --- /dev/null +++ b/convension/detectron2/export_model.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. 
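+# Detectron 2's export_model deployment script, adapted here to export the layoutlmv3 layout model
+# (its config file and weights are hard-coded in main() below) to caffe2 / onnx / torchscript.
+# Illustrative invocation only; the output directory is an example path, not one used elsewhere in this repo:
+#   python convension/detectron2/export_model.py --format onnx --export-method tracing \
+#       --sample-image convension/detectron2/sample_1042x800.png --output output/layout_onnx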
+import argparse +import os +from typing import Dict, List, Tuple +import torch +from torch import Tensor, nn + +import detectron2.data.transforms as T +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import build_detection_test_loader, detection_utils +from detectron2.evaluation import COCOEvaluator, inference_on_dataset, print_csv_format +from detectron2.export import ( + STABLE_ONNX_OPSET_VERSION, + TracingAdapter, + dump_torchscript_IR, + scripting_with_instances, +) +from detectron2.modeling import GeneralizedRCNN, RetinaNet, build_model +from detectron2.modeling.postprocessing import detector_postprocess +from detectron2.projects.point_rend import add_pointrend_config +from detectron2.structures import Boxes +from detectron2.utils.env import TORCH_VERSION +from detectron2.utils.file_io import PathManager +from detectron2.utils.logger import setup_logger + + +def setup_cfg(args): + cfg = get_cfg() + # cuda context is initialized before creating dataloader, so we don't fork anymore + cfg.DATALOADER.NUM_WORKERS = 0 + add_pointrend_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + return cfg + + +def export_caffe2_tracing(cfg, torch_model, inputs): + from detectron2.export import Caffe2Tracer + + tracer = Caffe2Tracer(cfg, torch_model, inputs) + if args.format == "caffe2": + caffe2_model = tracer.export_caffe2() + caffe2_model.save_protobuf(args.output) + # draw the caffe2 graph + caffe2_model.save_graph(os.path.join(args.output, "model.svg"), inputs=inputs) + return caffe2_model + elif args.format == "onnx": + import onnx + + onnx_model = tracer.export_onnx() + onnx.save(onnx_model, os.path.join(args.output, "model.onnx")) + elif args.format == "torchscript": + ts_model = tracer.export_torchscript() + with PathManager.open(os.path.join(args.output, "model.ts"), "wb") as f: + torch.jit.save(ts_model, f) + dump_torchscript_IR(ts_model, args.output) + + +# experimental. API not yet final +def export_scripting(torch_model): + assert TORCH_VERSION >= (1, 8) + fields = { + "proposal_boxes": Boxes, + "objectness_logits": Tensor, + "pred_boxes": Boxes, + "scores": Tensor, + "pred_classes": Tensor, + "pred_masks": Tensor, + "pred_keypoints": torch.Tensor, + "pred_keypoint_heatmaps": torch.Tensor, + } + assert args.format == "torchscript", "Scripting only supports torchscript format." + + class ScriptableAdapterBase(nn.Module): + # Use this adapter to workaround https://github.com/pytorch/pytorch/issues/46944 + # by not retuning instances but dicts. 
Otherwise the exported model is not deployable + def __init__(self): + super().__init__() + self.model = torch_model + self.eval() + + if isinstance(torch_model, GeneralizedRCNN): + + class ScriptableAdapter(ScriptableAdapterBase): + def forward(self, inputs: Tuple[Dict[str, torch.Tensor]]) -> List[Dict[str, Tensor]]: + instances = self.model.inference(inputs, do_postprocess=False) + return [i.get_fields() for i in instances] + + else: + + class ScriptableAdapter(ScriptableAdapterBase): + def forward(self, inputs: Tuple[Dict[str, torch.Tensor]]) -> List[Dict[str, Tensor]]: + instances = self.model(inputs) + return [i.get_fields() for i in instances] + + ts_model = scripting_with_instances(ScriptableAdapter(), fields) + with PathManager.open(os.path.join(args.output, "model.ts"), "wb") as f: + torch.jit.save(ts_model, f) + dump_torchscript_IR(ts_model, args.output) + # TODO inference in Python now missing postprocessing glue code + return None + + +# experimental. API not yet final +def export_tracing(torch_model, inputs): + assert TORCH_VERSION >= (1, 8) + image = inputs[0]["image"] + inputs = [{"image": image}] # remove other unused keys + + if isinstance(torch_model, GeneralizedRCNN): + + def inference(model, inputs): + # use do_postprocess=False so it returns ROI mask + inst = model.inference(inputs, do_postprocess=False)[0] + return [{"instances": inst}] + + else: + inference = None # assume that we just call the model directly + + traceable_model = TracingAdapter(torch_model, inputs, inference) + + if args.format == "torchscript": + ts_model = torch.jit.trace(traceable_model, (image,)) + with PathManager.open(os.path.join(args.output, "model.ts"), "wb") as f: + torch.jit.save(ts_model, f) + dump_torchscript_IR(ts_model, args.output) + elif args.format == "onnx": + with PathManager.open(os.path.join(args.output, "model.onnx"), "wb") as f: + torch.onnx.export(traceable_model, (image,), f, opset_version=STABLE_ONNX_OPSET_VERSION) + logger.info("Inputs schema: " + str(traceable_model.inputs_schema)) + logger.info("Outputs schema: " + str(traceable_model.outputs_schema)) + + if args.format != "torchscript": + return None + if not isinstance(torch_model, (GeneralizedRCNN, RetinaNet)): + return None + + def eval_wrapper(inputs): + """ + The exported model does not contain the final resize step, which is typically + unused in deployment but needed for evaluation. We add it manually here. 
+ """ + input = inputs[0] + instances = traceable_model.outputs_schema(ts_model(input["image"]))[0]["instances"] + postprocessed = detector_postprocess(instances, input["height"], input["width"]) + return [{"instances": postprocessed}] + + return eval_wrapper + + +def get_sample_inputs(args): + + if args.sample_image is None: + # get a first batch from dataset + data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) + first_batch = next(iter(data_loader)) + return first_batch + else: + # get a sample data + original_image = detection_utils.read_image(args.sample_image, format=cfg.INPUT.FORMAT) + # Do same preprocessing as DefaultPredictor + aug = T.ResizeShortestEdge( + [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST + ) + height, width = original_image.shape[:2] + image = aug.get_transform(original_image).apply_image(original_image) + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + + inputs = {"image": image, "height": height, "width": width} + + # Sample ready + sample_inputs = [inputs] + return sample_inputs + +from modules.layoutlmv3.model_init import * +def main() -> None: + global logger, cfg, args + parser = argparse.ArgumentParser(description="Export a model for deployment.") + parser.add_argument( + "--format", + choices=["caffe2", "onnx", "torchscript"], + help="output format", + default="torchscript", + ) + parser.add_argument( + "--export-method", + choices=["caffe2_tracing", "tracing", "scripting"], + help="Method to export models", + default="tracing", + ) + parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") + parser.add_argument("--sample-image", default=None, type=str, help="sample image for input") + parser.add_argument("--run-eval", action="store_true") + parser.add_argument("--output", help="output directory for the converted model") + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + args = parser.parse_args() + logger = setup_logger() + logger.info("Command line arguments: " + str(args)) + PathManager.mkdirs(args.output) + # Disable re-specialization on new shapes. Otherwise --run-eval will be slow + torch._C._jit_set_bailout_depth(1) + + layout_args = { + "config_file": "modules/layoutlmv3/layoutlmv3_base_inference.yaml", + "resume": False, + "eval_only": False, + "num_gpus": 1, + "num_machines": 1, + "machine_rank": 0, + "dist_url": "tcp://127.0.0.1:57823", + "opts": ["MODEL.WEIGHTS", "models/Layout/model_final.pth"], + } + layout_args = DotDict(layout_args) + + cfg = setup(layout_args) + + # create a torch model + torch_model = build_model(cfg) + DetectionCheckpointer(torch_model).resume_or_load(cfg.MODEL.WEIGHTS) + torch_model.eval() + + # convert and save model + if args.export_method == "caffe2_tracing": + sample_inputs = get_sample_inputs(args) + exported_model = export_caffe2_tracing(cfg, torch_model, sample_inputs) + elif args.export_method == "scripting": + exported_model = export_scripting(torch_model) + elif args.export_method == "tracing": + sample_inputs = get_sample_inputs(args) + exported_model = export_tracing(torch_model, sample_inputs) + + # run evaluation with the converted model + if args.run_eval: + assert exported_model is not None, ( + "Python inference is not yet implemented for " + f"export_method={args.export_method}, format={args.format}." + ) + logger.info("Running evaluation ... 
this takes a long time if you export to CPU.") + dataset = cfg.DATASETS.TEST[0] + data_loader = build_detection_test_loader(cfg, dataset) + # NOTE: hard-coded evaluator. change to the evaluator for your dataset + evaluator = COCOEvaluator(dataset, output_dir=args.output) + metrics = inference_on_dataset(exported_model, data_loader, evaluator) + print_csv_format(metrics) + logger.info("Success.") + + +if __name__ == "__main__": + main() # pragma: no cover \ No newline at end of file diff --git a/convension/detectron2/onnx_utils.py b/convension/detectron2/onnx_utils.py new file mode 100644 index 0000000..56d280f --- /dev/null +++ b/convension/detectron2/onnx_utils.py @@ -0,0 +1,261 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import logging +import numpy as np +import onnx_graphsurgeon as gs + +logging.basicConfig(level=logging.INFO) +logging.getLogger("ModelHelper").setLevel(logging.INFO) +log = logging.getLogger("ModelHelper") + +@gs.Graph.register() +def op_with_const(self, op, name, input, value): + """ + Add an operation with constant to the graph which will operate on the input tensor with the value(s) given. + :param op: The ONNX operation to perform, i.e. "Add" or "Mul". + :param input: The tensor to operate on. + :param value: The value array to operate with. + :param name: The name to use for the node. + """ + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created {} node '{}': {}".format(op, name, value.squeeze())) + const = gs.Constant(name="{}_value:0".format(name), values=value) + return self.layer(name=name, op=op, inputs=[input_tensor, const], outputs=[name + ":0"]) + +@gs.Graph.register() +def matmul(self, name, input, value): + """ + Add MatMul operation to the graph which will operate on the input tensor with the value(s) given. + :param input: The tensor to operate on. + :param value: The linear transformation matrix to operate with. + :param name: The name to use for the node. + """ + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created {} node '{}': {}".format("MatMul", name, value.squeeze())) + const = gs.Constant(name="{}_value:0".format(name), values=value) + return self.layer(name=name, op="MatMul", inputs=[input_tensor, const], outputs=[name + ":0"]) + +@gs.Graph.register() +def clip(self, name, input, clip_min, clip_max): + """ + Add Clip operation to the graph which will operate on the input tensor with the value(s) given. + :param input: The tensor to operate on. + :param name: The name to use for the node. + :param clip_min: Minimum value to include, less is clipped. + :param clip_max: Maximum value to include, more is clipped. 
+ """ + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created {} node '{}".format("Clip", name)) + const_min = gs.Constant(name="{}_value:0".format(name), values=np.asarray([clip_min], dtype=np.float32)) + const_max = gs.Constant(name="{}_value:1".format(name), values=np.asarray([clip_max], dtype=np.float32)) + return self.layer(name=name, op="Clip", inputs=[input_tensor, const_min, const_max], outputs=[name + ":0"]) + +@gs.Graph.register() +def slice(self, name, input, starts, ends, axes): + """ + Add Slice operation to the graph which will operate on the input tensor with the value(s) given. + :param op: The ONNX operation to perform, i.e. "Add" or "Mul". + :param input: The tensor to operate on. + :param name: The name to use for the node. + :param starts: Value at which Slice starts. + :param ends: Value at which Slice ends. + :param axes: Axes on which Slice operation should be performed. + """ + + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created {} node '{}".format("Slice", name)) + const_start = gs.Constant(name="{}_value:0".format(name), values=np.asarray([starts], dtype=np.int64)) + const_end = gs.Constant(name="{}_value:1".format(name), values=np.asarray([ends], dtype=np.int64)) + const_axes = gs.Constant(name="{}_value:2".format(name), values=np.asarray([axes], dtype=np.int64)) + return self.layer(name=name, op="Slice", inputs=[input_tensor, const_start, const_end, const_axes], outputs=[name + ":0"]) + +@gs.Graph.register() +def unsqueeze(self, name, input, axes=[3]): + """ + Adds to the graph an Unsqueeze node for the given axes and to the given input. + :param self: The gs.Graph object being extended. + :param name: The name to use for the node. + :param input: The tensor to be "unsqueezed". + :param axes: A list of axes on which to add the new dimension(s). + :return: The first output tensor, to allow chained graph construction. + """ + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created Unsqueeze node '{}': {}".format(name, axes)) + return self.layer(name=name, op="Unsqueeze", inputs=[input_tensor], outputs=[name + ":0"], attrs={'axes': axes}) + +@gs.Graph.register() +def squeeze(self, name, input, axes=[2]): + """ + Adds to the graph an Squeeze node for the given axes and to the given input. + :param self: The gs.Graph object being extended. + :param name: The name to use for the node. + :param input: The tensor to be "squeezed". + :param axes: A list of axes on which to remove a dimension(s). + :return: The first output tensor, to allow chained graph construction. + """ + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created Squeeze node '{}': {}".format(name, axes)) + return self.layer(name=name, op="Squeeze", inputs=[input_tensor], outputs=[name + ":0"], attrs={'axes': axes}) + +@gs.Graph.register() +def gather(self, name, data, indices, axes=0): + """ + Adds to the graph a Gather node for the given axes and to the given input. + :param self: The gs.Graph object being extended. + :param name: The name to use for the node. + :param data: Data from which to gather specific tensors. + :param indices: Indices by which to gather data tensors. 
+ :param axes: A list of axes on which to perform gather operation + """ + data_tensor = data if type(data) is gs.Variable else data[0] + indices_tensor = indices if type(indices) is gs.Variable else indices[0] + log.debug("Created Gather node '{}': {}".format(name, axes)) + return self.layer(name=name, op="Gather", inputs=[data_tensor, indices_tensor], outputs=[name + ":0"], attrs={'axes': axes}) + +@gs.Graph.register() +def transpose(self, name, input, perm): + """ + Adds to the graph a Transpose node for the given axes permutation and to the given input. + :param self: The gs.Graph object being extended. + :param name: The name to use for the node. + :param input: The tensor to be transposed. + :param perm: A list of axes defining their order after transposing occurs. + :return: The first output tensor, to allow chained graph construction. + """ + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created Transpose node '{}': {}".format(name, perm)) + return self.layer(name=name, op="Transpose", inputs=[input_tensor], outputs=[name + ":0"], attrs={'perm': perm}) + +@gs.Graph.register() +def sigmoid(self, name, input): + """ + Adds to the graph a Sigmoid node for the given input. + :param self: The gs.Graph object being extended. + :param name: The name to use for the node. + :param input: The tensor to be applied to. + :return: The first output tensor, to allow chained graph construction. + """ + input_tensor = input if type(input) is gs.Variable else input[0] + log.debug("Created Sigmoid node '{}'".format(name)) + return self.layer(name=name, op="Sigmoid", inputs=[input_tensor], outputs=[name + ":0"]) + +@gs.Graph.register() +def plugin(self, op, name, inputs: list, outputs: list, attrs): + """ + Adds to the graph a TensorRT plugin node with the given name, inputs and outputs. The attrs dictionary holds + attributes to be added to the plugin node. + :param self: The gs.Graph object being extended. + :param op: The registered name for the TensorRT plugin. + :param name: The name to use for the node. + :param inputs: The list of tensors to use an inputs. + :param outputs: The list of tensors to use as outputs. + :param attrs: The dictionary to use as attributes. + :return: The first output tensor, to allow chained graph construction. + """ + log.debug("Created TRT Plugin node '{}': {}".format(name, attrs)) + return self.layer(op=op, name=name, inputs=inputs, outputs=outputs, attrs=attrs) + +@gs.Graph.register() +def find_node_by_op(self, op): + """ + Finds the first node in the graph with the given operation name. + :param self: The gs.Graph object being extended. + :param op: The operation name to search for. + :return: The first node matching that performs that op. + """ + for node in self.nodes: + if node.op == op: + return node + return None + +@gs.Graph.register() +def find_node_by_op_name(self, op, name): + """ + Finds the first node in the graph with the given operation name. + :param self: The gs.Graph object being extended. + :param op: The operation name to search for. + :param name: Selected node name. + :return: The first node matching that performs that op. + """ + for node in self.nodes: + if node.op == op and node.name == name: + return node + return None + +@gs.Graph.register() +def find_node_by_op_input_output_name(self, op, input_name, output_name, input_pos=0, output_pos=0): + """ + Finds the first node in the graph with the given operation name. + :param self: The gs.Graph object being extended. + :param op: The operation name to search for. 
+ :param input_pos: Which input to consider, default is 0. + :param output_pos: Which output to consider, default is 0. + :param input_name: Selected input's name. + :param output_name: Selected output's name. + :return: The first node matching that performs that op. + """ + for node in self.nodes: + if node.op == op and node.inputs[input_pos].name == input_name and node.outputs[output_pos].name == output_name: + return node + return None + +@gs.Graph.register() +def find_descendant_by_op(self, node, op, depth=10): + """ + Starting from the given node, finds a node lower in the graph matching the given operation name. + This is not an exhaustive graph search. + In order to graph search bfs is used, so runtime complexity is O(V+E). + :param self: The gs.Graph object being extended. + :param node: The node to start searching from. + :param op: The operation name to search for. + :param depth: Stop searching after traversing these many nodes. + :return: The first descendant node matching that performs that op. + """ + queue = [] + for i in range(depth): + queue.append(node.o()) + while queue: + node = queue.pop(0) + if node.op == op: + return node + for child in node.outputs[0].outputs: + queue.append(child) + return None + +@gs.Graph.register() +def find_ancestor_by_op(self, node, op, depth=10): + """ + Starting from the given node, finds a node higher in the graph matching the given operation name. + This is not an exhaustive graph search. + In order to graph search bfs is used, so runtime complexity is O(V+E). + :param self: The gs.Graph object being extended. + :param node: The node to start searching from. + :param op: The operation name to search for. + :param depth: Stop searching after traversing these many nodes. + :return: The first ancestor node matching that performs that op. 
+ """ + queue = [] + for i in range(depth): + queue.append(node.i()) + while queue: + node = queue.pop(0) + if node.op == op: + return node + for child in node.inputs[-1].inputs: + queue.append(child) + return None diff --git a/convension/detectron2/optimize_onnx.py b/convension/detectron2/optimize_onnx.py new file mode 100644 index 0000000..6a78bf9 --- /dev/null +++ b/convension/detectron2/optimize_onnx.py @@ -0,0 +1,7 @@ +from onnxruntime_tools import optimizer +onnx_model_path = "model.onnx" +optimized_model_path = "optimized_model.onnx" +opt_model = optimizer.optimize_model(onnx_model_path, model_type='bert') +opt_model.save_model_to_file(optimized_model_path) + +print("Model optimized and saved to", optimized_model_path) \ No newline at end of file diff --git a/convension/detectron2/run_detectron_tensorRT.py b/convension/detectron2/run_detectron_tensorRT.py new file mode 100644 index 0000000..e96dec1 --- /dev/null +++ b/convension/detectron2/run_detectron_tensorRT.py @@ -0,0 +1,99 @@ +# import torch +# from rough_layout import * +# with open('configs/model_configs.yaml') as f: +# model_configs = yaml.load(f, Loader=yaml.FullLoader) + + +# img_size = model_configs['model_args']['img_size'] +# conf_thres= model_configs['model_args']['conf_thres'] +# iou_thres = model_configs['model_args']['iou_thres'] +# device = model_configs['model_args']['device'] +# dpi = model_configs['model_args']['pdf_dpi'] +# layout_model = get_layout_model(model_configs) + +# batched_inputs = torch.load("convension/detectron2/batched_inputs.pt") + +# import torch_tensorrt + +# model = layout_model.predictor.model.eval() +# x = batched_inputs[0]['image'][None] + +# inputs = [x] +# trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs) +# import os +# os.makedirs("models/layout/",exist_ok=True) +# torch_tensorrt.save(trt_gm, "models/layout//trt.ep", inputs=inputs) # PyTorch only supports Python runtime for an ExportedProgram. 
For C++ deployment, use a TorchScript file +# torch_tensorrt.save(trt_gm, "models/layout//trt.ts", output_format="torchscript", inputs=inputs) +import os +os.environ['CUDA_MODULE_LOADING'] = 'LAZY' +from rough_layout import * +with open('configs/model_configs.yaml') as f: + model_configs = yaml.load(f, Loader=yaml.FullLoader) + +img_size = model_configs['model_args']['img_size'] +conf_thres= model_configs['model_args']['conf_thres'] +iou_thres = model_configs['model_args']['iou_thres'] +device = model_configs['model_args']['device'] +dpi = model_configs['model_args']['pdf_dpi'] +layout_model = get_layout_model(model_configs) +mfd_model = get_batch_YOLO_model(model_configs) + +self = layout_model.predictor.model + +batched_inputs = torch.load("convension/detectron2/batched_inputs.pt") +with torch.no_grad(): + images = self.preprocess_image(batched_inputs) + input = self.get_batch(batched_inputs, images) + features = self.backbone(input) + proposals, _ = self.proposal_generator(images, features, None) + results, _ = self.roi_heads(images, features, proposals, None) + +model = layout_model.predictor.model.backbone +class ModelWrapper(torch.nn.Module): + def __init__(self, model): + super(ModelWrapper, self).__init__() + self.model = model + + def forward(self, images): + with torch.inference_mode(): + outputs = self.model({'images':images}) + return ( + outputs['p2'], + outputs['p3'], + outputs['p4'], + outputs['p5'], + outputs['p6'], + ) + +# Wrap the model +wrapped_model = ModelWrapper(model) + +import torch +import torch.onnx + +# Assuming `model` is your pre-trained model +model.eval() # Set the model to evaluation mode +# Create a sample input tensor with the shape (B, 3, 1052, 800) +batch_size = 1 # Adjust as needed +sample_input = torch.randn(batch_size, 3, 1056, 800).cuda() + +# Define the path where the ONNX model will be saved +onnx_model_path = "model.onnx" + +# Export the model +torch.onnx.export( + wrapped_model, # The wrapped model to be exported + (sample_input,), # The input example (tuple) + onnx_model_path, # The path where the model will be saved + input_names=["images"], # The names of the input tensors + output_names=['p2','p3','p4','p5','p6'], # The names of the output tensors + # dynamic_axes={ + # 'images': {0: 'batch_size'}, # Variable length axes for input + # 'p2': {0: 'batch_size'}, # Variable length axes for output + # 'p3': {0: 'batch_size'}, + # 'p4': {0: 'batch_size'}, + # 'p5': {0: 'batch_size'}, + # 'p6': {0: 'batch_size'} + # }, + opset_version=17 # ONNX opset version (can be adjusted) +) diff --git a/convension/detectron2/sample_1042x800.png b/convension/detectron2/sample_1042x800.png new file mode 100644 index 0000000..ba482b5 Binary files /dev/null and b/convension/detectron2/sample_1042x800.png differ diff --git a/convension/unimernet/README.md b/convension/unimernet/README.md new file mode 100644 index 0000000..c9ff352 --- /dev/null +++ b/convension/unimernet/README.md @@ -0,0 +1,738 @@ + +# Multi-Modal + +This document shows how to run multimodal pipelines with TensorRT-LLM, e.g. from image+text input modalities to text output. + +Multimodal models' LLM part has an additional parameter `--max_multimodal_len` compared to LLM-only build commands. Under the hood, `max_multimodal_len` and `max_prompt_embedding_table_size` are effectively the same concept, i.e., prepended/concatenated embeddings (either multimodal feature embeddings or prompt tuning embeddings) to the LLM input embeddings. 
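+
+For a concrete sense of how `--max_multimodal_len` is sized, here is an illustrative Python sketch using the BLIP2-OPT numbers from the build command below (8 max batch size, 32 visual features per image):
+
+```python
+# Illustrative only: how --max_multimodal_len relates to the visual features.
+max_batch_size      = 8    # --max_batch_size in the BLIP2-OPT build below
+num_visual_features = 32   # visual tokens the BLIP2 Q-Former produces per image
+max_multimodal_len  = max_batch_size * num_visual_features
+assert max_multimodal_len == 256  # matches --max_multimodal_len 256 below
+```
+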
The multimodal features from the visual encoder of shape `[batch_size, num_visual_features, visual_hidden_dim]` is flattened as `[batch_size * num_visual_features, visual_hidden_dim]` and passed like a prompt embedding table. + +We first describe how to run each model on a single GPU. We then provide general guidelines on using tensor parallelism for the LLM part of the pipeline. + +- [BLIP2](#blip2) +- [CogVLM](#cogvlm) +- [Deplot](#deplot) +- [Fuyu](#fuyu) +- [Kosmos-2](#kosmos-2) +- [LLaVA, LLaVa-NeXT and VILA](#llava-llava-next-and-vila) +- [NeVA](#neva) +- [Nougat](#nougat) +- [Phi-3-vision](#phi-3-vision) +- [Video NeVA](#video-neva) +- [Enabling tensor parallelism for multi-GPU](#enabling-tensor-parallelism-for-multi-gpu) + +## BLIP2 + +This BLIP section covers both BLIP2-OPT and BLIP2-T5, with minor changes needed when switching the LLM backbone. + +1. Download Huggingface weights and convert original checkpoint to TRT-LLM checkpoint format + following example in `examples/opt/README.md` and `examples/enc_dec/README.md`. + + ```bash + export MODEL_NAME="blip2-opt-2.7b" # options: blip2-opt-6.7b, blip2-flan-t5-xl, blip2-flan-t5-xxl + git clone https://huggingface.co/Salesforce/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + + For BLIP2-OPT family, + ```bash + python ../opt/convert_checkpoint.py --model_type blip2 \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --dtype float16 + ``` + + For BLIP2-T5 family, + ```bash + python ../enc_dec/convert_checkpoint.py --model_type blip2 \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/bfloat16 \ + --tp_size 1 \ + --pp_size 1 \ + --dtype bfloat16 + ``` + +2. Build TRT-LLM engine from TRT-LLM checkpoint + + For BLIP2-OPT family, + ```bash + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gemm_plugin float16 \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_seq_len 1024 \ + --max_input_len 924 \ + --max_multimodal_len 256 # 8 (max_batch_size) * 32 (num_visual_features) + ``` + + For BLIP2-T5 family, + ```bash + trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/bfloat16/encoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/bfloat16/encoder \ + --paged_kv_cache disable \ + --moe_plugin disable \ + --enable_xqa disable \ + --gemm_plugin bfloat16 \ + --bert_attention_plugin bfloat16 \ + --gpt_attention_plugin bfloat16 \ + --remove_input_padding enable \ + --context_fmha disable \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_input_len 924 \ + --max_multimodal_len 256 # 8 (max_batch_size) * 32 (num_visual_features) + + trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/bfloat16/decoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/bfloat16/decoder \ + --paged_kv_cache disable \ + --moe_plugin disable \ + --enable_xqa disable \ + --gemm_plugin bfloat16 \ + --bert_attention_plugin bfloat16 \ + --gpt_attention_plugin bfloat16 \ + --remove_input_padding enable \ + --context_fmha disable \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_seq_len 1024 \ + --max_encoder_input_len 924 \ + --max_input_len 1 # Same command for decoder but don't set --max_multimodal_len + ``` + + **NOTE**: `max_multimodal_len = max_batch_size * num_visual_features`, so if you change max_batch_size, max multimodal length **MUST** be changed accordingly. + +3. 
Build TensorRT engines for vision encoders + + ```bash + python build_visual_engine.py --model_type blip2 --model_path tmp/hf_models/${MODEL_NAME} --max_batch_size 8 + ``` + + The built engines are located in `tmp/trt_engines/${MODEL_NAME}/vision_encoder`. + + To run the BLIP2 pipeline with batch size > 1, change `--max_batch_size` argument to `build_visual_engine.py` accordingly. + +4. Assemble everything into BLIP2 pipeline + + For BLIP2-OPT family, + ```bash + python run.py \ + --max_new_tokens 30 \ + --input_text "Question: which city is this? Answer:" \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu + ``` + + For BLIP2-T5 family, + ```bash + python run.py \ + --max_new_tokens 30 \ + --input_text "Question: which city is this? Answer:" \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/bfloat16 + ``` + +5. (Optional) INT8/INT4 weight-only quantization for OPT can be enabled using commands as follows (take `INT4` as an example, while `INT8` is the default precision for weight-only quantization): + ```bash + python ../opt/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --dtype float16 \ + --output_dir tmp/trt_models/${MODEL_NAME}/int4_weightonly/1-gpu \ + --use_weight_only \ + --weight_only_precision int4 + + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/int4_weightonly/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/int4_weightonly/1-gpu \ + --gemm_plugin float16 \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_multimodal_len 256 \ + --max_input_len 924 \ + --max_seq_len 1024 + ``` + + The built OPT engines lie in `tmp/trt_engines/${MODEL_NAME}/int4_weightonly/1-gpu`. + You should use this directory as `--llm_engine_dir` argument to `run.py` + + **NOTE:** INT8/INT4 option is not supported for BLIP2-T5, because quantization support has not been + added for encoder-decoder models yet. + +## CogVLM + +Currently, CogVLM only support bfloat16 precision and doesn't support `remove_input_padding` feature. + +1. Download Huggingface weights + + ```bash + export MODEL_NAME="cogvlm-chat-hf" + git clone https://huggingface.co/THUDM/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + export TOKENIZER_NAME="vicuna-7b-v1.5" + git clone https://huggingface.co/lmsys/${TOKENIZER_NAME} tmp/hf_models/${TOKENIZER_NAME} + ``` + + Because currently onnx doesn't support `xops.memory_efficient_attention`, we need to modify some source code of the huggingface CogVLM. + ``` + cd tmp/hf_models/${MODEL_NAME} + sed -i '4s/.*//;40s/.*/ out = self.attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2).contiguous()/;41s/.*//;42s/.*//' visual.py # It will replace memory_efficient_attention with some basic ops + ``` + +2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/cogvlm` + + CogVLM uses a Vit encoder as LLM encoder and a modified Llama as decoder. 
+ + ```bash + python ../cogvlm/convert_checkpoint.py --model_dir tmp/hf_models/${MODEL_NAME} --output_dir tmp/trt_models/${MODEL_NAME} --dtype bfloat16 --use_prompt_tuning + + trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME} \ + --output_dir tmp/trt_engines/${MODEL_NAME}/bf16/1-gpu \ + --gemm_plugin bfloat16 \ + --gpt_attention_plugin bfloat16 \ + --remove_input_padding disable \ + --max_batch_size 48 \ + --max_input_len 2048 \ + --max_seq_len 3076 \ + --paged_kv_cache disable \ + --enable_xqa disable \ + --bert_attention_plugin disable \ + --moe_plugin disable \ + --max_multimodal_len 61440 # 48 (max_batch_size) * 1280 (max_num_visual_features) + ``` + +3. Generate TensorRT engines for visual components and combine everything into final pipeline. + + ```bash + python build_visual_engine.py --model_type cogvlm --model_path tmp/hf_models/${MODEL_NAME} --max_batch_size 48 + + python run.py \ + --max_new_tokens 1000 \ + --input_text " [INST] please describe this image in detail [/INST] " \ + --hf_model_dir tmp/hf_models/${TOKENIZER_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/bf16/1-gpu \ + --batch_size 1 \ + --top_p 0.4 \ + --top_k 1 \ + --temperature 0.2 \ + --repetition_penalty 1.2 \ + --enable_context_fmha_fp32_acc + ``` + +## Deplot + +1. Download Huggingface weights and convert original checkpoint to TRT-LLM checkpoint format + following example in `examples/enc_dec/README.md`. + + ```bash + export MODEL_NAME="deplot" + git clone https://huggingface.co/google/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + python ../enc_dec/convert_checkpoint.py --model_type pix2struct \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/float16 \ + --tp_size 1 \ + --pp_size 1 \ + --dtype float16 + ``` + +2. Build TRT-LLM engine from TRT-LLM checkpoint + + ```bash + trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/float16/decoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/1-gpu/float16/decoder \ + --paged_kv_cache disable \ + --moe_plugin disable \ + --enable_xqa disable \ + --gemm_plugin float16 \ + --bert_attention_plugin float16 \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --context_fmha disable \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_seq_len 2558 \ + --max_encoder_input_len 2048 \ + --max_input_len 1 + ``` + + The built deplot engines are located in `tmp/trt_engines/${MODEL_NAME}/1-gpu/float16`. + +3. Build TensorRT engines for visual components + + ```bash + python build_visual_engine.py --model_type pix2struct --model_path tmp/hf_models/${MODEL_NAME} --max_batch_size 8 + ``` + + The built visual engines are located in `tmp/trt_engines/${MODEL_NAME}/vision_encoder`. + + To run the deplot pipeline with batch size > 1, change `--max_batch_size` argument to `build_visual_engine.py` accordingly. + +4. Assemble everything into deplot pipeline + + ```bash + python run.py \ + --max_new_tokens 100 \ + --input_text "" \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/1-gpu/float16 + ``` + +## Fuyu + +1. Download Huggingface weights + + ```bash + export MODEL_NAME="fuyu-8b" + git clone https://huggingface.co/adept/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + +2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/gpt`. 
+ The LLM portion of Fuyu uses a Persimmon model + ```bash + python ../gpt/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --dtype float16 \ + --gpt_variant persimmon + + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gemm_plugin float16 \ + --use_fused_mlp=enable \ + --max_batch_size 1 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 2048 + ``` + +3. Generate TensorRT engines for visual components and combine everything into final pipeline. + + ```bash + python build_visual_engine.py --model_type fuyu --model_path tmp/hf_models/${MODEL_NAME} + + python run.py \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu + ``` + +## Kosmos-2 + +1. Download Huggingface weights + + ```bash + export MODEL_NAME="kosmos-2" + git clone https://huggingface.co/microsoft/kosmos-2-patch14-224 tmp/hf_models/${MODEL_NAME} + ``` + +2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/gpt`. + ```bash + python ../gpt/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --dtype float16 \ + --gpt_variant ${MODEL_NAME} + + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gpt_attention_plugin float16 \ + --gemm_plugin float16 \ + --max_batch_size 1 \ + --max_input_len 512 \ + --max_seq_len 1024 \ + --max_multimodal_len 64 # 1 (max_batch_size) * 64 (num_visual_features) + ``` + +3. Generate TensorRT engines for visual components and combine everything into final pipeline. + + ```bash + python build_visual_engine.py --model_type kosmos-2 --model_path tmp/hf_models/${MODEL_NAME} + + python run.py \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu + ``` + +## LLaVA, LLaVa-NeXT and VILA + +[LLaVA](https://github.com/haotian-liu/LLaVA) and [VILA](https://github.com/Efficient-Large-Model/VILA) are both visual language models (VLM) that can be deployed in TensorRT-LLM with many quantization options. [LLaVA-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf) is an extension of LLaVA. TRT-LLM currently supports [Mistral-7b](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and [ Nous-Hermes-2-Yi-34B](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) variant of LLaVA-NeXT. + +1. Download Huggingface model weights. These models have both visual and LLM components + unlike BLIP2 example which downloads only LLM components from Huggingface. 
+ + For LLaVA, + + ```bash + export MODEL_NAME="llava-1.5-7b-hf" # also llava-1.5-13b-hf + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + For LLaVA-NeXT, + + ```bash + export MODEL_NAME="llava-v1.6-mistral-7b-hf" #for 34b variant "llava-v1.6-34b-hf" + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + + For VILA, we need a few more steps until it is added to HF model zoo + + ```bash + # install the following dependency + pip install -r requirements-vila.txt + + # clone original VILA repo + export VILA_PATH="tmp/hf_models/VILA" + git clone https://github.com/Efficient-Large-Model/VILA.git ${VILA_PATH} + + # download VILA checkpoints + export MODEL_NAME="vila1.5-3b" + git clone https://huggingface.co/Efficient-Large-Model/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + +2. Generate TRT-LLM engine for LLaMA following example in `examples/llama/README.md` + + ```bash + python ../llama/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --dtype float16 + + # for LLaVA + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gemm_plugin float16 \ + --use_fused_mlp=enable \ + --max_batch_size 1 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 576 # 1 (max_batch_size) * 576 (num_visual_features) + + # for LLaVA-NeXT + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gpt_attention_plugin float16 \ + --gemm_plugin float16 \ + --use_fused_mlp=enable \ + --max_batch_size 1 \ + --max_input_len 4096 \ + --max_seq_len 5120 \ + --max_num_tokens 4096 \ # 1 (max_batch_size) * 4096 (max_input_len) + --max_multimodal_len 4096 # 1 (max_batch_size) * 4096 (max_input_len) + + # for VILA + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gemm_plugin float16 \ + --use_fused_mlp=enable \ + --max_batch_size 1 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 4096 # 1 (max_batch_size) * 4096 (num_visual_features) + ``` + +3. Build TensorRT engines for visual components + + ```bash + python build_visual_engine.py --model_path tmp/hf_models/${MODEL_NAME} --model_type llava # for LLaVA + + python build_visual_engine.py --model_path tmp/hf_models/${MODEL_NAME} --model_type llava_next --model_path tmp/hf_models/${MODEL_NAME} --max_batch_size 5 # 1 (max_batch_size) * 5 (because LLAVA-NeXT visual encoder can have at most 5 patches) # for LLaVA-NeXT + + python build_visual_engine.py --model_path tmp/hf_models/${MODEL_NAME} --model_type vila --vila_path ${VILA_PATH} # for VILA + ``` + + ```bash + python run.py \ + --max_new_tokens 30 \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --input_text "Question: which city is this? Answer:" # for LLaVA and for LLaVA-NeXT + ``` + + For VILA, you can use either local file or web url as input images. 
+ Suppose you have a local image `av.png` downloaded from `https://github.com/Efficient-Large-Model/VILA/blob/main/demo_trt_llm/av.png` and the url of `merlion.png` + ```bash + wget -O av.png https://raw.githubusercontent.com/Efficient-Large-Model/VILA/main/demo_images/av.png + + python run.py \ + --max_new_tokens 100 \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --image_path=av.png,https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png \ + --input_text="\n\n Please elaborate what you see in the images?" \ + --batch_size=1 # for VILA mode 1 + + python run.py \ + --max_new_tokens 100 \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --image_path=av.png,https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png \ + --input_text="\n Please elaborate what you see in the images?" \ + --batch_size=2 # for VILA mode 2 + ``` + + Note that VILA can support different modes in terms of batching: + - Mode 1: if you want to query N images as a whole using a prompt, `--batch_size=1` should be used (which is the default value). Example is given above. + - Mode 2: if you want to query N images individually using the same prompt (replicated), `--batch_size=N` should be used. Don't forget to set the `--max_batch_size` and `--max_multimodal_len` during engine building. + + Note: use `--run_profiling` for performance measurement, use `--check_accuracy` for accuracy check. + +4. (Optional) Different quantization methods supported in LLaMA can be applied to LLaVA/VILA as well, such as INT4/INT8 weight-only, SmoothQuant, and INT4 Activation-Aware Quantization (AWQ). Detailed instructions can be found in LLaMA [README](../llama/README.md). + + For example, + + ```bash + # INT4 weight only + python ../llama/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --dtype float16 \ + --output_dir tmp/trt_models/${MODEL_NAME}/int4_weightonly/1-gpu \ + --use_weight_only \ + --weight_only_precision int4 + + # INT4 AWQ + python ../quantization/quantize.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/int4_awq/1-gpu \ + --dtype float16 \ + --qformat int4_awq \ + --calib_size 32 + ``` + + Then follow the same `trtllm-build` and `run.py` steps as before. NOTE: for `trtllm-build` command, do not use `--use_fused_mlp=enable` in these quantization modes. + +## NeVA + +[NeVA](https://docs.nvidia.com/nemo-framework/user-guide/latest/multimodalmodels/neva/index.html) is a groundbreaking addition to the NeMo Multimodal ecosystem. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. + +1. Generate TRT-LLM engine for NVGPT following example in `examples/gpt/README.md`. To adhere to the NVGPT conventions of the conversion script, some layer keys have to be remapped using `--nemo_rename_key`. 
+ + ```bash + export MODEL_NAME="neva" + python ../gpt/convert_checkpoint.py \ + --nemo_ckpt_path ./${MODEL_NAME}.nemo \ + --dtype bfloat16 \ + --output_dir tmp/trt_models/${MODEL_NAME} \ + --nemo_rename_key model:model.language_model \ + attention.linear_qkv.layer_norm_bias:input_layernorm.bias \ + attention.linear_qkv.layer_norm_weight:input_layernorm.weight \ + mlp.linear_fc1.layer_norm_bias:post_attention_layernorm.bias \ + mlp.linear_fc1.layer_norm_weight:post_attention_layernorm.weight \ + linear_qkv:query_key_value \ + linear_fc1:dense_h_to_4h \ + linear_fc2:dense_4h_to_h \ + linear_proj:dense \ + decoder:encoder + + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME} \ + --output_dir tmp/trt_engines/${MODEL_NAME}/bf16/1-gpu \ + --gpt_attention_plugin bfloat16 \ + --gemm_plugin bfloat16 \ + --max_batch_size 1 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 729 # 1 (max_batch_size) * 729 (num_visual_features) + ``` + +2. Build TensorRT engines for visual components + + ```bash + python build_visual_engine.py --model_path ./${MODEL_NAME}.nemo --model_type neva + ``` + + ```bash + python run.py \ + --max_new_tokens 30 \ + --hf_model_dir tmp/trt_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/bf16/1-gpu \ + --input_text "Question: which city is this? Answer:" + ``` + + Note: use `--run_profiling` for performance measurement, use `--check_accuracy` for accuracy check. + +## Nougat + +1. Download Huggingface weights + + ```bash + export MODEL_NAME="nougat-base" # also nougat-small + git clone https://huggingface.co/facebook/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + +2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/enc_dec` + + Nougat uses mBART architecture but replaces the LLM encoder with a Swin Transformer encoder. + To achieve this, we add an extra `--nougat` flag (over mBART example) to + `convert_checkpoint.py` in `examples/enc_dec` and `trtllm-build`. + + ```bash + python ../enc_dec/convert_checkpoint.py --model_type bart \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/bfloat16 \ + --tp_size 1 \ + --pp_size 1 \ + --dtype bfloat16 \ + --nougat + + trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME}/bfloat16/decoder \ + --output_dir tmp/trt_engines/${MODEL_NAME}/1-gpu/bfloat16/decoder \ + --paged_kv_cache disable \ + --moe_plugin disable \ + --enable_xqa disable \ + --gemm_plugin bfloat16 \ + --bert_attention_plugin bfloat16 \ + --gpt_attention_plugin bfloat16 \ + --remove_input_padding enable \ + --max_beam_width 1 \ + --max_batch_size 1 \ + --max_seq_len 101 \ + --max_input_len 1 \ + --max_encoder_input_len 588 # 1 (max_batch_size) * 588 (num_visual_features) + ``` + +3. Generate TensorRT engines for visual components and combine everything into final pipeline. + + ```bash + python build_visual_engine.py --model_type nougat --model_path tmp/hf_models/${MODEL_NAME} + + python run.py \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/1-gpu/bfloat16 + ``` + + Note: Nougat models usually do not need a text prompt. + + +## Phi-3-vision + +1. Download Huggingface weights + + ```bash + export MODEL_NAME="Phi-3-vision-128k-instruct" + git clone https://huggingface.co/microsoft/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + ``` + +2. 
Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in `examples/phi`. + ```bash + python ../gpt/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu \ + --gpt_attention_plugin float16 \ + --gemm_plugin float16 \ + --max_batch_size 1 \ + --max_input_len 4096 \ + --max_seq_len 4608 \ + --max_multimodal_len 4096 + ``` + +3. Generate TensorRT engines for visual components and combine everything into final pipeline. + + ```bash + python build_visual_engine.py --model_type phi-3-vision --model_path tmp/hf_models/${MODEL_NAME} + + python run.py \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu/ \ + --image_path=https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png + ``` + +## Video NeVA + +[Video NeVA](https://github.com/NVIDIA/NeMo/blob/main/docs/source/multimodal/mllm/video_neva.rst) is a groundbreaking addition to the NeMo Multimodal ecosystem that could work with video modality. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. + +1. Generate TRT-LLM engine for Nemotron model following example in `examples/nemotron/README.md`. To adhere to the NVGPT conventions of the conversion script. This will be used as our base LM for inference. + + ```bash + pip install decord # used for loading video + + python3 ../quantization/quantize.py \ + --nemo_ckpt_path /path/to/nemotron/model.nemo \ + --dtype bfloat16 \ + --batch_size 64 \ + --qformat full_prec \ + --output_dir nemotron-3/trt_ckpt/bf16/1-gpu + + + trtllm-build \ + --checkpoint_dir nemotron-3/trt_ckpt/bf16/1-gpu \ + --output_dir tmp/trt_engines/nemotron-3/bf16/1-gpu \ + --gpt_attention_plugin bfloat16 \ + --gemm_plugin bfloat16 \ + --max_batch_size 1 \ + --max_input_len 4096 \ + --max_seq_len 4352 \ + --max_multimodal_len 3072 # 1 (max_batch_size) * (12 num_frames) * (256 image_token_len) + ``` + +2. Build TensorRT engines for visual components + + ```bash + python build_visual_engine.py --model_path /path/to/video/neva/projector.nemo --model_type video-neva --output_dir tmp/trt_engines/nemotron-3/visual_encoder + ``` + + ```bash + python run.py \ + --max_new_tokens 30 \ + --hf_model_dir nemotron-3/trt_ckpt/bf16/1-gpu \ + --visual_engine_dir tmp/trt_engines/nemotron-3/visual_encoder \ + --llm_engine_dir tmp/trt_engines/nemotron-3/bf16/1-gpu \ + --input_text "Question: what is in the video? Answer:" \ + --video_path /path/to/your/local/video/file + ``` + + Note: use `--run_profiling` for performance measurement, use `--check_accuracy` for accuracy check. + +## Enabling tensor parallelism for multi-GPU + +The LLM part of the pipeline can be run on multiple GPUs using tensor parallelism. +The visual encoder will be replicated on each GPU and operate in a data parallel fashion. + +To enable tensor parallelism, both weight conversion step (from Huggingface to FT format) +and engine building step should use additional arguments. Finally `run.py` should be prefixed +with `mpirun -n NUM_GPUS --allow-run-as-root`. 
+ +The full set of commands to enable 2-way tensor parallelism for LLaVA is: + + ```bash + export MODEL_NAME="llava-1.5-7b-hf" + + python ../llama/convert_checkpoint.py \ + --model_dir tmp/hf_models/${MODEL_NAME} \ + --output_dir tmp/trt_models/${MODEL_NAME}/fp16/2-gpu \ + --dtype float16 --tp_size 2 + + trtllm-build \ + --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/2-gpu \ + --output_dir tmp/trt_engines/${MODEL_NAME}/fp16/2-gpu \ + --gemm_plugin float16 \ + --max_batch_size 1 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 576 + + python build_visual_engine.py --model_type llava --model_path tmp/hf_models/${MODEL_NAME} + + mpirun -n 2 --allow-run-as-root \ + python run.py \ + --max_new_tokens 30 \ + --hf_model_dir tmp/hf_models/${MODEL_NAME} \ + --visual_engine_dir tmp/trt_engines/${MODEL_NAME}/vision_encoder \ + --llm_engine_dir tmp/trt_engines/${MODEL_NAME}/fp16/2-gpu \ + ``` diff --git a/convension/unimernet/build_engine.sh b/convension/unimernet/build_engine.sh new file mode 100644 index 0000000..e8130c6 --- /dev/null +++ b/convension/unimernet/build_engine.sh @@ -0,0 +1,10 @@ + +BATCHSIZE=128 +PARTITION=AI4Chem +srun -p $PARTITION -N1 -c8 --gres=gpu:1 --mpi=pmi2 trtllm-build --checkpoint_dir examples/multimodal/trt_engines/unimernet/bfloat16/decoder \ + --output_dir examples/multimodal/trt_engines.b$BATCHSIZE/unimernet/1-gpu/bfloat16/decoder --paged_kv_cache disable \ + --moe_plugin disable --enable_xqa disable --gemm_plugin bfloat16 --bert_attention_plugin bfloat16 \ + --gpt_attention_plugin bfloat16 --remove_input_padding enable --max_beam_width 1 --max_batch_size $BATCHSIZE \ + --max_seq_len 101 --max_input_len 1 --max_encoder_input_len 588 + +srun -p $PARTITION -N1 -c8 --gres=gpu:1 --mpi=pmi2 python build_visual_engine.py $BATCHSIZE "examples/multimodal/trt_engines.b$BATCHSIZE/vision_encoder/" \ No newline at end of file diff --git a/convension/unimernet/build_visual_engine.py b/convension/unimernet/build_visual_engine.py new file mode 100644 index 0000000..c4ad26c --- /dev/null +++ b/convension/unimernet/build_visual_engine.py @@ -0,0 +1,837 @@ +import os +import shutil +import sys +import tarfile +from time import time + +import yaml + +# isort: off +import torch +import tensorrt as trt +from tensorrt_llm.builder import Builder +from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, + AutoModelForVision2Seq, AutoProcessor, + Blip2ForConditionalGeneration, Blip2Processor, + FuyuForCausalLM, FuyuProcessor, + LlavaForConditionalGeneration, NougatProcessor, + Pix2StructForConditionalGeneration, + VisionEncoderDecoderModel) +# isort: on +import json +import math + +import torch.nn.functional as F +from PIL import Image +from safetensors.torch import save_file + + +def add_multimodal_arguments(parser): + parser.add_argument('--model_type', + type=str, + default=None, + choices=[ + 'blip2', 'llava', 'llava_next', 'vila', 'nougat', + 'cogvlm', 'fuyu', 'pix2struct', 'neva', 'kosmos-2', + 'video-neva', 'phi-3-vision' + ], + help="Model type") + parser.add_argument( + '--model_path', + type=str, + default=None, + help= + "Huggingface repo, local directory with weights or path to checkpoint file" + ) + parser.add_argument('--vila_path', + type=str, + default=None, + help="Path to VILA source code directory") + parser.add_argument('--output_dir', + type=str, + default=None, + help="Directory where visual TRT engines are saved") + parser.add_argument('--max_batch_size', + type=int, + default=4, + help="Maximum batch size for input images") + 
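+    # NOTE: the __main__ block at the bottom of this file bypasses this parser and
+    # reads max_batch_size / output_dir positionally instead (that is how
+    # convension/unimernet/build_engine.sh invokes it).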
return parser + + +class VisionEngineBuilder: + + def __init__(self, args): + args.device = torch.device( + "cuda") if torch.cuda.is_available() else "cpu" + if args.output_dir is None: + # default path to save the engines + model_name = args.model_path.split('/')[-1] + args.output_dir = f'tmp/trt_engines/{model_name}/vision_encoder' + + os.makedirs(args.output_dir, exist_ok=True) + + self.args = args + + def build(self): + args = self.args + if args.model_type == 'blip2': + build_blip2_engine(args) + elif args.model_type == 'pix2struct': + build_pix2struct_engine(args) + elif 'llava' in args.model_type: + build_llava_engine(args) + elif args.model_type == 'vila': + assert args.vila_path is not None, "Please clone and provide VILA source code path" + build_vila_engine(args) + elif args.model_type == 'nougat': + build_nougat_engine(args) + elif args.model_type == 'cogvlm': + build_cogvlm_engine(args) + elif args.model_type == 'fuyu': + build_fuyu_engine(args) + elif args.model_type == 'neva': + build_neva_engine(args) + elif args.model_type == 'video-neva': + build_video_neva_engine(args) + elif args.model_type == 'kosmos-2': + build_kosmos_engine(args) + elif args.model_type == 'phi-3-vision': + build_phi_engine(args) + else: + raise RuntimeError(f"Invalid model type {args.model_type}") + + +def export_onnx(model, + input, + onnx_dir, + onnx_name='model.onnx', + input_names=['input'], + output_names=['output'], + dynamic_axes={'input': { + 0: 'batch' + }}, + logger=trt.Logger(trt.Logger.INFO)): + logger.log(trt.Logger.INFO, f"Exporting onnx to {onnx_dir}/{onnx_name}") + os.makedirs(onnx_dir, exist_ok=True) + torch.onnx.export(model, + input, + f'{onnx_dir}/{onnx_name}', + opset_version=17, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes) + + +def build_trt_engine(model_type, + input_sizes, + onnx_dir, + engine_dir, + max_batch_size, + dtype=torch.float16, + num_frames=None, + onnx_name='model.onnx', + engine_name='model.engine', + delete_onnx=True, + logger=trt.Logger(trt.Logger.INFO)): + onnx_file = f'{onnx_dir}/{onnx_name}' + engine_file = f'{engine_dir}/{engine_name}' + config_file = f'{engine_dir}/config.json' + logger.log(trt.Logger.INFO, f"Building TRT engine to {engine_file}") + + builder = trt.Builder(logger) + network = builder.create_network( + 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + profile = builder.create_optimization_profile() + + config_args = { + "precision": str(dtype).split('.')[-1], + "model_type": model_type + } + if num_frames is not None: + config_args["num_frames"] = num_frames + + config_wrapper = Builder().create_builder_config(**config_args) + config = config_wrapper.trt_builder_config + + parser = trt.OnnxParser(network, logger) + + with open(onnx_file, 'rb') as model: + if not parser.parse(model.read(), os.path.abspath(onnx_file)): + logger.log(trt.Logger.ERROR, "Failed parsing %s" % onnx_file) + for error in range(parser.num_errors): + logger.log(trt.Logger.ERROR, parser.get_error(error)) + logger.log(trt.Logger.INFO, "Succeeded parsing %s" % onnx_file) + + nBS = -1 + nMinBS = 1 + nOptBS = max(nMinBS, int(max_batch_size / 2)) + nMaxBS = max_batch_size + + inputT = network.get_input(0) + + # input sizes can be: + # - integer list, when inputs are constant size images. e.g. [3, H, W] + # - list of integer lists, when inputs are dynamic size images. e.g. 
[[1, 1, 2700], [1, 500, 2700], [1, 4096, 2700]] + assert isinstance(input_sizes, list), "input_sizes must be a list" + if isinstance(input_sizes[0], int): + logger.log(trt.Logger.INFO, f"Processed input sizes {input_sizes}") + inputT.shape = [nBS, *input_sizes] + min_size = opt_size = max_size = input_sizes + elif len(input_sizes) == 3 and isinstance(input_sizes[0], list): + min_size, opt_size, max_size = input_sizes + logger.log( + trt.Logger.INFO, + f"Processed min/opt/max input sizes {min_size}/{opt_size}/{max_size}" + ) + else: + raise ValueError(f"invalid input sizes: {input_sizes}") + + profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], + [nMaxBS, *max_size]) + if model_type == "pix2struct": + inputT = network.get_input(1) + P = input_sizes[0] # Number of patches + inputT.shape = [nBS, P] + profile.set_shape(inputT.name, [nMinBS, P], [nOptBS, P], [nMaxBS, P]) + config.add_optimization_profile(profile) + + t0 = time() + engine_string = builder.build_serialized_network(network, config) + t1 = time() + if engine_string is None: + raise RuntimeError("Failed building %s" % (engine_file)) + else: + logger.log(trt.Logger.INFO, + "Succeeded building %s in %d s" % (engine_file, t1 - t0)) + os.makedirs(engine_dir, exist_ok=True) + with open(engine_file, 'wb') as f: + f.write(engine_string) + + # Clear onnx files since we no longer need them after a successful engine build + if delete_onnx: + shutil.rmtree(onnx_dir) + + Builder.save_config(config_wrapper, config_file) + + +def build_blip2_engine(args): + processor = Blip2Processor.from_pretrained(args.model_path) + + raw_image = Image.new('RGB', [10, 10]) # dummy image + prompt = "Question: what is this? Answer:" + inputs = processor(raw_image, prompt, + return_tensors="pt").to(args.device, torch.float16) + image = inputs['pixel_values'] + + class Blip2VisionWrapper(torch.nn.Module): + + def __init__(self, vision_model, qformer, projector, query_tokens): + super().__init__() + self.vision_model = vision_model + self.qformer = qformer + self.projector = projector + self.query_tokens = query_tokens + + def forward(self, image): + features = self.vision_model(image)[0] + qformer_output = self.qformer(query_embeds=self.query_tokens, + encoder_hidden_states=features, + return_dict=True) + return self.projector(qformer_output.last_hidden_state) + + model = Blip2ForConditionalGeneration.from_pretrained( + args.model_path, torch_dtype=torch.float16) + + blip2_llm = "" + if model.language_model.config.architectures[ + 0] == 'T5ForConditionalGeneration': + blip2_llm = "t5" + elif model.language_model.config.architectures[0] == 'OPTForCausalLM': + blip2_llm = "opt" + + wrapper = Blip2VisionWrapper(model.vision_model, model.qformer, + model.language_projection, model.query_tokens) + wrapper.to(args.device) + + export_onnx(wrapper, image, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type + "-" + blip2_llm, # blip2-t5 or blip2-opt + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size) + + +def build_pix2struct_engine(args): + processor = AutoProcessor.from_pretrained(args.model_path) + raw_image = Image.new('RGB', [10, 10]) # dummy image + dtype = torch.float16 + inputs = processor(text="dummy", images=raw_image, return_tensors="pt") + image = inputs['flattened_patches'].to(args.device, dtype) + attention_mask = inputs['attention_mask'].to(args.device, torch.int) + + class pix2structVisionWrapper(torch.nn.Module): + + def __init__(self, 
encoder): + super().__init__() + self.encoder = encoder + + def forward(self, image, attention_mask): + vision_x = self.encoder.embeddings(image) + img_features = self.encoder.encoder(vision_x, + attention_mask=attention_mask) + img_features = self.encoder.layernorm(img_features[0]) + return img_features + + model = Pix2StructForConditionalGeneration.from_pretrained( + args.model_path, torch_dtype=dtype) + + wrapper = pix2structVisionWrapper(model.encoder.to(args.device)) + # input shape: batch size, number of patches, hidden dimension + # attention mask shape: batch size, number of patches + # The number of image patches can vary depending on the image size, but it typically + # falls within a relatively narrow range. To improve performance, we can avoid using + # dynamic axis for the input patches and instead use a fixed number of patches along + # with an attention mask. + export_onnx(wrapper, (image, attention_mask), + f'{args.output_dir}/onnx', + input_names=['input', 'attention_mask'], + dynamic_axes={ + 'input': { + 0: 'batch' + }, + 'attention_mask': { + 0: 'batch' + } + }) + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2]], # Number of Patches, Hidden Dimension + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size, + dtype=torch.bfloat16) + + +def build_llava_engine(args): + processor = AutoProcessor.from_pretrained(args.model_path) + if args.model_type == "llava": + raw_image = Image.new('RGB', [10, 10]) # dummy image + image = processor(text="dummy", images=raw_image, + return_tensors="pt")['pixel_values'].to( + args.device, torch.float16) + + class LlavaVisionWrapper(torch.nn.Module): + + def __init__(self, tower, projector, feature_layer): + super().__init__() + self.tower = tower + self.projector = projector + self.feature_layer = feature_layer + + def forward(self, image): + all_hidden_states = self.tower( + image, output_hidden_states=True).hidden_states + features = all_hidden_states[self.feature_layer][:, 1:] + return self.projector(features) + + model = LlavaForConditionalGeneration.from_pretrained( + args.model_path, torch_dtype=torch.float16) + wrapper = LlavaVisionWrapper( + model.vision_tower.to(args.device), + model.multi_modal_projector.to(args.device), + model.config.vision_feature_layer) + elif args.model_type == "llava_next": + from transformers import LlavaNextForConditionalGeneration + raw_image = Image.new('RGB', [512, 512]) + image = processor(text="dummy", images=raw_image, + return_tensors="pt")['pixel_values'].to( + args.device, torch.float16)[0] + + class LlavaNextVisionWrapper(torch.nn.Module): + + def __init__(self, vision_tower, projector): + super().__init__() + self.vision_tower = vision_tower + self.projector = projector + + def forward(self, pixel_values): + image_features = self.vision_tower(pixel_values, + output_hidden_states=True) + selected_image_feature = image_features.hidden_states[-2][:, 1:] + image_features = self.projector(selected_image_feature) + return image_features # (bs, 576, c) + + model = LlavaNextForConditionalGeneration.from_pretrained( + args.model_path, torch_dtype=torch.float16) + wrapper = LlavaNextVisionWrapper( + model.vision_tower.vision_model.to(args.device), + model.multi_modal_projector.to(args.device), + ) + + export_onnx(wrapper, image, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size) + if args.model_type == "llava_next": + 
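+        # For LLaVA-NeXT, also export the model's learned image_newline embedding
+        # next to the engine as image_newlines.safetensors.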
image_newline = model.image_newline.data + tensor_img_newline = {"image_newline": image_newline} + save_file(tensor_img_newline, + os.path.join(args.output_dir, "image_newlines.safetensors")) + + +def build_vila_engine(args): + # Note: VILA model is not in public HF model zoo yet. We need to explicitly import from the git repo + sys.path.append(args.vila_path) + from llava.model import LlavaLlamaConfig, LlavaLlamaModel # noqa + from transformers import AutoModel + model = AutoModel.from_pretrained( + args.model_path, + device_map='auto', + ) + + vision_tower = model.get_vision_tower() + image_processor = vision_tower.image_processor + raw_image = Image.new('RGB', [10, 10]) # dummy image + image = image_processor(images=raw_image, + return_tensors="pt")['pixel_values'] + if isinstance(image, list): + image = image[0].unsqueeze(0) + image = image.to(args.device, torch.float16) + + class VilaVisionWrapper(torch.nn.Module): + + def __init__(self, tower, projector): + super().__init__() + self.tower = tower + self.projector = projector + + def forward(self, image): + features = self.tower(image) + return self.projector(features) + + model = AutoModel.from_pretrained( + args.model_path, + device_map='auto', + ) + wrapper = VilaVisionWrapper(model.get_vision_tower().to(args.device), + model.mm_projector.to(args.device)) + export_onnx(wrapper, image, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size) + + +def build_nougat_engine(args): + processor = NougatProcessor.from_pretrained(args.model_path) + raw_image = Image.new('RGB', [10, 10]) # dummy image + image = processor(raw_image, return_tensors="pt")['pixel_values'].to( + args.device, torch.float16) + + class SwinEncoderWrapper(torch.nn.Module): + + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + + def forward(self, image): + return self.encoder(image).last_hidden_state + + model = VisionEncoderDecoderModel.from_pretrained(args.model_path, + torch_dtype=torch.float16) + swin_encoder = model.get_encoder().to(args.device) + wrapper = SwinEncoderWrapper(swin_encoder) + + export_onnx(wrapper, image, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size) + + +def build_cogvlm_engine(args): + hf_config = AutoConfig.from_pretrained(args.model_path, + trust_remote_code=True) + image_size = hf_config.vision_config['image_size'] + dtype = hf_config.torch_dtype + image = torch.empty(1, + 3, + image_size, + image_size, + dtype=dtype, + device=args.device) # dummy image + + class CogVlmVisionWrapper(torch.nn.Module): + + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + + def forward(self, image): + return self.encoder(image) + + cogvlm = AutoModelForCausalLM.from_pretrained(args.model_path, + torch_dtype=dtype, + trust_remote_code=True) + vit_encoder = cogvlm.model.vision.to(args.device).eval() + + wrapper = CogVlmVisionWrapper(vit_encoder) + export_onnx(wrapper, image, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size, + dtype=dtype) + + +def build_fuyu_engine(args): + processor = FuyuProcessor.from_pretrained(args.model_path) + raw_image = Image.new('RGB', [10, 
10]) + image = processor(text="dummy", images=raw_image, + return_tensors="pt")['image_patches'][0].to( + args.device, torch.float16).unsqueeze(0) + + class FuyuEncoderWrapper(torch.nn.Module): + + def __init__(self, linear): + super().__init__() + self.linear = linear.to(torch.float16) + + def forward(self, patches): + return self.linear(patches).flatten(0, 1) + + model = FuyuForCausalLM.from_pretrained(args.model_path, + torch_dtype=torch.float16) + + vision_encoder = model.vision_embed_tokens + wrapper = FuyuEncoderWrapper(vision_encoder).to(args.device) + + export_onnx(wrapper, + image, + f'{args.output_dir}/onnx', + dynamic_axes={'input': { + 0: 'batch', + 2: 'patch' + }}) + build_trt_engine( + args.model_type, + # [nImgs, nImgPatches, nDims] + # nImgs is always one since each query has exactly one image + # nImgPatches depends on image size (patch size: 30x30) + # nDims is 30x30x3=2700 (patch size x color channels) + [[1, 1, 2700], [1, 500, 2700], [1, 4096, 2700]], + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size) + + +def build_neva_engine(args): + # extract NeMo checkpoint + with tarfile.open(args.model_path) as tar: + nemo_config = yaml.safe_load(tar.extractfile("./model_config.yaml")) + try: + # trained without TP + mp0_weights = torch.load(tar.extractfile("./model_weights.ckpt"), + map_location=args.device) + except KeyError: + # trained with TP + mp0_weights = torch.load( + tar.extractfile("./mp_rank_00/model_weights.ckpt"), + map_location=args.device) + + vision_config = nemo_config["mm_cfg"]["vision_encoder"] + + class VisionEncoderWrapper(torch.nn.Module): + + def __init__(self, encoder, connector): + super().__init__() + self.encoder = encoder + self.connector = connector + + def forward(self, images): + vision_x = self.encoder(pixel_values=images, + output_hidden_states=True) + vision_x = vision_x.hidden_states[-2] + vision_x = vision_x[:, 1:] + vision_x = self.connector(vision_x) + return vision_x + + vision_path = vision_config["from_pretrained"] + joined_path = os.path.join(os.path.dirname(args.model_path), + os.path.basename(vision_path)) + if os.path.isdir(joined_path): + vision_path = joined_path + encoder = AutoModel.from_pretrained(vision_path, + torch_dtype=torch.bfloat16, + trust_remote_code=True) + vision_encoder = encoder.vision_model + hf_config = encoder.config + dtype = hf_config.torch_dtype + + # connector + assert nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "mlp2x_gelu" + vision_connector = torch.nn.Sequential( + torch.nn.Linear(vision_config["hidden_size"], + nemo_config["hidden_size"], + bias=True), torch.nn.GELU(), + torch.nn.Linear(nemo_config["hidden_size"], + nemo_config["hidden_size"], + bias=True)).to(dtype=dtype) + + key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector" + for layer in range(0, 3, 2): + vision_connector[layer].load_state_dict({ + 'weight': + mp0_weights[f"{key_prefix}.{layer}.weight"].to(dtype), + 'bias': + mp0_weights[f"{key_prefix}.{layer}.bias"].to(dtype), + }) + + # export the whole wrapper + wrapper = VisionEncoderWrapper(vision_encoder, + vision_connector).to(args.device, dtype) + image_size = hf_config.vision_config.image_size + dummy_image = torch.empty( + 1, 3, image_size, image_size, dtype=dtype, + device=args.device) # dummy image shape [B, C, H, W] + export_onnx(wrapper, dummy_image, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type, + [3, image_size, image_size], # [3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + 
args.max_batch_size, + dtype=dtype) + + +def build_video_neva_engine(args): + # extract NeMo checkpoint + with tarfile.open(args.model_path) as tar: + nemo_config = yaml.safe_load(tar.extractfile("./model_config.yaml")) + try: + # trained without TP + mp0_weights = torch.load(tar.extractfile("./model_weights.ckpt"), + map_location=args.device) + except KeyError: + # trained with TP + mp0_weights = torch.load( + tar.extractfile("./mp_rank_00/model_weights.ckpt"), + map_location=args.device) + + vision_config = nemo_config["mm_cfg"]["vision_encoder"] + + class VisionEncoderWrapper(torch.nn.Module): + + def __init__(self, encoder, connector): + super().__init__() + self.encoder = encoder + self.connector = connector + + def forward(self, images): + b, num_frames, c, h, w = images.shape + images = images.view(b * num_frames, c, h, w) + vision_x = self.encoder( + pixel_values=images, #[(B num_frames), C, H, W] + output_hidden_states=True) + vision_x = vision_x.hidden_states[-2] + vision_x = vision_x[:, 1:] + + # reshape back to [B, num_frames, img_size, hidden_size] + vision_x = vision_x.view(b, num_frames, -1, vision_x.shape[-1]) + + vision_x = self.connector(vision_x) + return vision_x + + encoder = AutoModel.from_pretrained(vision_config["from_pretrained"], + torch_dtype=torch.bfloat16, + trust_remote_code=True) + vision_encoder = encoder.vision_model + hf_config = encoder.config + dtype = hf_config.torch_dtype + + # connector + assert nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "linear" + vision_connector = torch.nn.Linear(vision_config["hidden_size"], + nemo_config["hidden_size"], + bias=True) + + key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector" + vision_connector.load_state_dict({ + 'weight': + mp0_weights[f"{key_prefix}.weight"].to(dtype), + 'bias': + mp0_weights[f"{key_prefix}.bias"].to(dtype), + }) + + # export the whole wrapper + wrapper = VisionEncoderWrapper(vision_encoder, + vision_connector).to(args.device, dtype) + image_size = hf_config.vision_config.image_size + num_frames = nemo_config['data']['num_frames'] + dummy_video = torch.empty(1, + num_frames, + 3, + image_size, + image_size, + dtype=dtype, + device=args.device) # dummy image + export_onnx(wrapper, dummy_video, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type, + [num_frames, 3, image_size, image_size], # [num_frames, 3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size, + dtype=dtype, + num_frames=num_frames) + + +def build_kosmos_engine(args): + processor = AutoProcessor.from_pretrained(args.model_path) + raw_image = Image.new('RGB', [10, 10]) # dummy image + image = processor(text="dummy", images=raw_image, + return_tensors="pt")['pixel_values'].to( + args.device, torch.float16) + + class VisionEncoderWrapper(torch.nn.Module): + + def __init__(self, encoder, connector): + super().__init__() + self.encoder = encoder + self.connector = connector + + def forward(self, images): + vision_x = self.encoder(images, output_hidden_states=True) + img_features = self.encoder.model.post_layernorm( + vision_x.last_hidden_state) + img_features = F.normalize(img_features, dim=-1) + img_features, _ = self.connector(img_features) + return img_features + + model = AutoModelForVision2Seq.from_pretrained(args.model_path, + torch_dtype=torch.float16) + wrapper = VisionEncoderWrapper( + model.vision_model.to(args.device), + model.image_to_text_projection.to(args.device)) + + export_onnx(wrapper, image, f'{args.output_dir}/onnx') + build_trt_engine( 
+ args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + f'{args.output_dir}/onnx', + args.output_dir, + args.max_batch_size) + + +def build_phi_engine(args): + processor = AutoProcessor.from_pretrained(args.model_path, + trust_remote_code=True) + raw_image = Image.new('RGB', [10, 10]) # dummy image + image = processor(text="<|image_1|>\ndummy", + images=raw_image, + return_tensors="pt")['pixel_values'].to( + args.device, torch.float16) + try: + with open(f"{args.model_path}/preprocessor_config.json", "r") as file: + config = file.read() + config_dict = json.loads(config) + num_crops = config_dict.get("num_crops") + except: + num_crops = 16 + + class Phi3VisionWrapper(torch.nn.Module): + + def __init__(self, img_processor, img_projection, layer_idx, + image_dim_out): + super().__init__() + self.img_processor = img_processor + self.img_projection = img_projection + self.layer_idx = layer_idx + self.image_dim_out = image_dim_out + + def get_img_features( + self, img_embeds: torch.FloatTensor) -> torch.FloatTensor: + LAYER_IDX = self.layer_idx + + img_processor_output = self.img_processor(img_embeds, + output_hidden_states=True) + img_feature = img_processor_output.hidden_states[LAYER_IDX] + + patch_feature = img_feature[:, 1:] + return patch_feature + + def forward(self, image): + img_features = self.get_img_features(image) + base_feat_height = int(math.sqrt(img_features.shape[1])) + C = self.image_dim_out + H = base_feat_height + img_features = img_features.reshape(-1, H, H, C).reshape( + -1, H // 2, 2, H // 2, 2, + C).contiguous().permute(0, 1, 3, 2, 4, + 5).reshape(-1, H // 2, H // 2, + 4 * C).contiguous() + return self.apply_img_projection(img_features) + + def apply_img_projection(self, input): + return self.img_projection(input) + + model = AutoModelForCausalLM.from_pretrained(args.model_path, + torch_dtype=torch.float16, + trust_remote_code=True).to( + args.device) + + wrapper = Phi3VisionWrapper(model.model.vision_embed_tokens.img_processor, + model.model.vision_embed_tokens.img_projection, + model.model.vision_embed_tokens.layer_idx, + model.model.vision_embed_tokens.image_dim_out) + image = image.flatten(0, 1) + glb_GN = wrapper.apply_img_projection( + model.model.vision_embed_tokens.glb_GN) + sub_GN = wrapper.apply_img_projection( + model.model.vision_embed_tokens.sub_GN) + tensors = {"glb_GN": glb_GN, "sub_GN": sub_GN} + save_file(tensors, args.output_dir + "/image_newlines.safetensors") + export_onnx(wrapper, image, f'{args.output_dir}/onnx') + build_trt_engine( + args.model_type, [image.shape[1], image.shape[2], image.shape[3]], + f'{args.output_dir}/onnx', args.output_dir, + args.max_batch_size * (num_crops + 1)) #TODO: Take input from config + +if __name__ == '__main__': + # processor = NougatProcessor.from_pretrained(args.model_path) + # raw_image = Image.new('RGB', [10, 10]) # dummy image + # image = processor(raw_image, return_tensors="pt")['pixel_values'].to(args.device, torch.float16) + import sys + max_batch_size = int(sys.argv[1]) + output_dir=sys.argv[2] + class SwinEncoderWrapper(torch.nn.Module): + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + + def forward(self, image): + return self.encoder(image).last_hidden_state + device="cuda" + model_type = 'nougat' + + model = VisionEncoderDecoderModel.from_pretrained("weights/unimernet_clean",torch_dtype=torch.float16) + swin_encoder = model.get_encoder().to(device) + wrapper = SwinEncoderWrapper(swin_encoder) + image = torch.randn(max_batch_size, 3, 192, 
672).to(device, torch.float16) + export_onnx(wrapper, image, f'{output_dir}/onnx') + build_trt_engine(model_type, + [image.shape[1], image.shape[2], image.shape[3]], # [3, H, W] + f'{output_dir}/onnx', + output_dir, + max_batch_size) + diff --git a/convension/unimernet/convert_checkpoint.py b/convension/unimernet/convert_checkpoint.py new file mode 100644 index 0000000..d01058f --- /dev/null +++ b/convension/unimernet/convert_checkpoint.py @@ -0,0 +1,1535 @@ +import argparse +import configparser +import copy +import json +import logging +import os +import types +from ast import literal_eval +from datetime import datetime +from pathlib import Path + +import safetensors +from helper import convert_weight_to_dtype, fuse_qkv_one_layer, reshape, split +from transformers import (AutoModelForSeq2SeqLM, Blip2ForConditionalGeneration, + MBartForConditionalGeneration, + Pix2StructForConditionalGeneration, + T5ForConditionalGeneration, VisionEncoderDecoderModel) + +from tensorrt_llm.functional import (LayerNormPositionType, LayerNormType, + MLPType) +from tensorrt_llm.models import PretrainedConfig + +dir_path = os.path.dirname(os.path.realpath(__file__)) +LOGGER = logging.getLogger(__name__) + +layernorm_type_map = {i.name: i.value for i in LayerNormType} +layernorm_position_map = {i.name: i.value for i in LayerNormPositionType} +mlp_type_map = {i.name: i.value for i in MLPType} + + +def copy_args_to_component_config(component_config, args): + for arg in vars(args): + setattr(component_config, arg, getattr(args, arg)) + return component_config + + +def parse_t5_config(args, hf_model): + config = configparser.ConfigParser() + + config["encoder"] = {} + for key, val in hf_model.encoder.config.to_dict().items(): + config["encoder"][key] = f"{val}" + + # manually set q_scaling to offset attention scaling's effect. 
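+    # Note (added explanation, based on my reading of the TRT-LLM enc_dec example):
+    # the fused attention kernel divides QK^T by q_scaling * sqrt(head_size), while
+    # T5 uses unscaled attention, so q_scaling = 1 / sqrt(head_size) makes the net
+    # scaling factor 1.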
+ # TODO: modify kernels to control whether to disable attention scaling + def get_offset_q_scaling(config): + scaling = 1 / config.head_size**.5 + return scaling + + config["decoder"] = {} + for key, val in hf_model.decoder.config.to_dict().items(): + config["decoder"][key] = f"{val}" + + config["structure"] = dict() + config["structure"]["t5_with_bias"] = "false" + config["structure"]["use_gated_activation"] = str( + hf_model.encoder.config.is_gated_act) + config["structure"]["position_embedding_type"] = "relative" + config["structure"]["model_type"] = args.model_type + + def parse_t5_config_by_component(config, component, args): + component_config = types.SimpleNamespace() + component_config = copy_args_to_component_config(component_config, args) + component_config.n_head = config.getint(component, 'num_heads') + component_config.head_size = config.getint(component, 'd_kv') + component_config.hidden_size = config.getint(component, 'd_model') + component_config.ffn_hidden_size = config.getint(component, 'd_ff') + component_config.vocab_size = config.getint(component, 'vocab_size') + component_config.n_positions = config.getint(component, + 'n_positions', + fallback=512) + component_config.has_position_embedding = config.getboolean( + component, 'has_position_embedding', + fallback=False) # TODO: hardcoded here + + component_config.has_token_type_embedding = config.getboolean( + component, 'has_token_type_embedding', fallback=False) + component_config.has_embedding_layernorm = config.getboolean( + component, 'has_embedding_layernorm', fallback=False) + component_config.has_embedding_scale = config.getboolean( + component, 'has_embedding_scale', fallback=False) + component_config.q_scaling = get_offset_q_scaling(component_config) + component_config.has_attention_qkvo_bias = config.getboolean( + component, 'has_attention_qkvo_bias', + fallback=False) # TODO: hardcoded here + component_config.has_mlp_bias = config.getboolean(component, + 'has_mlp_bias', + fallback=False) + component_config.has_model_final_layernorm = config.getboolean( + component, 'has_model_final_layernorm', fallback=True) + component_config.layernorm_eps = config.getfloat( + component, 'layer_norm_epsilon') + component_config.layernorm_position = layernorm_position_map[config.get( + component, 'layernorm_position', + fallback='pre_layernorm')] # TODO: hardcoded here + component_config.layernorm_type = layernorm_type_map[config.get( + component, 'layernorm_type', fallback='RmsNorm')] + component_config.hidden_act = config.get(component, 'dense_act_fn') + component_config.gated_act = config.getboolean(component, + 'is_gated_act') + component_config.mlp_type = mlp_type_map['GatedMLP' if component_config. 
+ gated_act else 'MLP'] + component_config.num_buckets = config.getint( + component, 'relative_attention_num_buckets') + component_config.max_distance = config.getint( + component, 'relative_attention_max_distance') + component_config.position_embedding_type = config.get( + 'structure', 'position_embedding_type') + component_config.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') + + if component == 'encoder': + component_config.n_layer = config.getint(component, 'num_layers') + + component_config.relative_attention = config.get( + 'structure', 'position_embedding_type') == 'relative' + + elif component == 'decoder': + component_config.n_layer = config.getint(component, + 'num_decoder_layers') + component_config.has_lm_head_bias = config.getboolean( + component, # TODO: T5 with bias + 'has_lm_head_bias', + fallback=False) + component_config.relative_attention = config.getboolean( + component, 'relative_attention', fallback=True) + component_config.rescale_before_lm_head = config.getboolean( + component, 'tie_word_embeddings' + ) # default is True (for T5), but False for Flan-T5 + component_config.encoder_hidden_size = config.getint( + 'encoder', 'd_model') + component_config.encoder_num_heads = config.getint( + 'encoder', 'num_heads') + component_config.encoder_head_size = config.getint( + 'encoder', 'd_kv') + component_config.decoder_start_token_id = config.getint( + 'decoder', 'decoder_start_token_id') + component_config.eos_token_id = config.getint( + 'decoder', 'eos_token_id') + bos_token_id = config.get('decoder', 'bos_token_id') + # T5 does not have bos_token_id + component_config.bos_token_id = int( + bos_token_id) if bos_token_id != "None" else None + component_config.pad_token_id = config.getint( + 'decoder', 'pad_token_id') + + else: + assert False, 'Unsupported component!' 
+ + return component_config + + encoder_config = parse_t5_config_by_component(config, "encoder", args) + decoder_config = parse_t5_config_by_component(config, "decoder", args) + + return encoder_config, decoder_config + + +def convert_t5_weights_to_tllm_safetensors(config, component, params): + weights = {} + + mapping = config.mapping + + convert_weight_to_dtype(params, config.dtype) + hidden_size = config.hidden_size + ffn_hidden_size = config.intermediate_size + num_layers = config.num_hidden_layers + n_head = config.num_attention_heads + head_size = config.head_size + attention_hidden_size = n_head * head_size # head size * num_heads not necessarily equals hidden_dim, such as Flan-T5 + + hf_param_prefix = f'{component}' + trtllm_layer_name = f'{component}_layers' + trtllm_attn_layer_name = 'attention' if component == 'encoder' else 'self_attention' + trtllm_attn_layernorm_name = 'self_attention_layernorm' if component == 'decoder' else 'attention_layernorm' + hf_component_idx = 1 if component == 'encoder' else 2 + + def get_attn_module_name(component, block, layer, attn_type): + return f'{component}.block.{int(block)}.layer.{int(layer)}.{attn_type}' + + weights['embedding.vocab_embedding.weight'] = reshape( + params['shared.weight'].clone(), None) + + layers_range = mapping.pp_layers(num_layers) + for layer_idx in layers_range: + local_layer_idx = layer_idx - layers_range[0] + trtllm_layer_name_prefix = f'{trtllm_layer_name}.{local_layer_idx}' + hf_layer_name_prefix = f'{hf_param_prefix}.block.{layer_idx}' + + hidden_layer_name_split = { + f'{hf_layer_name_prefix}.layer.0.SelfAttention.o.weight': { + "name": + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}.dense.weight', + "shape": + (hidden_size, attention_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + f'{hf_layer_name_prefix}.layer.{hf_component_idx}.DenseReluDense.wo.weight': + { + "name": f'{trtllm_layer_name_prefix}.mlp.proj.weight', + "shape": (hidden_size, ffn_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + f'{hf_layer_name_prefix}.layer.{hf_component_idx}.DenseReluDense.wi.weight': + { + "name": f'{trtllm_layer_name_prefix}.mlp.fc.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + f'{hf_layer_name_prefix}.layer.{hf_component_idx}.DenseReluDense.wi_0.weight': + { + "name": f'{trtllm_layer_name_prefix}.mlp.fc.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + } + + hidden_layer_name_no_split = { + f'{hf_layer_name_prefix}.layer.0.layer_norm.weight': { + "name": + f'{trtllm_layer_name_prefix}.{trtllm_attn_layernorm_name}.weight', + "shape": None + }, + f'{hf_layer_name_prefix}.layer.{hf_component_idx}.layer_norm.weight': + { + "name": f'{trtllm_layer_name_prefix}.mlp_layernorm.weight', + "shape": None + }, + } + + if config.gated_act: + hidden_layer_name_split.update({ + f'{hf_layer_name_prefix}.layer.{hf_component_idx}.DenseReluDense.wi2.weight': + { + "name": f'{trtllm_layer_name_prefix}.mlp.gate.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + f'{hf_layer_name_prefix}.layer.{hf_component_idx}.DenseReluDense.wi_1.weight': + { + "name": f'{trtllm_layer_name_prefix}.mlp.gate.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + }) + + if component == 'decoder': + hidden_layer_name_split.update({ + f'{hf_layer_name_prefix}.layer.1.EncDecAttention.o.weight': { + "name": + f'{trtllm_layer_name_prefix}.cross_attention.dense.weight', + 
"shape": + (hidden_size, attention_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + }) + hidden_layer_name_no_split.update({ + f'{hf_layer_name_prefix}.layer.1.layer_norm.weight': { + "name": + f'{trtllm_layer_name_prefix}.cross_attention_layernorm.weight', + "shape": None + }, + }) + self_attn_module_name = get_attn_module_name( + component, layer_idx, "1", 'EncDecAttention') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.cross_attention', + mapping.tp_size, mapping.tp_rank, config.model_type, + (attention_hidden_size * 3 // mapping.tp_size, hidden_size), + None)) + + self_attn_module_name = get_attn_module_name(component, layer_idx, "0", + 'SelfAttention') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}', + mapping.tp_size, mapping.tp_rank, config.model_type, + (attention_hidden_size * 3 // mapping.tp_size, hidden_size), + None)) + + weights[ + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}.rel_attn_table'] = reshape( + split( + params[ + f'{component}.block.0.layer.0.SelfAttention.relative_attention_bias.weight'] + .T, mapping.tp_size, mapping.tp_rank, 0), + (n_head // mapping.tp_size, config.num_buckets)) + + for hf_weight_name, weight_info in hidden_layer_name_split.items(): + if hf_weight_name in params.keys(): + weights[weight_info["name"]] = reshape( + split(params[hf_weight_name], + mapping.tp_size, + mapping.tp_rank, + dim=weight_info["split_dim"]), weight_info["shape"]) + for hf_weight_name, weight_info in hidden_layer_name_no_split.items(): + if hf_weight_name in params.keys(): + weights[weight_info["name"]] = reshape( + params[hf_weight_name].clone(), shape=weight_info["shape"]) + + weights['final_layernorm.weight'] = reshape( + params[f'{component}.final_layer_norm.weight'].clone(), None) + + if component == 'decoder': + weights['lm_head.weight'] = reshape( + split(params['lm_head.weight'], + mapping.tp_size, + mapping.tp_rank, + dim=0), (config.vocab_size // mapping.tp_size, hidden_size)) + if not config.use_implicit_relative_attention: + weights['rel_attn_table'] = reshape( + split( + params[ + f'{component}.block.0.layer.0.SelfAttention.relative_attention_bias.weight'] + .T, mapping.tp_size, mapping.tp_rank, 0), + (n_head // mapping.tp_size, config.num_buckets)) + + return weights + + +convert_blip2_weights_to_tllm_safetensors = convert_t5_weights_to_tllm_safetensors # func alias + + +def parse_nmt_config(args, model): + config = configparser.ConfigParser() + fairseq_config = vars(model.cfg.model) # Namespace --> dict + + config['encoder'] = dict() + for key, val in fairseq_config.items(): + config["encoder"][key] = f"{val}" + config["encoder"]["q_scaling"] = '1' + # NMT has final layernorm for pre-norm model architecture. 
+ config['encoder']['has_model_final_layernorm'] = config['encoder'][ + 'encoder_normalize_before'] + config['encoder']['vocab_size'] = str(len(model.src_dict)) # fairseq naming + + config['decoder'] = dict() + for key, val in fairseq_config.items(): + config["decoder"][key] = f"{val}" + config["decoder"]["q_scaling"] = '1' + config["decoder"]["rescale_before_lm_head"] = 'false' + config['decoder']['has_model_final_layernorm'] = str( + config['decoder'].getboolean('decoder_normalize_before', False) + and not config['decoder'].getboolean('no_decoder_final_norm', False)) + config['decoder']['vocab_size'] = str(len(model.tgt_dict)) # fairseq naming + + config["structure"] = dict() + config["structure"]["t5_with_bias"] = "true" + config["structure"]["use_gated_activation"] = "false" + config["structure"][ + "position_embedding_type"] = "learned_absolute" # "sinusoid" + config["structure"]["model_type"] = args.model_type + + def parse_nmt_config_by_component(config, component, args): + assert component in ('encoder', 'decoder'), 'Unsupported component!' + component_config = types.SimpleNamespace() + component_config = copy_args_to_component_config(component_config, args) + component_config.n_layer = config.getint(component, + f'{component}_layers') + component_config.n_head = config.getint(component, + f'{component}_attention_heads') + component_config.hidden_size = config.getint( + component, f'{component}_embed_dim') # fairseq naming + component_config.head_size = config.getint( + component, + 'd_kv', + fallback=component_config.hidden_size // component_config.n_head) + component_config.ffn_hidden_size = config.getint( + component, f'{component}_ffn_embed_dim') # fairseq naming + component_config.vocab_size = config.getint(component, 'vocab_size') + component_config.n_positions = config.getint( + component, 'max_source_positions') # fairseq naming + component_config.has_position_embedding = not config.getboolean( + component, 'no_token_positional_embeddings', + fallback=False) # fairseq naming + component_config.has_token_type_embedding = config.getboolean( + component, 'has_token_type_embedding', fallback=False) + component_config.has_embedding_layernorm = config.getboolean( + component, 'layernorm_embedding', fallback=True) # fairseq naming + component_config.has_embedding_scale = not config.getboolean( + component, 'no_scale_embedding') # fairseq naming + component_config.q_scaling = config.getfloat(component, + 'q_scaling', + fallback=1.0) + component_config.has_attention_qkvo_bias = config.getboolean( + 'structure', 't5_with_bias', fallback=True) + component_config.has_mlp_bias = config.getboolean('structure', + 't5_with_bias', + fallback=True) + component_config.has_model_final_layernorm = config.getboolean( + component, 'has_model_final_layernorm') + component_config.layernorm_eps = config.getfloat( + component, 'layer_norm_epsilon', fallback=1e-5) # fairseq naming + + normalize_before = config.getboolean( + component, f'{component}_normalize_before') # fairseq naming + component_config.layernorm_position = layernorm_position_map[ + 'pre_layernorm' if normalize_before else 'post_layernorm'] + + component_config.layernorm_type = layernorm_type_map[config.get( + component, 'layernorm_type', fallback='LayerNorm')] + component_config.hidden_act = config.get( + component, 'activation_fn') # fairseq naming + component_config.gated_act = config.getboolean(component, + 'is_gated_act', + fallback=False) + component_config.mlp_type = mlp_type_map['GatedMLP' if component_config. 
+ gated_act else 'MLP'] + component_config.relative_attention = config.get( + 'structure', 'position_embedding_type') == 'relative' + + component_config.num_buckets = config.getint( + component, 'relative_attention_num_buckets', fallback=0) + component_config.max_distance = config.getint( + component, 'relative_attention_max_distance', fallback=0) + component_config.position_embedding_type = config.get( + 'structure', 'position_embedding_type') + component_config.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') + if component == 'decoder': + component_config.rescale_before_lm_head = config.getboolean( + component, 'rescale_before_lm_head') + + component_config.encoder_hidden_size = config.getint( + 'encoder', 'encoder_embed_dim') # fairseq naming + component_config.encoder_num_heads = config.getint( + 'encoder', 'encoder_attention_heads') + component_config.encoder_head_size = config.getint( + 'encoder', + 'd_kv', + fallback=component_config.encoder_hidden_size // + component_config.encoder_num_heads) + component_config.decoder_start_token_id = None + component_config.eos_token_id = None + component_config.bos_token_id = None + component_config.pad_token_id = None + + return component_config + + encoder_config = parse_nmt_config_by_component(config, "encoder", args) + decoder_config = parse_nmt_config_by_component(config, "decoder", args) + + return encoder_config, decoder_config + + +def convert_nmt_weights_to_tllm_safetensors(config, component, params, + sin_pos_embedding): + weights = {} + + mapping = config.mapping + + hidden_size = config.hidden_size + + convert_weight_to_dtype(params, config.dtype) + ffn_hidden_size = config.intermediate_size + vocab_size = config.vocab_size + + hf_param_prefix = f'models.0.{component}' + trtllm_layer_name = f'{component}_layers' + trtllm_attn_layer_name = 'attention' if component == 'encoder' else 'self_attention' + trtllm_attn_layernorm_name = 'self_attention_layernorm' if component == 'decoder' else 'attention_layernorm' + + hidden_layer_name_split = { + 'self_attn.out_proj.weight': { + "name": f'{trtllm_attn_layer_name}.dense.weight', + "shape": (hidden_size, hidden_size // mapping.tp_size), + "split_dim": -1 + }, + 'fc1.weight': { + "name": 'mlp.fc.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + 'fc1.bias': { + "name": 'mlp.fc.bias', + "shape": (ffn_hidden_size // mapping.tp_size), + "split_dim": 0 + }, + 'fc2.weight': { + "name": 'mlp.proj.weight', + "shape": (hidden_size, ffn_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + } + + hidden_layer_name_no_split = { + 'self_attn.out_proj.bias': { + "name": f'{trtllm_attn_layer_name}.dense.bias', + "shape": (hidden_size) + }, + 'self_attn_layer_norm.weight': { + "name": f'{trtllm_attn_layernorm_name}.weight', + "shape": None + }, + 'self_attn_layer_norm.bias': { + "name": f'{trtllm_attn_layernorm_name}.bias', + "shape": None + }, + 'fc2.bias': { + "name": 'mlp.proj.bias', + "shape": (hidden_size) + }, + 'final_layer_norm.weight': { + "name": 'mlp_layernorm.weight', + "shape": None + }, + 'final_layer_norm.bias': { + "name": 'mlp_layernorm.bias', + "shape": None + }, + } + + if component == "decoder": + hidden_layer_name_split.update({ + 'encoder_attn.out_proj.weight': { + "name": 'cross_attention.dense.weight', + "shape": (hidden_size, hidden_size // mapping.tp_size), + "split_dim": -1 + }, + }) + hidden_layer_name_no_split.update({ + 'encoder_attn.out_proj.bias': { + "name": 'cross_attention.dense.bias', + "shape": 
(hidden_size) + }, + 'encoder_attn_layer_norm.weight': { + "name": 'cross_attention_layernorm.weight', + "shape": None, + }, + 'encoder_attn_layer_norm.bias': { + "name": 'cross_attention_layernorm.bias', + "shape": None + }, + }) + + def get_attn_module_name(component, layer, attn_type): + return f'models.0.{component}.layers.{int(layer)}.{attn_type}' + + weights["embedding.vocab_embedding.weight"] = reshape( + params[f'{hf_param_prefix}.embed_tokens.weight'].clone(), + (vocab_size, -1)) + weights["embedding.position_embedding.weight"] = reshape( + sin_pos_embedding, (config.max_position_embeddings, hidden_size)) + + num_layers = config.num_hidden_layers + + layers_range = mapping.pp_layers(num_layers) + for layer_idx in layers_range: + local_layer_idx = layer_idx - layers_range[0] + hf_layer_name_prefix = f'{hf_param_prefix}.layers.{layer_idx}' + trtllm_layer_name_prefix = f'{trtllm_layer_name}.{local_layer_idx}' + + for hf_weight_name, weight_info in hidden_layer_name_split.items(): + weights[ + f'{trtllm_layer_name_prefix}.{weight_info["name"]}'] = reshape( + split(params[f'{hf_layer_name_prefix}.{hf_weight_name}'], + mapping.tp_size, + mapping.tp_rank, + dim=weight_info["split_dim"]), weight_info["shape"]) + + for hf_weight_name, weight_info in hidden_layer_name_no_split.items(): + trtllm_layer_fullname = f'{trtllm_layer_name_prefix}.{weight_info["name"]}' + hf_layer_fullname = f'{hf_layer_name_prefix}.{hf_weight_name}' + weights[trtllm_layer_fullname] = reshape( + params[hf_layer_fullname].clone(), shape=weight_info["shape"]) + + self_attn_module_name = get_attn_module_name(component, layer_idx, + 'self_attn') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}', + mapping.tp_size, mapping.tp_rank, config.model_type, + (hidden_size * 3 // mapping.tp_size, hidden_size), + (hidden_size * 3 // mapping.tp_size))) + if component == 'decoder': + cross_attn_module_name = get_attn_module_name( + component, layer_idx, 'encoder_attn') + weights.update( + fuse_qkv_one_layer( + params, cross_attn_module_name, + f'{trtllm_layer_name_prefix}.cross_attention', + mapping.tp_size, mapping.tp_rank, config.model_type, + (hidden_size * 3 // mapping.tp_size, hidden_size), + (hidden_size * 3 // mapping.tp_size))) + + if component == 'decoder': + weights['lm_head.weight'] = reshape( + split(params[f'{hf_param_prefix}.output_projection.weight'], + mapping.tp_size, + mapping.tp_rank, + dim=0), (config.vocab_size // mapping.tp_size, hidden_size)) + + if config.has_model_final_layernorm: + weights['final_layernorm.weight'] = params[ + f'{hf_param_prefix}.layer_norm.weight'].clone() + weights['final_layernorm.bias'] = params[ + f'{hf_param_prefix}.layer_norm.bias'].clone() + + return weights + + +def parse_bart_config(args, hf_model): + + config = configparser.ConfigParser() + + config['decoder'] = dict() + for key, val in hf_model.model.decoder.config.to_dict().items(): + config["decoder"][key] = f"{val}" + config["decoder"]["q_scaling"] = '1' + config["decoder"]["rescale_before_lm_head"] = str(False) + config['decoder']['has_model_final_layernorm'] = str( + args.nougat or isinstance(hf_model, MBartForConditionalGeneration)) + + if args.nougat: + # These flags are true for mbart decoders, but missing in HF config + config['decoder']['normalize_before'] = str(True) + config['decoder']['normalize_embeddings'] = str(True) + + config['encoder'] = dict() + # Init few encoder configs, needed by build, from decoder config + 
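+        # Note (added): for nougat there is no HF text-encoder config (the encoder
+        # is a Swin vision model), so the encoder fields the TRT-LLM builder needs
+        # are copied over from the decoder config below.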
encoder_config_keys = [ + "encoder_ffn_dim", "encoder_layers", "encoder_attention_heads", + "encoder_layerdrop", "d_model" + ] + for key in encoder_config_keys: + config['encoder'][key] = config['decoder'][key] + else: + config['encoder'] = dict() + for key, val in hf_model.model.encoder.config.to_dict().items(): + config["encoder"][key] = f"{val}" + config["encoder"]["q_scaling"] = '1' + + # mBART has final layernorm, BART does not + config['encoder']['has_model_final_layernorm'] = str( + isinstance(hf_model, MBartForConditionalGeneration)) + + config["structure"] = dict() + config["structure"]["t5_with_bias"] = "true" + config["structure"]["use_gated_activation"] = "false" + config["structure"]["position_embedding_type"] = "learned_absolute" + config["structure"]["model_type"] = args.model_type + + def parse_bart_config_by_component(config, component, args): + assert component in ('encoder', 'decoder'), 'Unsupported component!' + component_config = types.SimpleNamespace() + component_config = copy_args_to_component_config(component_config, args) + component_config.n_layer = config.getint(component, + f'{component}_layers') + component_config.n_head = config.getint(component, + f'{component}_attention_heads') + component_config.hidden_size = config.getint(component, 'd_model') + component_config.head_size = config.getint( + component, + 'd_kv', + fallback=component_config.hidden_size // component_config.n_head) + component_config.ffn_hidden_size = config.getint( + component, f'{component}_ffn_dim') + component_config.vocab_size = config.getint(component, 'vocab_size') + component_config.n_positions = config.getint(component, + 'max_position_embeddings') + component_config.has_position_embedding = config.getboolean( + component, 'has_position_embedding', + fallback=True) # TODO: hardcoded here + component_config.has_token_type_embedding = config.getboolean( + component, 'has_token_type_embedding', fallback=False) + component_config.has_embedding_layernorm = config.getboolean( + component, 'has_embedding_layernorm', fallback=True) + component_config.has_embedding_scale = config.getboolean( + component, 'scale_embedding') + component_config.q_scaling = config.getfloat(component, + 'q_scaling', + fallback=1.0) + component_config.has_attention_qkvo_bias = config.getboolean( + 'structure', 't5_with_bias', fallback=True) + component_config.has_mlp_bias = config.getboolean('structure', + 't5_with_bias', + fallback=True) + component_config.has_model_final_layernorm = config.getboolean( + component, 'has_model_final_layernorm') + component_config.layernorm_eps = config.getfloat(component, + 'layer_norm_epsilon', + fallback=False) + + normalize_before = config.getboolean(component, 'normalize_before') + component_config.layernorm_position = layernorm_position_map[ + 'pre_layernorm' if normalize_before else 'post_layernorm'] + + component_config.layernorm_type = layernorm_type_map[config.get( + component, 'layernorm_type', fallback='LayerNorm')] + component_config.hidden_act = config.get(component, + 'activation_function') + component_config.gated_act = config.getboolean(component, + 'is_gated_act', + fallback=False) + component_config.mlp_type = mlp_type_map['GatedMLP' if component_config. 
+ gated_act else 'MLP'] + component_config.relative_attention = config.get( + 'structure', 'position_embedding_type') == 'relative' + + component_config.num_buckets = config.getint( + component, 'relative_attention_num_buckets', fallback=0) + component_config.max_distance = config.getint( + component, 'relative_attention_max_distance', fallback=0) + component_config.max_lora_rank = config.getint(component, + 'max_lora_rank', + fallback=0) + component_config.lora_target_modules = literal_eval( + config.get(component, 'lora_target_modules', fallback="[]")) + component_config.hf_modules_to_trtllm_modules = literal_eval( + config.get(component, 'hf_modules_to_trtllm_modules', + fallback="{}")) + component_config.trtllm_modules_to_hf_modules = literal_eval( + config.get(component, 'trtllm_modules_to_hf_modules', + fallback="{}")) + component_config.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') + component_config.position_embedding_type = config.get( + 'structure', 'position_embedding_type') + + if component == 'decoder': + component_config.rescale_before_lm_head = config.getboolean( + component, 'rescale_before_lm_head') + + component_config.encoder_hidden_size = config.getint( + 'encoder', 'd_model') + component_config.encoder_num_heads = config.getint( + 'encoder', 'encoder_attention_heads') + component_config.encoder_head_size = config.getint( + 'encoder', + 'd_kv', + fallback=component_config.encoder_hidden_size // + component_config.encoder_num_heads) + + # nougat has decoder_start_token_id = None, special handling + decoder_start_token_id = config.get('decoder', + 'decoder_start_token_id') + component_config.decoder_start_token_id = int( + decoder_start_token_id + ) if decoder_start_token_id != "None" else None + component_config.eos_token_id = config.getint( + 'decoder', 'eos_token_id') + component_config.bos_token_id = config.getint( + 'decoder', 'bos_token_id') + component_config.pad_token_id = config.getint( + 'decoder', 'pad_token_id') + + return component_config + + encoder_config = None + if not args.nougat: + encoder_config = parse_bart_config_by_component(config, "encoder", args) + decoder_config = parse_bart_config_by_component(config, "decoder", args) + + return encoder_config, decoder_config + + +def convert_bart_weights_to_tllm_safetensors(config, component, params): + weights = {} + + mapping = config.mapping + + hidden_size = config.hidden_size + + convert_weight_to_dtype(params, config.dtype) + ffn_hidden_size = config.intermediate_size + vocab_size = config.vocab_size + + hf_param_prefix = f'model.{component}' + trtllm_layer_name = f'{component}_layers' + trtllm_attn_layer_name = 'attention' if component == 'encoder' else 'self_attention' + trtllm_attn_layernorm_name = 'self_attention_layernorm' if component == 'decoder' else 'attention_layernorm' + embedding_layer_names = { + 'embed_tokens.weight': { + "name": 'embedding.vocab_embedding.weight', + "shape": (vocab_size, -1) + }, + 'embed_positions.weight': { + "name": 'embedding.position_embedding.weight', + "shape": (config.max_position_embeddings, hidden_size) + }, + 'layernorm_embedding.weight': { + "name": 'embedding.embedding_layernorm.weight', + "shape": None + }, + 'layernorm_embedding.bias': { + "name": 'embedding.embedding_layernorm.bias', + "shape": None + }, + } + + hidden_layer_name_split = { + 'self_attn.out_proj.weight': { + "name": f'{trtllm_attn_layer_name}.dense.weight', + "shape": (hidden_size, hidden_size // mapping.tp_size), + "split_dim": -1 + }, + 'fc1.weight': { + 
"name": 'mlp.fc.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + 'fc1.bias': { + "name": 'mlp.fc.bias', + "shape": (ffn_hidden_size // mapping.tp_size), + "split_dim": 0 + }, + 'fc2.weight': { + "name": 'mlp.proj.weight', + "shape": (hidden_size, ffn_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + } + + hidden_layer_name_no_split = { + 'self_attn.out_proj.bias': { + "name": f'{trtllm_attn_layer_name}.dense.bias', + "shape": (hidden_size) + }, + 'self_attn_layer_norm.weight': { + "name": f'{trtllm_attn_layernorm_name}.weight', + "shape": None + }, + 'self_attn_layer_norm.bias': { + "name": f'{trtllm_attn_layernorm_name}.bias', + "shape": None + }, + 'fc2.bias': { + "name": 'mlp.proj.bias', + "shape": (hidden_size) + }, + 'final_layer_norm.weight': { + "name": 'mlp_layernorm.weight', + "shape": None + }, + 'final_layer_norm.bias': { + "name": 'mlp_layernorm.bias', + "shape": None + }, + } + + if config.model_type == 'mbart': + hidden_layer_name_split['layer_norm.weight'] = { + "name": 'final_layernorm.weight', + "shape": None, + "split_dim": 0 + } + hidden_layer_name_no_split['layer_norm.bias'] = { + "name": 'final_layernorm.bias', + "shape": None, + "split_dim": 0 + } + + if component == "decoder": + hidden_layer_name_split.update({ + 'encoder_attn.out_proj.weight': { + "name": 'cross_attention.dense.weight', + "shape": (hidden_size, hidden_size // mapping.tp_size), + "split_dim": -1 + } + }) + hidden_layer_name_no_split.update({ + 'encoder_attn.out_proj.bias': { + "name": 'cross_attention.dense.bias', + "shape": (hidden_size) + }, + 'encoder_attn_layer_norm.weight': { + "name": 'cross_attention_layernorm.weight', + "shape": None + }, + 'encoder_attn_layer_norm.bias': { + "name": 'cross_attention_layernorm.bias', + "shape": None + }, + }) + + def get_attn_module_name(component, layer, attn_type): + return f'model.{component}.layers.{int(layer)}.{attn_type}' + + for hf_weight_name, weight_info in embedding_layer_names.items(): + if 'position' in hf_weight_name: + weights[weight_info["name"]] = params[ + f'{hf_param_prefix}.{hf_weight_name}'][2:].clone() + else: + weights[weight_info["name"]] = params[ + f'{hf_param_prefix}.{hf_weight_name}'].clone() + weights[weight_info["name"]] = reshape(weights[weight_info["name"]], + weight_info["shape"]) + + num_layers = config.num_hidden_layers + + layers_range = mapping.pp_layers(num_layers) + for layer_idx in layers_range: + local_layer_idx = layer_idx - layers_range[0] + hf_layer_name_prefix = f'{hf_param_prefix}.layers.{layer_idx}' + trtllm_layer_name_prefix = f'{trtllm_layer_name}.{local_layer_idx}' + + for hf_weight_name, weight_info in hidden_layer_name_split.items(): + weights[ + f'{trtllm_layer_name_prefix}.{weight_info["name"]}'] = reshape( + split(params[f'{hf_layer_name_prefix}.{hf_weight_name}'], + mapping.tp_size, + mapping.tp_rank, + dim=weight_info["split_dim"]), weight_info["shape"]) + + for hf_weight_name, weight_info in hidden_layer_name_no_split.items(): + trtllm_layer_fullname = f'{trtllm_layer_name_prefix}.{weight_info["name"]}' + hf_layer_fullname = f'{hf_layer_name_prefix}.{hf_weight_name}' + weights[trtllm_layer_fullname] = reshape( + params[hf_layer_fullname].clone(), shape=weight_info["shape"]) + + self_attn_module_name = get_attn_module_name(component, layer_idx, + 'self_attn') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}', + mapping.tp_size, mapping.tp_rank, config.model_type, + 
(hidden_size * 3 // mapping.tp_size, hidden_size), + (hidden_size * 3 // mapping.tp_size))) + if component == 'decoder': + cross_attn_module_name = get_attn_module_name( + component, layer_idx, 'encoder_attn') + weights.update( + fuse_qkv_one_layer( + params, cross_attn_module_name, + f'{trtllm_layer_name_prefix}.cross_attention', + mapping.tp_size, mapping.tp_rank, config.model_type, + (hidden_size * 3 // mapping.tp_size, hidden_size), + (hidden_size * 3 // mapping.tp_size))) + + if component == 'decoder': + weights['lm_head.weight'] = reshape( + split(params['lm_head.weight'], + mapping.tp_size, + mapping.tp_rank, + dim=0), (config.vocab_size // mapping.tp_size, hidden_size)) + + if config.has_model_final_layernorm: + weights['final_layernorm.weight'] = params[ + f'{hf_param_prefix}.layer_norm.weight'].clone() + weights['final_layernorm.bias'] = params[ + f'{hf_param_prefix}.layer_norm.bias'].clone() + + return weights + + +def parse_pix2struct_config(args, hf_model): + # manually set q_scaling to offset attention scaling's effect. + # TODO: modify kernels to control whether to disable attention scaling + config = configparser.ConfigParser() + + def get_offset_q_scaling(config) -> str: + d_model = config.hidden_size + num_heads = config.num_heads + head_size = d_model / num_heads + scaling = 1 / head_size**.5 + return str(scaling) + + config["decoder"] = {} + for key, val in hf_model.decoder.config.to_dict().items(): + config["decoder"][key] = f"{val}" + + config["decoder"]["q_scaling"] = get_offset_q_scaling( + hf_model.decoder.config) + + config["structure"] = dict() + config["structure"]["pix2struct_with_bias"] = "false" + config["structure"]["use_gated_activation"] = "false" + config["structure"]["position_embedding_type"] = "relative" + config["structure"]["model_type"] = args.model_type + + def parse_pix2struct_config_by_component(config, component, args): + if component == 'decoder': + args.n_layer = config.getint(component, 'num_layers') + args.n_head = config.getint(component, 'num_heads') + args.head_size = config.getint(component, 'd_kv') + args.hidden_size = config.getint(component, 'hidden_size') + args.ffn_hidden_size = config.getint(component, 'd_ff') + args.vocab_size = config.getint(component, 'vocab_size') + args.n_positions = config.getint(component, + 'n_positions', + fallback=512) + args.has_position_embedding = config.getboolean( + component, 'has_position_embedding', + fallback=False) # TODO: hardcoded here + args.has_token_type_embedding = config.getboolean( + component, 'has_token_type_embedding', fallback=False) + args.has_embedding_layernorm = config.getboolean( + component, 'has_embedding_layernorm', fallback=False) + args.has_embedding_scale = config.getboolean(component, + 'has_embedding_scale', + fallback=False) + args.q_scaling = config.getfloat(component, + 'q_scaling', + fallback=1.0) + args.has_attention_qkvo_bias = config.getboolean( + component, 'has_attention_qkvo_bias', fallback=False) + args.has_mlp_bias = config.getboolean(component, + 'has_mlp_bias', + fallback=False) + args.has_model_final_layernorm = config.getboolean( + component, 'has_model_final_layernorm', fallback=True) + args.layernorm_eps = config.getfloat(component, + 'layer_norm_epsilon') + args.layernorm_position = layernorm_position_map[config.get( + component, 'layernorm_position', + fallback='pre_layernorm')] # TODO: hardcoded here + args.layernorm_type = layernorm_type_map[config.get( + component, 'layernorm_type', fallback='RmsNorm')] + args.hidden_act = config.get(component, 
'dense_act_fn') + args.gated_act = True + args.mlp_type = mlp_type_map['GatedMLP' if args. + gated_act else 'MLP'] + args.has_lm_head_bias = config.getboolean( + component, # TODO: T5 with bias + 'has_lm_head_bias', + fallback=False) + args.relative_attention = config.getboolean(component, + 'relative_attention', + fallback=True) + args.num_buckets = config.getint(component, + 'relative_attention_num_buckets') + args.max_distance = config.getint( + component, 'relative_attention_max_distance') + args.logits_dtype = config.get(component, + 'logits_dtype', + fallback='float32') + args.rescale_before_lm_head = config.getboolean( + component, 'tie_word_embeddings' + ) # default is True (for T5), but False for Flan-T5 + args.encoder_hidden_size = config.getint('decoder', 'hidden_size') + args.encoder_num_heads = config.getint('decoder', 'num_heads') + args.encoder_head_size = config.getint('decoder', 'd_kv') + args.position_embedding_type = config.get( + 'structure', 'position_embedding_type') + args.decoder_start_token_id = config.getint( + 'decoder', 'decoder_start_token_id') + args.eos_token_id = config.getint('decoder', 'eos_token_id') + bos_token_id = config.get('decoder', 'bos_token_id') + # pix2struct does not have bos_token_id + args.bos_token_id = int( + bos_token_id) if bos_token_id != "None" else None + args.pad_token_id = config.getint('decoder', 'pad_token_id') + + else: + assert False, 'Unsupported component!' + return args + + decoder_args = parse_pix2struct_config_by_component(config, "decoder", args) + return None, decoder_args + + +def convert_pix2struct_weights_to_tllm_safetensors(config, component, params): + weights = {} + + mapping = config.mapping + + convert_weight_to_dtype(params, config.dtype) + hidden_size = config.hidden_size + ffn_hidden_size = config.intermediate_size + num_layers = config.num_hidden_layers + n_head = config.num_attention_heads + head_size = config.head_size + attention_hidden_size = n_head * head_size # head size * num_heads not necessarily equals hidden_dim, such as Flan-T5 + + hf_param_prefix = f'{component}' + trtllm_layer_name = f'{component}_layers' + trtllm_attn_layer_name = 'self_attention' + trtllm_attn_layernorm_name = 'self_attention_layernorm' + + def get_attn_module_name(component, layer, attn_type): + return f'{component}.layer.{int(layer)}.{attn_type}.attention' + + weights['embedding.vocab_embedding.weight'] = reshape( + params[f'{hf_param_prefix}.embed_tokens.weight'].clone(), None) + + layers_range = mapping.pp_layers(num_layers) + for layer_idx in layers_range: + local_layer_idx = layer_idx - layers_range[0] + trtllm_layer_name_prefix = f'{trtllm_layer_name}.{local_layer_idx}' + hf_layer_name_prefix = f'{hf_param_prefix}.layer.{layer_idx}' + + hidden_layer_name_split = { + f'{hf_layer_name_prefix}.self_attention.attention.output.weight': { + "name": + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}.dense.weight', + "shape": + (hidden_size, attention_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + f'{hf_layer_name_prefix}.mlp.DenseReluDense.wo.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp.proj.weight', + "shape": (hidden_size, ffn_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + f'{hf_layer_name_prefix}.mlp.DenseReluDense.wi_0.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp.fc.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + } + + hidden_layer_name_no_split = { + f'{hf_layer_name_prefix}.self_attention.layer_norm.weight': { + "name": + 
f'{trtllm_layer_name_prefix}.{trtllm_attn_layernorm_name}.weight', + "shape": None + }, + f'{hf_layer_name_prefix}.mlp.layer_norm.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp_layernorm.weight', + "shape": None + }, + } + + if config.gated_act: + hidden_layer_name_split.update({ + f'{hf_layer_name_prefix}.mlp.DenseReluDense.wi_1.weight': { + "name": f'{trtllm_layer_name_prefix}.mlp.gate.weight', + "shape": (ffn_hidden_size // mapping.tp_size, hidden_size), + "split_dim": 0 + }, + }) + + hidden_layer_name_split.update({ + f'{hf_layer_name_prefix}.encoder_decoder_attention.attention.output.weight': + { + "name": + f'{trtllm_layer_name_prefix}.cross_attention.dense.weight', + "shape": + (hidden_size, attention_hidden_size // mapping.tp_size), + "split_dim": -1 + }, + }) + hidden_layer_name_no_split.update({ + f'{hf_layer_name_prefix}.encoder_decoder_attention.layer_norm.weight': + { + "name": + f'{trtllm_layer_name_prefix}.cross_attention_layernorm.weight', + "shape": None + }, + }) + self_attn_module_name = get_attn_module_name( + component, layer_idx, 'encoder_decoder_attention') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.cross_attention', mapping.tp_size, + mapping.tp_rank, config.model_type, + (attention_hidden_size * 3 // mapping.tp_size, hidden_size), + None)) + + self_attn_module_name = get_attn_module_name(component, layer_idx, + 'self_attention') + weights.update( + fuse_qkv_one_layer( + params, self_attn_module_name, + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}', + mapping.tp_size, mapping.tp_rank, config.model_type, + (attention_hidden_size * 3 // mapping.tp_size, hidden_size), + None)) + + weights[ + f'{trtllm_layer_name_prefix}.{trtllm_attn_layer_name}.rel_attn_table'] = reshape( + split( + params[ + f'{component}.layer.0.self_attention.attention.relative_attention_bias.weight'] + .T, mapping.tp_size, mapping.tp_rank, 0), + (n_head // mapping.tp_size, config.num_buckets)) + + for hf_weight_name, weight_info in hidden_layer_name_split.items(): + if hf_weight_name in params.keys(): + weights[weight_info["name"]] = reshape( + split(params[hf_weight_name], + mapping.tp_size, + mapping.tp_rank, + dim=weight_info["split_dim"]), weight_info["shape"]) + for hf_weight_name, weight_info in hidden_layer_name_no_split.items(): + if hf_weight_name in params.keys(): + weights[weight_info["name"]] = reshape( + params[hf_weight_name].clone(), shape=weight_info["shape"]) + + weights[f'final_layernorm.weight'] = reshape( + params[f'{component}.final_layer_norm.weight'].clone(), None) + + weights['lm_head.weight'] = reshape( + split(params[f'{component}.lm_head.weight'], + mapping.tp_size, + mapping.tp_rank, + dim=0), (config.vocab_size // mapping.tp_size, hidden_size)) + if not config.use_implicit_relative_attention: + weights[f'rel_attn_table'] = reshape( + split( + params[ + f'{component}.layer.0.self_attention.attention.relative_attention_bias.weight'] + .T, mapping.tp_size, mapping.tp_rank, 0), + (n_head // mapping.tp_size, config.num_buckets)) + + return weights + + +def get_model(args): + if args.model_type == "t5": + model = T5ForConditionalGeneration.from_pretrained(args.model_dir) + elif args.model_type == "nmt": + from fairseq.models.transformer import TransformerModel + model = TransformerModel.from_pretrained(args.model_dir) + elif args.model_type == "bart": + if args.nougat: + model = VisionEncoderDecoderModel.from_pretrained(args.model_dir) + model = model.get_decoder() + else: + model = 
AutoModelForSeq2SeqLM.from_pretrained(args.model_dir) + elif args.model_type == "pix2struct": + model = Pix2StructForConditionalGeneration.from_pretrained( + args.model_dir) + elif args.model_type == "blip2": + model = Blip2ForConditionalGeneration.from_pretrained( + args.model_dir).language_model + return model + + +def convert_checkpoint(args): + + model = get_model(args) + + saved_dir = Path(args.output_dir) + saved_dir.mkdir(parents=True, exist_ok=True) + + encoder_saved_dir = saved_dir / "encoder" + encoder_saved_dir.mkdir(parents=True, exist_ok=True) + decoder_saved_dir = saved_dir / "decoder" + decoder_saved_dir.mkdir(parents=True, exist_ok=True) + + world_size = args.tp_size * args.pp_size + + kv_cache_quant_algo = None + quant_algo = None + + model_type = args.model_type if args.model_type != "blip2" else "t5" + encoder_config, decoder_config = globals()[f'parse_{model_type}_config']( + args, model) + + additional_settings = ["gated_act"] + + if not args.nougat and args.model_type != "pix2struct": + tllm_encoder_config = { + 'architecture': "EncoderModel", + 'dtype': args.dtype, + 'logits_dtype': encoder_config.logits_dtype, + 'num_hidden_layers': encoder_config.n_layer, + 'num_attention_heads': encoder_config.n_head, + 'hidden_size': encoder_config.hidden_size, + 'norm_epsilon': encoder_config.layernorm_eps, + 'vocab_size': encoder_config.vocab_size, + 'position_embedding_type': encoder_config.position_embedding_type, + 'hidden_act': encoder_config.hidden_act, + 'quantization': { + 'quant_algo': quant_algo, + 'kv_cache_quant_algo': kv_cache_quant_algo, + }, + 'mapping': { + 'world_size': world_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + }, + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + 'max_position_embeddings': encoder_config.n_positions, + 'num_key_value_heads': encoder_config.n_head, + 'head_size': encoder_config.head_size, + 'has_position_embedding': encoder_config.has_position_embedding, + 'layernorm_type': encoder_config.layernorm_type, + 'has_attention_qkvo_bias': encoder_config.has_attention_qkvo_bias, + 'has_mlp_bias': encoder_config.has_mlp_bias, + 'has_model_final_layernorm': + encoder_config.has_model_final_layernorm, + 'has_embedding_layernorm': encoder_config.has_embedding_layernorm, + 'has_embedding_scale': encoder_config.has_embedding_scale, + 'intermediate_size': encoder_config.ffn_hidden_size, + 'q_scaling': encoder_config.q_scaling, + 'layernorm_position': encoder_config.layernorm_position, + 'mlp_type': encoder_config.mlp_type, + 'relative_attention': encoder_config.relative_attention, + 'max_distance': encoder_config.max_distance, + 'num_buckets': encoder_config.num_buckets, + 'model_type': encoder_config.model_type, + } + + for additional_setting in additional_settings: + if hasattr(encoder_config, additional_setting): + tllm_encoder_config.update({ + additional_setting: + getattr(encoder_config, additional_setting) + }) + + with (encoder_saved_dir / "config.json").open('w') as f: + json.dump(tllm_encoder_config, f, indent=4) + + encoder_convert_args = dict(params=model.state_dict(), + component="encoder") + tllm_decoder_config = { + 'architecture': "DecoderModel", + 'dtype': args.dtype, + 'logits_dtype': decoder_config.logits_dtype, + 'num_hidden_layers': decoder_config.n_layer, + 'num_attention_heads': decoder_config.n_head, + 'hidden_size': decoder_config.hidden_size, + 'norm_epsilon': decoder_config.layernorm_eps, 
+ 'vocab_size': decoder_config.vocab_size, + 'position_embedding_type': decoder_config.position_embedding_type, + 'hidden_act': decoder_config.hidden_act, + 'quantization': { + 'quant_algo': quant_algo, + 'kv_cache_quant_algo': kv_cache_quant_algo, + }, + 'mapping': { + 'world_size': world_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + }, + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + 'max_position_embeddings': decoder_config.n_positions, + 'head_size': decoder_config.head_size, + 'has_position_embedding': decoder_config.has_position_embedding, + 'layernorm_type': decoder_config.layernorm_type, + 'has_attention_qkvo_bias': decoder_config.has_attention_qkvo_bias, + 'has_mlp_bias': decoder_config.has_mlp_bias, + 'has_model_final_layernorm': decoder_config.has_model_final_layernorm, + 'has_embedding_layernorm': decoder_config.has_embedding_layernorm, + 'has_embedding_scale': decoder_config.has_embedding_scale, + 'intermediate_size': decoder_config.ffn_hidden_size, + 'q_scaling': decoder_config.q_scaling, + 'layernorm_position': decoder_config.layernorm_position, + 'mlp_type': decoder_config.mlp_type, + 'relative_attention': decoder_config.relative_attention, + 'max_distance': decoder_config.max_distance, + 'num_buckets': decoder_config.num_buckets, + 'model_type': decoder_config.model_type, + 'rescale_before_lm_head': decoder_config.rescale_before_lm_head, + 'encoder_hidden_size': decoder_config.encoder_hidden_size, + 'encoder_num_heads': decoder_config.encoder_num_heads, + 'encoder_head_size': decoder_config.encoder_head_size, + 'skip_cross_qkv': args.skip_cross_qkv, + 'use_implicit_relative_attention': args.use_implicit_relative_attention, + 'decoder_start_token_id': decoder_config.decoder_start_token_id, + 'eos_token_id': decoder_config.eos_token_id, + 'bos_token_id': decoder_config.bos_token_id, + 'pad_token_id': decoder_config.pad_token_id, + } + for additional_setting in additional_settings: + if hasattr(decoder_config, additional_setting): + tllm_decoder_config.update({ + additional_setting: + getattr(decoder_config, additional_setting) + }) + + with (decoder_saved_dir / "config.json").open('w') as f: + json.dump(tllm_decoder_config, f, indent=4) + + decoder_convert_args = dict(params=model.state_dict(), component="decoder") + + if args.model_type == "nmt": + fairseq_config = vars(model.cfg.model) # Namespace --> dict + num_embeddings = fairseq_config['max_source_positions'] + embedding_dim = fairseq_config['encoder_embed_dim'] + padding_idx = model.models[0].encoder.embed_tokens.padding_idx # 1 + + sin_pos_embedding = model.models[ + 0].encoder.embed_positions.get_embedding( + padding_idx + 1 + num_embeddings, + embedding_dim, + padding_idx=padding_idx) # [2 + num_embeddings, embed_dim] + sin_pos_embedding = sin_pos_embedding[2:, :] # remove offset embeddings + + encoder_convert_args["sin_pos_embedding"] = sin_pos_embedding + decoder_convert_args["sin_pos_embedding"] = sin_pos_embedding + + if args.workers == 1: + if not args.nougat and args.model_type != "pix2struct": + convert(0, world_size, args, tllm_encoder_config, + encoder_convert_args, encoder_saved_dir) + convert(0, world_size, args, tllm_decoder_config, decoder_convert_args, + decoder_saved_dir) + else: + if args.workers > world_size: + args.workers = world_size + LOGGER.info(f'Convert checkpoint using {args.workers} workers.') + import torch.multiprocessing as mp + if not args.nougat and 
args.model_type != "pix2struct": + mp.spawn(convert, + nprocs=args.workers, + args=(world_size, args, tllm_encoder_config, + encoder_convert_args, encoder_saved_dir)) + mp.spawn(convert, + nprocs=args.workers, + args=(world_size, args, tllm_decoder_config, + decoder_convert_args, decoder_saved_dir)) + + +def convert(worker_rank, world_size, args, model_config, convert_args, + saved_dir): + for rank in range(worker_rank, world_size, args.workers): + rank_config = copy.deepcopy(PretrainedConfig.from_dict(model_config)) + rank_config.set_rank(rank) + weights = globals( + )[f'convert_{rank_config.model_type}_weights_to_tllm_safetensors']( + config=rank_config, **convert_args) + safetensors.torch.save_file(weights, + f'{saved_dir}/rank{rank}.safetensors') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + '--model_type', + type=str, + default='t5', + choices=['t5', 'nmt', 'bart', 'pix2struct', 'blip2'], + help= + 'Multimodal type when this script is used for multimodal conversion.') + + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument("--model_dir", + "-i", + type=str, + help="Path to the framework checkpoint file", + required=True) + parser.add_argument("--output_dir", + "-o", + type=str, + help="Path to the converted TRT-LLM model weight file", + required=True) + parser.add_argument( + "--workers", + type=int, + help="How many workers to spawn for conversion (default: 4)", + default=4) + parser.add_argument("--nougat", + action="store_true", + help="Model which uses vision encoder + mbart decoder") + parser.add_argument("--verbose", + action="store_true", + help="Provide verbose messages") + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=0, + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharding is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--use_embedding_sharing', + action="store_true", + default=False, + help= + 'Try to reduce the engine size by sharing the embedding lookup table between two layers.' + 'Note: the flag might not take effect when the criteria are not met.') + parser.add_argument( + '--dtype', + type=str, + default='float16', + choices=['float16', 'float32', 'bfloat16'], + help= + 'Target inference dtype. Weights and Computation will be in this dtype, no matter what original dtype the weight checkpoint has.' 
+ ) + parser.add_argument( + '--skip_cross_qkv', + action='store_true', + help= + 'Skip redundant cross qkv computation by using TensorRT IfConditional switch (experimental).' + ) + parser.add_argument( + '--use_implicit_relative_attention', + action='store_true', + help= + 'Compute relative attention bias on the fly instead of pre-compute a relative attention bias table.' + ) + args = parser.parse_args() + log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO, + format=log_format) + LOGGER.info("\n=============== Argument ===============") + for key in vars(args): + LOGGER.info(f"{key}: {vars(args)[key]}") + LOGGER.info("========================================") + + start_time = datetime.now() + convert_checkpoint(args) + stop_time = datetime.now() + run_time = (stop_time - start_time) + LOGGER.info("Spend {} (h:m:s) to convert the model".format(run_time)) diff --git a/convension/unimernet/convert_checkpoint.sh b/convension/unimernet/convert_checkpoint.sh new file mode 100644 index 0000000..8d3ce00 --- /dev/null +++ b/convension/unimernet/convert_checkpoint.sh @@ -0,0 +1,8 @@ +#!/bin/bash +#SBATCH -J ParseSciHUB +#SBATCH -o log/%j-convert.out +#SBATCH -e log/%j-convert.out +mpirun -n 1 python convert_checkpoint.py --model_type bart \ + --model_dir /mnt/petrelfs/zhangtianning.di/projects/PDF-Extract-Kit/weights/unimernet_clean \ + --output_dir trt_models/unimernet/bfloat16 --tp_size 1 --pp_size 1 --dtype bfloat16 --nougat +## please use sbatch \ No newline at end of file diff --git a/convension/unimernet/findmpi.py b/convension/unimernet/findmpi.py new file mode 100644 index 0000000..90f0b75 --- /dev/null +++ b/convension/unimernet/findmpi.py @@ -0,0 +1,20 @@ +import os + +def find_libmpi(): + # Get the current conda environment path + conda_env_path = os.environ.get('CONDA_PREFIX') + if not conda_env_path: + return 'No conda environment found.' 
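+    # Note (added): the early return above yields a message string, while the normal
+    # path below returns a list of paths; the caller simply prints the result.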
+ + # Search for libmpi.so files + libmpi_files = [] + for root, dirs, files in os.walk(conda_env_path): + for file in files: + if 'libmpi.so' in file: + libmpi_files.append(os.path.join(root, file)) + + return libmpi_files + +# Execute the function and print the results +libmpi_locations = find_libmpi() +print(libmpi_locations) \ No newline at end of file diff --git a/convension/unimernet/requirements-vila.txt b/convension/unimernet/requirements-vila.txt new file mode 100644 index 0000000..2b7e34f --- /dev/null +++ b/convension/unimernet/requirements-vila.txt @@ -0,0 +1,2 @@ +git+https://github.com/bfshi/scaling_on_scales.git +transformers==4.36.2 diff --git a/convension/unimernet/run.py b/convension/unimernet/run.py new file mode 100644 index 0000000..e2dda25 --- /dev/null +++ b/convension/unimernet/run.py @@ -0,0 +1,133 @@ +import argparse +import os + +import tensorrt_llm +import tensorrt_llm.profiler as profiler +from tensorrt_llm import logger +from tensorrt_llm.runtime import MultimodalModelRunner + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--max_new_tokens', type=int, default=30) + parser.add_argument('--batch_size', type=int, default=1) + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--visual_engine_dir', + type=str, + default=None, + help='Directory containing visual TRT engines') + parser.add_argument('--visual_engine_name', + type=str, + default='model.engine', + help='Name of visual TRT engine') + parser.add_argument('--llm_engine_dir', + type=str, + default=None, + help='Directory containing TRT-LLM engines') + parser.add_argument('--hf_model_dir', + type=str, + default=None, + help="Directory containing tokenizer") + parser.add_argument('--input_text', + type=str, + default=None, + help='Text prompt to LLM') + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams >1", + default=1) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--run_profiling', + action='store_true', + help='Profile runtime over several iterations') + parser.add_argument('--profiling_iterations', + type=int, + help="Number of iterations to run profiling", + default=20) + parser.add_argument('--check_accuracy', + action='store_true', + help='Check correctness of text output') + parser.add_argument('--video_path', + type=str, + default=None, + help='Path to your local video file') + parser.add_argument("--image_path", + type=str, + default=None, + help='List of input image paths, separated by symbol') + parser.add_argument("--path_sep", + type=str, + default=",", + help='Path separator symbol') + parser.add_argument('--enable_context_fmha_fp32_acc', + action='store_true', + default=None, + help="Enable FMHA runner FP32 accumulation.") + + return parser.parse_args() + + +def print_result(model, input_text, output_text, args): + logger.info("---------------------------------------------------------") + if model.model_type != 'nougat': + logger.info(f"\n[Q] {input_text}") + for i in range(len(output_text)): + logger.info(f"\n[A]: {output_text[i]}") + + if args.num_beams == 1: + output_ids = model.tokenizer(output_text[0][0], + add_special_tokens=False)['input_ids'] + logger.info(f"Generated {len(output_ids)} tokens") + + if args.check_accuracy: + if model.model_type != 'nougat': + if 
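`findmpi.py` above only lists the `libmpi.so*` files bundled in the active conda environment. A small companion check, assuming `mpi4py` is installed, that reports which MPI library the Python process actually binds to at runtime:

```python
# Build-time configuration of mpi4py (compilers and MPI libraries it was
# linked against) and the MPI library version string reported at runtime.
import mpi4py
print(mpi4py.get_config())

from mpi4py import MPI
print(MPI.Get_library_version())   # e.g. an "Open MPI v4.x" banner
```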
model.model_type == "vila": + if len(args.image_path.split(args.path_sep)) == 1: + assert output_text[0][0].lower( + ) == "the image captures a bustling city intersection teeming with life. from the perspective of a car's dashboard camera, we see" + elif model.model_type == 'fuyu': + assert output_text[0][0].lower() == '4' + elif model.model_type == "pix2struct": + assert "characteristic | cat food, day | cat food, wet | cat treats" in output_text[ + 0][0].lower() + elif model.model_type in [ + 'blip2', 'neva', 'phi-3-vision', 'llava_next' + ]: + assert 'singapore' in output_text[0][0].lower() + elif model.model_type == 'video-neva': + assert 'robot' in output_text[0][0].lower() + elif model.model_type == 'kosmos-2': + assert 'snowman' in output_text[0][0].lower() + else: + assert output_text[0][0].lower() == 'singapore' + + if args.run_profiling: + msec_per_batch = lambda name: 1000 * profiler.elapsed_time_in_sec( + name) / args.profiling_iterations + logger.info('Latencies per batch (msec)') + logger.info('TRT vision encoder: %.1f' % (msec_per_batch('Vision'))) + logger.info('TRTLLM LLM generate: %.1f' % (msec_per_batch('LLM'))) + logger.info('Multimodal generate: %.1f' % (msec_per_batch('Generate'))) + + logger.info("---------------------------------------------------------") + + +if __name__ == '__main__': + os.environ["TOKENIZERS_PARALLELISM"] = "false" + args = parse_arguments() + logger.set_level(args.log_level) + + model = MultimodalModelRunner(args) + raw_image = model.load_test_image() + + num_iters = args.profiling_iterations if args.run_profiling else 1 + for _ in range(num_iters): + input_text, output_text = model.run(args.input_text, raw_image, + args.max_new_tokens) + + runtime_rank = tensorrt_llm.mpi_rank() + if runtime_rank == 0: + print_result(model, input_text, output_text, args) diff --git a/convension/unimernet/run.sh b/convension/unimernet/run.sh new file mode 100644 index 0000000..932d628 --- /dev/null +++ b/convension/unimernet/run.sh @@ -0,0 +1,8 @@ +#!/bin/bash +#SBATCH -J ParseSciHUB +#SBATCH -o log/%j-convert.out +#SBATCH -e log/%j-convert.out +mpirun -n 1 python convert_checkpoint.py --model_type bart \ + --model_dir /mnt/petrelfs/zhangtianning.di/projects/PDF-Extract-Kit/weights/unimernet_clean \ + --output_dir trt_models/unimernet/bfloat16 --tp_size 1 --pp_size 1 --dtype bfloat16 --nougat +## please use sbatch convert_checkpoint.sh to submit this job, use srun --mpi==pmi2 can not work \ No newline at end of file diff --git a/models/README.md b/models/README.md deleted file mode 100644 index 6c1be21..0000000 --- a/models/README.md +++ /dev/null @@ -1,70 +0,0 @@ -### Install Git LFS -Before you begin, make sure Git Large File Storage (Git LFS) is installed on your system. Install it using the following command: - -```bash -git lfs install -``` - -### Download the Model from Hugging Face -To download the `PDF-Extract-Kit` model from Hugging Face, use the following command: - -```bash -git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit -``` - -Ensure that Git LFS is enabled during the clone to properly download all large files. 
- - - -### Download the Model from ModelScope - -#### SDK Download - -```bash -# First, install the ModelScope library using pip: -pip install modelscope -``` - -```python -# Use the following Python code to download the model using the ModelScope SDK: -from modelscope import snapshot_download -model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') -``` - -#### Git Download -Alternatively, you can use Git to clone the model repository from ModelScope: - -```bash -git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git -``` - - -Put [model files]() here: - -``` -./ -├── Layout -│ ├── config.json -│ └── model_final.pth -├── MFD -│ └── weights.pt -├── MFR -│ └── UniMERNet -│ ├── config.json -│ ├── preprocessor_config.json -│ ├── pytorch_model.bin -│ ├── README.md -│ ├── tokenizer_config.json -│ └── tokenizer.json -├── TabRec -│ └── StructEqTable -│ ├── config.json -│ ├──generation_config.json -│ ├──model.safetensors -│ ├──preprocessor_config.json -│ ├──special_tokens_map.json -│ ├──spiece.model -│ ├──tokenizer_config.json -│ └──tokenizer.json -└── README.md -``` \ No newline at end of file diff --git a/modules/layoutlmv3/backbone.py b/modules/layoutlmv3/backbone.py index 5364f86..8d81ddb 100644 --- a/modules/layoutlmv3/backbone.py +++ b/modules/layoutlmv3/backbone.py @@ -86,7 +86,8 @@ def __init__(self, name, out_features, drop_path, img_size, pos_type, model_kwar config.has_spatial_attention_bias = False config.has_relative_attention_bias = False self.backbone = LayoutLMv3Model(config, detection=True, - out_features=out_features, image_only=image_only) + out_features=out_features, + image_only=image_only) else: self.backbone = model_func(img_size=img_size, out_features=out_features, @@ -103,13 +104,17 @@ def forward(self, x): dict[str->Tensor]: names and the corresponding features """ if "layoutlmv3" in self.name: - return self.backbone.forward( - input_ids=x["input_ids"] if "input_ids" in x else None, - bbox=x["bbox"] if "bbox" in x else None, - images=x["images"] if "images" in x else None, - attention_mask=x["attention_mask"] if "attention_mask" in x else None, - # output_hidden_states=True, - ) + if isinstance(x,torch.Tensor): + return self.backbone.forward(images=x) + else: + + return self.backbone.forward( + input_ids=x["input_ids"] if "input_ids" in x else None, + bbox=x["bbox"] if "bbox" in x else None, + images=x["images"] if "images" in x else None, + attention_mask=x["attention_mask"] if "attention_mask" in x else None, + # output_hidden_states=True, + ) assert x.dim() == 4, f"VIT takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
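The `backbone.py` change above lets the layoutlmv3 branch accept either the usual feature dict or a bare image batch (the fast path used by `forward_fast` later in this patch). A minimal, self-contained sketch of that input dispatch, with the actual model call stubbed out:

```python
import torch

def unpack_backbone_input(x):
    # Raw tensor: image-only fast path; dict: unpack the optional text fields.
    if isinstance(x, torch.Tensor):
        return {"input_ids": None, "bbox": None, "images": x, "attention_mask": None}
    return {k: x.get(k) for k in ("input_ids", "bbox", "images", "attention_mask")}

imgs = torch.zeros(2, 3, 224, 224)                      # placeholder page batch
print(unpack_backbone_input(imgs)["input_ids"])         # None -> image-only call
print(unpack_backbone_input({"images": imgs})["bbox"])  # None as well
```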
return self.backbone.forward_features(x) diff --git a/modules/layoutlmv3/layoutlmv3_base_inference.yaml b/modules/layoutlmv3/layoutlmv3_base_inference.yaml index 3e0fd30..128227b 100644 --- a/modules/layoutlmv3/layoutlmv3_base_inference.yaml +++ b/modules/layoutlmv3/layoutlmv3_base_inference.yaml @@ -68,7 +68,6 @@ MODEL: BACKBONE: FREEZE_AT: 2 NAME: build_vit_fpn_backbone - CONFIG_PATH: '' DEVICE: cuda FPN: FUSE_TYPE: sum diff --git a/modules/layoutlmv3/model_init.py b/modules/layoutlmv3/model_init.py index 671ed16..6d3c86d 100644 --- a/modules/layoutlmv3/model_init.py +++ b/modules/layoutlmv3/model_init.py @@ -66,11 +66,14 @@ def setup(args): """ cfg = get_cfg() # add_coat_config(cfg) + add_vit_config(cfg) cfg.merge_from_file(args.config_file) cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) register_coco_instances( @@ -116,7 +119,7 @@ def __init__(self, weights): self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption", "table_footnote", "isolate_formula", "formula_caption"] MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping self.predictor = DefaultPredictor(cfg) - + def __call__(self, image, ignore_catids=[]): page_layout_result = { "layout_dets": [] @@ -138,4 +141,5 @@ def __call__(self, image, ignore_catids=[]): ], "score": scores[bbox_idx] }) - return page_layout_result \ No newline at end of file + return page_layout_result + diff --git a/modules/layoutlmv3/rcnn_vl.py b/modules/layoutlmv3/rcnn_vl.py index 46b2e16..5a7d7b8 100644 --- a/modules/layoutlmv3/rcnn_vl.py +++ b/modules/layoutlmv3/rcnn_vl.py @@ -28,6 +28,41 @@ class VLGeneralizedRCNN(GeneralizedRCNN): 3. Per-region feature extraction and prediction """ + def forward_fast(self,images): + """ + Run inference on the given inputs. + + Args: + batched_inputs (list[dict]): same as in :meth:`forward` + detected_instances (None or list[Instances]): if not None, it + contains an `Instances` object per image. The `Instances` + object contains "pred_boxes" and "pred_classes" which are + known boxes in the image. + The inference will then skip the detection of bounding boxes, + and only predict other per-ROI outputs. + do_postprocess (bool): whether to apply post-processing on the outputs. + + Returns: + When do_postprocess=True, same as in :meth:`forward`. + Otherwise, a list[Instances] containing raw network outputs. + """ + assert not self.training + + images = self.preprocess_image_batch(images) + features = self.backbone({'images':images.tensor}) + proposals, _ = self.proposal_generator(images, features, None) + results, _ = self.roi_heads(images, features, proposals, None) + return results + + def preprocess_image_batch(self, images:torch.Tensor)->ImageList: + """ + Normalize, pad and batch the input images. 
+    """ + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(list(images), 32) + return images + + + def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): """ Args: @@ -52,7 +87,9 @@ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" """ if not self.training: - return self.inference(batched_inputs) + with torch.inference_mode(): + out = self.inference(batched_inputs) + return out images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: diff --git a/modules/self_modify.py b/modules/self_modify.py index 1f830b8..c62ec9d 100644 --- a/modules/self_modify.py +++ b/modules/self_modify.py @@ -11,6 +11,7 @@ from ppocr.utils.logging import get_logger from ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img from tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop +from .batch_text_detector import BatchTextDetector logger = get_logger() def img_decode(content: bytes): @@ -119,7 +120,70 @@ def update_det_boxes(dt_boxes, mfdetrec_res): return new_dt_boxes + + class ModifiedPaddleOCR(PaddleOCR): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.batch_det_model = BatchTextDetector() + + def batch_detect(self, img_list, mfd_res_list, ori_im_list, cls=True): + time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} + start = time.time() + + dt_boxes_list, elapse = self.batch_det_model(img_list) + time_dict['det'] = elapse + + dt_boxes_list_new = [] + for dt_boxes, mfd_res in zip(dt_boxes_list, mfd_res_list): + if mfd_res: + bef = time.time() + dt_boxes = update_det_boxes(dt_boxes, mfd_res) + aft = time.time() + logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), aft-bef)) + dt_boxes_list_new.append(dt_boxes) + dt_boxes_list = dt_boxes_list_new + + img_crop_list = [] + boxes_partition= [0] + for dt_boxes, ori_im in zip(dt_boxes_list, ori_im_list): + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + if self.args.det_box_type == "quad": + img_crop = get_rotate_crop_image(ori_im, tmp_box) + else: + img_crop = get_minarea_rect_crop(ori_im, tmp_box) + img_crop_list.append(img_crop) + boxes_partition.append(len(img_crop_list)) + + if self.use_angle_cls and cls: + img_crop_list, angle_list, elapse = self.text_classifier(img_crop_list) + time_dict['cls'] = elapse + logger.debug("cls num : {}, elapsed : {}".format(len(img_crop_list), elapse)) + + rec_res, elapse = self.text_recognizer(img_crop_list) + time_dict['rec'] = elapse + logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse)) + # if self.args.save_crop_res:self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,rec_res) + + filter_boxes_list =[] + filter_rec_res_list=[] + for partition,(dt_boxes, ori_im) in enumerate(zip(dt_boxes_list, ori_im_list)): + filter_boxes, filter_rec_res = [], [] + # slice the flat recognition results back to this page via boxes_partition + rec_res_this_page = rec_res[boxes_partition[partition]:boxes_partition[partition+1]] + for box, rec_result in zip(dt_boxes, rec_res_this_page): + text, score = rec_result + if score >= self.drop_score: + filter_boxes.append(box) + filter_rec_res.append(rec_result) + filter_boxes_list.append(filter_boxes) + filter_rec_res_list.append(filter_rec_res) + + end = time.time() + time_dict['all'] = end - start + return filter_boxes_list, filter_rec_res_list, time_dict + + def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)): """ OCR with 
PaddleOCR @@ -235,6 +299,7 @@ def __call__(self, img, cls=True, mfd_res=None): else: img_crop = get_minarea_rect_crop(ori_im, tmp_box) img_crop_list.append(img_crop) + if self.use_angle_cls and cls: img_crop_list, angle_list, elapse = self.text_classifier( img_crop_list) @@ -244,8 +309,7 @@ def __call__(self, img, cls=True, mfd_res=None): rec_res, elapse = self.text_recognizer(img_crop_list) time_dict['rec'] = elapse - logger.debug("rec_res num : {}, elapsed : {}".format( - len(rec_res), elapse)) + logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse)) if self.args.save_crop_res: self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, rec_res) diff --git a/pdf_extract.py b/pdf_extract.py index df6fccb..5a1830a 100644 --- a/pdf_extract.py +++ b/pdf_extract.py @@ -73,6 +73,43 @@ def __getitem__(self, idx): image = self.transform(raw_image) return image +# def parser_per_image(image): + + +def rough_layout(layout_model, image): + layout_res = layout_model(image, ignore_catids=[]) + return layout_res + +from dataclasses import dataclass + +@dataclass +class FineGrainedConfig: + conf_thres: float = 0.3 + iou_thres: float = 0.5 + verbose: bool = False + +def fine_grained_layout(mfd_model,image, layout_res, config:FineGrainedConfig): + latex_filling_list = [] + mf_image_list = [] + mfd_res = mfd_model.predict(image, imgsz=img_size, conf=config.conf_thres, iou=config.iou_thres, verbose=config.verbose)[0] + for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), + mfd_res.boxes.conf.cpu(), + mfd_res.boxes.cls.cpu() ): + xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] + new_item = { + 'category_id': 13 + int(cla.item()), + 'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], + 'score': round(float(conf.item()), 2), + 'latex': '', + } + layout_res['layout_dets'].append(new_item) + latex_filling_list.append(new_item) + bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax]) + mf_image_list.append(bbox_img) + + return latex_filling_list, mf_image_list + +from tqdm.auto import tqdm if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -93,6 +130,7 @@ def __getitem__(self, idx): with open('configs/model_configs.yaml') as f: model_configs = yaml.load(f, Loader=yaml.FullLoader) img_size = model_configs['model_args']['img_size'] + conf_thres = model_configs['model_args']['conf_thres'] iou_thres = model_configs['model_args']['iou_thres'] device = model_configs['model_args']['device'] @@ -113,6 +151,7 @@ def __getitem__(self, idx): else: all_pdfs = [args.pdf] print("total files:", len(all_pdfs)) + fineconfig = FineGrainedConfig() for idx, single_pdf in enumerate(all_pdfs): try: img_list = load_pdf_fitz(single_pdf, dpi=dpi) @@ -126,7 +165,7 @@ def __getitem__(self, idx): doc_layout_result = [] latex_filling_list = [] mf_image_list = [] - for idx, image in enumerate(img_list): + for idx, image in tqdm(enumerate(img_list)): img_H, img_W = image.shape[0], image.shape[1] layout_res = layout_model(image, ignore_catids=[]) mfd_res = mfd_model.predict(image, imgsz=img_size, conf=conf_thres, iou=iou_thres, verbose=True)[0] @@ -188,6 +227,7 @@ def __getitem__(self, idx): cropped_img = Image.new('RGB', pil_img.size, 'white') cropped_img.paste(pil_img.crop(crop_box), crop_box) cropped_img = cv2.cvtColor(np.asarray(cropped_img), cv2.COLOR_RGB2BGR) + print(cropped_img.shape) ocr_res = ocr_model.ocr(cropped_img, mfd_res=single_page_mfdetrec_res)[0] if ocr_res: for box_ocr_res in ocr_res: diff --git a/script/batch_job_dispatch.sh 
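`batch_detect` above flattens the text crops of several pages into a single recognizer call and relies on the `boxes_partition` offsets to split the flat results back per page. A self-contained sketch of that bookkeeping, with dummy data standing in for real crops and recognition output:

```python
pages = [["crop_a", "crop_b"], ["crop_c"], []]      # detected crops for 3 pages

flat, partition = [], [0]
for page in pages:
    flat.extend(page)
    partition.append(len(flat))                     # cumulative cut points

rec_results = [f"text({c})" for c in flat]          # stand-in for text_recognizer output

per_page = [rec_results[partition[i]:partition[i + 1]] for i in range(len(pages))]
print(per_page)   # [['text(crop_a)', 'text(crop_b)'], ['text(crop_c)'], []]
```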
b/script/batch_job_dispatch.sh new file mode 100644 index 0000000..f390751 --- /dev/null +++ b/script/batch_job_dispatch.sh @@ -0,0 +1,34 @@ + +# FILEPATH=sci_index_files.addon.filelist #page_num_map.remain.filelist +# JOBSCRIPT=scan_and_get_page_num_map.py + +# FILEPATH=sci_index_files.finished.filelist +# JOBSCRIPT=check_the_detected_row_is_part_of_one_category.py + +FILEPATH=script/batch_job_dispatch.sh +JOBSCRIPT=scan_and_judge_ready_for_ocr.py + +# JOBSCRIPT=physics_collection/collect_and_upload_to_ceph.py +# FILEPATH=physics_collection/physics_collection.metadata.filelist + + + +PARTION_TOTAL=320 #32 # +PARTION_INTERVAL=16 # 32 +PARTION_NUM=$(($PARTION_TOTAL / $PARTION_INTERVAL)) +#echo $PARTION_NUM +for ((i=0; i .log/convert/thread.$TRUEINDICES.log 2>&1 & + pids[$CPU]=$! +done + +for pid in "${pids[@]}"; do + wait $pid +done + +echo "All processes have completed." diff --git a/script/merge_addon_part_back_into_cloud.py b/script/merge_addon_part_back_into_cloud.py new file mode 100644 index 0000000..862dc72 --- /dev/null +++ b/script/merge_addon_part_back_into_cloud.py @@ -0,0 +1,78 @@ +import os +import json +from tqdm.auto import tqdm +from simple_parsing import ArgumentParser +from batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig +from get_data_utils import * +def clean_pdf_path(pdf_path): + return pdf_path[len("opendata:"):] if pdf_path.startswith("opendata:") else pdf_path +def load_partition_pdf_mapping(): + + trace_id_to_original_partition = {} + original_partition_to_trace_id = {} + with open("analysis/better_addon.filelist",'r') as f: + lines = f.readlines() + for line in tqdm(lines,desc="read better_addon.filelist"): + line = line.strip().split() + original_partition = line[0] + if original_partition not in original_partition_to_trace_id: + original_partition_to_trace_id[original_partition] = [] + line = " ".join(line[1:]) + line = json.loads(line) + for metadata in line: + pdf_path = clean_pdf_path(metadata["path"]) + + original_partition_to_trace_id[original_partition].append(pdf_path) + trace_id_to_original_partition[pdf_path] = original_partition + + return original_partition_to_trace_id, trace_id_to_original_partition + +def load_pdf_results_map(): + ## then we read whole result into memory + pdf_path_to_result = {} + RESULT_ROOT_PATH = "analysis/add_on_metadata/metadata/layoutV6/result" + for result_file_name in tqdm(os.listdir(RESULT_ROOT_PATH),desc="read result"): + result_file_path = os.path.join(RESULT_ROOT_PATH,result_file_name) + + with open(result_file_path,'r') as f: + lines = f.readlines() + for line in tqdm(lines, desc=f"read {result_file_name}",leave=False): + line = line.strip() + line = json.loads(line) + pdf_path = clean_pdf_path(line["path"]) + pdf_path_to_result[pdf_path] = line + # break + return pdf_path_to_result + +pdf_path_to_result = load_pdf_results_map() +client = build_client() +def process_file(partition, pdf_path_list): + original_metadata_list = read_json_from_path(partition,client) + original_metadata_map = {clean_pdf_path(metadata["path"]):metadata for metadata in original_metadata_list} + do_we_add_Q = False + for pdf_path in pdf_path_list: + if pdf_path not in pdf_path_to_result: + #tqdm.write(f"pdf_path={pdf_path} not in result") + continue + if pdf_path in original_metadata_map: + continue + original_metadata_list.append(pdf_path_to_result[pdf_path]) + do_we_add_Q = True + if do_we_add_Q:write_jsonl_to_path(original_metadata_list,partition,client) + +def process_one_file_wrapper(args): + 
(partition, pdf_path_list), args = args + return process_file(partition, pdf_path_list) + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_arguments(BatchModeConfig, dest="config") + args = parser.parse_args() + args = args.config + args.task_name = "scan" + original_partition_to_trace_id, trace_id_to_original_partition = load_partition_pdf_mapping() + alread_processing_file_list = list(original_partition_to_trace_id.items()) + alread_processing_file_list = obtain_processed_filelist(args, alread_processing_file_list) + results = process_files(process_one_file_wrapper, alread_processing_file_list, args) + + diff --git a/script/merge_all_patch_back/merge_all_patch_back.py b/script/merge_all_patch_back/merge_all_patch_back.py new file mode 100644 index 0000000..1055145 --- /dev/null +++ b/script/merge_all_patch_back/merge_all_patch_back.py @@ -0,0 +1,48 @@ +import sys,os +sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))) +from batch_running_task.get_data_utils import * +from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig,dataclass +import json +from tqdm.auto import tqdm +from simple_parsing import ArgumentParser +import time +import subprocess +client = build_client() +OriginDATAROOT="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" + +from batch_running_task.utils import convert_boxes +output_width =1472 #pdf_metadata['width']#1472 +output_height=1920 #pdf_metadata['height']#1920 + + +client = build_client() +def process_file(result_path, args): + if result_path.startswith("s3:"): + result_path = "opendata:"+result_path + filename = os.path.basename(result_path) + target_file_path = os.path.join(os.path.dirname(os.path.dirname(result_path)),"final_20240923",filename) + if not args.redo and check_path_exists(target_file_path,client): + tqdm.write(f"skip {target_file_path}") + return + #target_file_path = "test.jsonl" + result = read_data_with_version(result_path,client) + tqdm.write(f"read {result_path} to {target_file_path}") + + write_jsonl_to_path(result,target_file_path ,client) + #return pdf_path_map_to_page_num + +def process_one_file_wrapper(args): + arxiv_path, args = args + return process_file(arxiv_path,args) + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_arguments(BatchModeConfig, dest="config") + args = parser.parse_args() + args = args.config + args.task_name = "scan" + alread_processing_file_list = obtain_processed_filelist(args) + results = process_files(process_one_file_wrapper, alread_processing_file_list, args) + + + \ No newline at end of file diff --git a/script/merge_all_patch_back/merge_mfr_patch_back.py b/script/merge_all_patch_back/merge_mfr_patch_back.py new file mode 100644 index 0000000..05d8491 --- /dev/null +++ b/script/merge_all_patch_back/merge_mfr_patch_back.py @@ -0,0 +1,45 @@ +import sys,os +sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))) +from batch_running_task.get_data_utils import * +from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig,dataclass +import json +from tqdm.auto import tqdm +from simple_parsing import ArgumentParser +import time +import subprocess +client = build_client() +OriginDATAROOT="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" + +from batch_running_task.utils import convert_boxes +output_width =1472 #pdf_metadata['width']#1472 
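These merge scripts shuttle results between ceph paths stored either as `s3://...` or as `opendata:s3://...`. A self-contained sketch of the path normalisation they all repeat (`clean_pdf_path` plus the `startswith("s3:")` prefixing), shown with a made-up key:

```python
def to_opendata(path: str) -> str:
    # Add the profile prefix expected by the storage client.
    return "opendata:" + path if path.startswith("s3:") else path

def clean_pdf_path(pdf_path: str) -> str:
    # Strip the prefix again when the bare s3 key is used as a lookup key.
    return pdf_path[len("opendata:"):] if pdf_path.startswith("opendata:") else pdf_path

p = "s3://some-bucket/scihub/part-00001.jsonl"   # made-up example key
assert clean_pdf_path(to_opendata(p)) == p
print(to_opendata(p))
```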
+output_height=1920 #pdf_metadata['height']#1920 + + +client = build_client() +def process_file(result_path, args): + if result_path.startswith("s3:"): + result_path = "opendata:"+result_path + filename = os.path.basename(result_path) + target_file_path = os.path.join(os.path.dirname(os.path.dirname(result_path)),"add_mfr",filename) + #result = read_data_with_mfr(result_path,client) + result = read_data_with_mfr(result_path,client) + #tqdm.write(f"read {result_path} to {target_file_path}") + + write_jsonl_to_path(result,target_file_path ,client) + #return pdf_path_map_to_page_num + +def process_one_file_wrapper(args): + arxiv_path, args = args + return process_file(arxiv_path,args) + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_arguments(BatchModeConfig, dest="config") + args = parser.parse_args() + args = args.config + args.task_name = "scan" + alread_processing_file_list = obtain_processed_filelist(args) + results = process_files(process_one_file_wrapper, alread_processing_file_list, args) + + + \ No newline at end of file diff --git a/script/obtain_whole_pdf_filepath.py b/script/obtain_whole_pdf_filepath.py new file mode 100644 index 0000000..c4cc0bb --- /dev/null +++ b/script/obtain_whole_pdf_filepath.py @@ -0,0 +1,44 @@ +from get_data_utils import * + +from pathlib import Path +import sys, os +from batch_run_utils import BatchModeConfig, obtain_processed_filelist, process_files,dataclass, save_analysis +from simple_parsing import ArgumentParser + +from petrel_client.client import Client # can only be imported after installation +client = Client(conf_path="~/petreloss.conf") # instantiate the Petrel Client; the APIs below are then available +@dataclass +class GetWholePDFConfig(BatchModeConfig): + task_name = 'get_whole_PDF_files' + +def check_one_path_wrapper(args): + metadata_json_path, args = args + data = read_json_from_path(metadata_json_path,client) + return data + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_arguments(GetWholePDFConfig, dest="config") + args = parser.parse_args() + args = args.config + + + #if args.mode == 'analysis': + alread_processing_file_list = obtain_processed_filelist(args) + results = process_files(check_one_path_wrapper, alread_processing_file_list, args) + whole_path_list = [] + for pathlist in results: + whole_path_list.extend(pathlist) + filename = os.path.basename(args.root_path).replace('.jsonl','.filelist') + with open(f"pdf_path_collections/{filename}",'w') as f: + f.write('\n'.join(whole_path_list)) + # #print(results) + # analysis= {} + # for arxivid, _type in results: + # if _type not in analysis: + # analysis[_type] = [] + # analysis[_type].append(arxivid) + + # totally_paper_num = len(alread_processing_file_list) + # save_analysis(analysis, totally_paper_num==1, args) diff --git a/script/scan_and_get_file_map/batch_job_dispatch.sh b/script/scan_and_get_file_map/batch_job_dispatch.sh new file mode 100644 index 0000000..ddd2109 --- /dev/null +++ b/script/scan_and_get_file_map/batch_job_dispatch.sh @@ -0,0 +1,34 @@ + +# FILEPATH=sci_index_files.addon.filelist #page_num_map.remain.filelist +# JOBSCRIPT=scan_and_get_page_num_map.py + +# FILEPATH=sci_index_files.finished.filelist +# JOBSCRIPT=check_the_detected_row_is_part_of_one_category.py + +FILEPATH=scihub_collection/sci_index_files.finished.filelist #scihub_collection/sci_index_files.filelist +JOBSCRIPT=script/scan_and_get_file_map/scan_and_track_id_for_each_file.py + +# JOBSCRIPT=physics_collection/collect_and_upload_to_ceph.py +# FILEPATH=physics_collection/physics_collection.metadata.filelist + + 
+ +PARTION_TOTAL=320 #32 # +PARTION_INTERVAL=16 # 32 +PARTION_NUM=$(($PARTION_TOTAL / $PARTION_INTERVAL)) +#echo $PARTION_NUM +for ((i=0; i .log/convert/thread.$TRUEINDICES.log 2>&1 & + pids[$CPU]=$! +done + +for pid in "${pids[@]}"; do + wait $pid +done + +echo "All processes have completed." diff --git a/script/scan_and_get_file_map/scan_and_track_id_for_each_file.py b/script/scan_and_get_file_map/scan_and_track_id_for_each_file.py new file mode 100644 index 0000000..bdd85b7 --- /dev/null +++ b/script/scan_and_get_file_map/scan_and_track_id_for_each_file.py @@ -0,0 +1,41 @@ +import sys,os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from get_data_utils import * +import json +from tqdm.auto import tqdm +from simple_parsing import ArgumentParser +from batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig +import time +import subprocess +PageInformationROOT="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/page_num_map" +OriginDATAROOT="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" + +client = build_client() +def process_file(metadata_file, args:BatchModeConfig): + pdf_path_map_to_page_num = [] + if metadata_file.startswith("s3:"): + metadata_file = "opendata:"+metadata_file + metadata_file_name = os.path.basename(metadata_file) + metadata_list = read_json_from_path(metadata_file,client) + track_id_list = [metadata["track_id"] for metadata in metadata_list] + return metadata_file + " " + ",".join(track_id_list) + +def process_one_file_wrapper(args): + arxiv_path, args = args + return process_file(arxiv_path,args) + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_arguments(BatchModeConfig, dest="config") + args = parser.parse_args() + args = args.config + args.task_name = "scan" + alread_processing_file_list = obtain_processed_filelist(args) + results = process_files(process_one_file_wrapper, alread_processing_file_list, args) + fold = os.path.join(args.savepath,f"track_id_for_each_file.split") + os.makedirs(fold,exist_ok=True) + savepath = os.path.join(fold,f"{args.start_index:07d}-{args.end_index:07d}") + with open(savepath,'w') as f: + for result in results: + f.write(result+'\n') + \ No newline at end of file diff --git a/script/scan_and_get_page_num_map/batch_job_dispatch.sh b/script/scan_and_get_page_num_map/batch_job_dispatch.sh new file mode 100644 index 0000000..7d910f5 --- /dev/null +++ b/script/scan_and_get_page_num_map/batch_job_dispatch.sh @@ -0,0 +1,25 @@ + +FILEPATH=physics_collection/physics.files.filelist #page_num_map.remain.filelist +JOBSCRIPT=script/scan_and_get_page_num_map/scan_and_get_page_num_map.py + + + +PARTION_TOTAL=320 #32 # +PARTION_INTERVAL=16 # 32 +PARTION_NUM=$(($PARTION_TOTAL / $PARTION_INTERVAL)) +#echo $PARTION_NUM +for ((i=0; i .log/convert/thread.$TRUEINDICES.log 2>&1 & + pids[$CPU]=$! +done + +for pid in "${pids[@]}"; do + wait $pid +done + +echo "All processes have completed." 
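`scan_and_track_id_for_each_file.py` above emits one line per metadata shard in the form `<metadata_file> <track_id_1>,<track_id_2>,...` and writes them into `track_id_for_each_file.split/<start>-<end>`. A small sketch of reading those shards back, using a made-up line:

```python
def parse_track_id_line(line: str):
    # Split on the first space: left is the metadata file, right the comma-joined track ids.
    metadata_file, _, ids = line.strip().partition(" ")
    return metadata_file, ids.split(",") if ids else []

sample = "opendata:s3://bucket/scihub/part-00001.jsonl id_a,id_b,id_c"  # made-up line
path, track_ids = parse_track_id_line(sample)
print(path, len(track_ids))   # ... 3
```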
diff --git a/script/scan_and_get_page_num_map/scan_and_get_page_num_map.py b/script/scan_and_get_page_num_map/scan_and_get_page_num_map.py new file mode 100644 index 0000000..14a5132 --- /dev/null +++ b/script/scan_and_get_page_num_map/scan_and_get_page_num_map.py @@ -0,0 +1,81 @@ +import sys,os + +sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))) +from batch_running_task.get_data_utils import * +from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig,dataclass + +import json +from tqdm.auto import tqdm + +from simple_parsing import ArgumentParser +import time +import subprocess +client = build_client() +OriginDATAROOT="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" + +@dataclass +class PageNumConfig(BatchModeConfig): + savepath: str = "opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared" + fullpage_check: bool = False +client = build_client() +def process_file(metadata_file, args:PageNumConfig): + if metadata_file.startswith("s3:"): metadata_file = "opendata:"+ metadata_file + pdf_path_map_to_page_num = [] + if "layoutV" in metadata_file: + filename = os.path.basename(metadata_file) + metadata_file = os.path.join(OriginDATAROOT,filename) + metadata_file_name = metadata_file.split("/")[-1].replace('.jsonl','.json') + target_file_path = os.path.join(args.savepath, f"page_num_map/{metadata_file_name}") + if not args.redo and check_path_exists(target_file_path, client): + tqdm.write(f"already processed {metadata_file}, we pass") + return + metadatalist = read_json_from_path(metadata_file, client) + tqdm.write(f"save to {target_file_path}") + iterater = tqdm(metadatalist,position=1,leave=False) if args.batch_num==0 else metadatalist + for metadata in iterater: + pdfpath = metadata['path'] + if pdfpath.startswith("s3:"): pdfpath = "opendata:"+ pdfpath + if not check_path_exists(pdfpath, client): + pdf_path_map_to_page_num.append([pdfpath, 0]) + continue + try: + pdf_buffer = read_pdf_from_path(pdfpath, client) + if args.fullpage_check: + page_id_list = range(len(pdf_buffer)) + else: + page_id_list = [0] + for page_id in page_id_list: + page = pdf_buffer.load_page(page_id) + dpi = 20 + pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72)) + except Exception as e: + tqdm.write(f"""error in loading pdf {pdfpath}, we pass""") + pdf_path_map_to_page_num.append([pdfpath, -1]) + print(e) + continue + pdf_path_map_to_page_num.append([pdfpath, len(pdf_buffer)]) + + write_json_to_path(pdf_path_map_to_page_num,target_file_path ,client) + #return pdf_path_map_to_page_num + +def process_one_file_wrapper(args): + arxiv_path, args = args + return process_file(arxiv_path,args) + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_arguments(PageNumConfig, dest="config") + args = parser.parse_args() + args = args.config + args.task_name = "scan" + alread_processing_file_list = obtain_processed_filelist(args) + results = process_files(process_one_file_wrapper, alread_processing_file_list, args) + # pdf_path_map_to_page_num_total = {} + # for pdf_path_map_to_page_num in results: + # pdf_path_map_to_page_num_total.update(pdf_path_map_to_page_num) + # with open(f"page_num_map/pdf_path_map_to_page_num.{args.index_part}.{args.num_parts}.json",'w') as f: + # json.dump(pdf_path_map_to_page_num_total,f) + + + + \ No newline at end of file diff --git a/script/scan_and_judge_det_ocr_status/batch_job_dispatch.sh b/script/scan_and_judge_det_ocr_status/batch_job_dispatch.sh 
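`scan_and_get_page_num_map.py` above records `[pdf_path, page_count]` pairs, with `0` for missing files and `-1` when the PDF cannot be opened or rendered. A minimal sketch of the per-PDF probe it performs with PyMuPDF (local path instead of the ceph client, first page only):

```python
import fitz  # PyMuPDF

def probe_pdf(path: str) -> int:
    # Return the page count, or -1 if the file cannot be opened or the first
    # page fails to rasterise at a tiny DPI.
    try:
        doc = fitz.open(path)
        page = doc.load_page(0)
        dpi = 20
        page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))
        return len(doc)
    except Exception:
        return -1

print(probe_pdf("example.pdf"))   # placeholder path
```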
new file mode 100644 index 0000000..4dae970 --- /dev/null +++ b/script/scan_and_judge_det_ocr_status/batch_job_dispatch.sh @@ -0,0 +1,34 @@ + +# FILEPATH=sci_index_files.addon.filelist #page_num_map.remain.filelist +# JOBSCRIPT=scan_and_get_page_num_map.py + +# FILEPATH=sci_index_files.finished.filelist +# JOBSCRIPT=check_the_detected_row_is_part_of_one_category.py + +FILEPATH=scihub_collection/sci_hub.finished.filelist +JOBSCRIPT=script/scan_and_judge_det_ocr_status/scan_and_judge_ready_for_ocr.py + +# JOBSCRIPT=physics_collection/collect_and_upload_to_ceph.py +# FILEPATH=physics_collection/physics_collection.metadata.filelist + + + +PARTION_TOTAL=320 #32 # +PARTION_INTERVAL=16 # 32 +PARTION_NUM=$(($PARTION_TOTAL / $PARTION_INTERVAL)) +#echo $PARTION_NUM +for ((i=0; i .log/convert/thread.$TRUEINDICES.log 2>&1 & + pids[$CPU]=$! +done + +for pid in "${pids[@]}"; do + wait $pid +done + +echo "All processes have completed." diff --git a/script/scan_and_judge_det_ocr_status/scan_and_judge_ready_for_ocr.py b/script/scan_and_judge_det_ocr_status/scan_and_judge_ready_for_ocr.py new file mode 100644 index 0000000..528ac5a --- /dev/null +++ b/script/scan_and_judge_det_ocr_status/scan_and_judge_ready_for_ocr.py @@ -0,0 +1,241 @@ +import sys,os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from get_data_utils import * +import json +from tqdm.auto import tqdm +from simple_parsing import ArgumentParser +from batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig, dataclass +import time +import subprocess +from typing import Dict, Any + + + +@dataclass +class StatusCheckConfig(BatchModeConfig): + PageInformationROOT:str="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/page_num_map" + OriginDATAROOT:str="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub" + use_patch:bool=False + use_candidate:bool=False + +def clean_pdf_path(pdf_path): + return pdf_path[len("opendata:"):] if pdf_path.startswith("opendata:") else pdf_path + +def get_page_info_path(metadata_file,args): + metadata_file_name = os.path.basename(metadata_file) + + if metadata_file.startswith("s3:"):metadata_file="opendata:"+metadata_file + if metadata_file.startswith("opendata:"): + page_information_file_path= os.path.join(args.PageInformationROOT, metadata_file_name.replace(".jsonl",".json")) + else: + root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(metadata_file)))) + page_information_file_path= os.path.join(root, "page_num_map", metadata_file_name.replace(".jsonl",".json")) + if not os.path.exists(page_information_file_path): + page_information_file_path = os.path.join(args.PageInformationROOT, metadata_file_name.replace(".jsonl",".json")) + #print(page_information_file_path) + return page_information_file_path + +def get_origin_path(metadata_file,args): + metadata_file_name = os.path.basename(metadata_file) + if metadata_file.startswith("opendata:") or metadata_file.startswith("s3:"): + origin_path= os.path.join(args.OriginDATAROOT, metadata_file_name) + else: + root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(metadata_file)))) + origin_path= os.path.join(root, "metadata", metadata_file_name) + return origin_path + +def get_box_status(box:Dict[str,Any]): + category_id = box['category_id'] + if category_id == 15: + if box.get('text',"") != "" or 'sub_boxes' in box: + return boxstatus.has_category_rec_and_get_rec + else: + return boxstatus.has_category_rec_without_rec + elif category_id in 
{0,1,2,4,6,7}: + return boxstatus.has_category_layout + elif category_id in {13, 14}: + if box.get('latex',"") != "": + return boxstatus.has_category_mfd_and_get_mfr + else: + return boxstatus.has_category_mfd_without_mfr + +def judge_one_page_status_via_box_status(box_status_list): + box_status_list = set(box_status_list) + #assert boxstatus.has_category_layout in box_status_list, f"box_status_list={box_status_list}" + if boxstatus.has_category_layout not in box_status_list: + return page_status.none + ## not mfd and rec + if (boxstatus.has_category_mfd_and_get_mfr not in box_status_list and boxstatus.has_category_rec_and_get_rec not in box_status_list + and boxstatus.has_category_mfd_without_mfr not in box_status_list and boxstatus.has_category_rec_without_rec not in box_status_list): + return page_status.only_have_layout + ## has mfd or rec but without get mfr or rec + elif (boxstatus.has_category_mfd_and_get_mfr not in box_status_list and boxstatus.has_category_rec_and_get_rec not in box_status_list): + return page_status.layout_complete + elif ( (boxstatus.has_category_mfd_and_get_mfr in box_status_list and boxstatus.has_category_rec_and_get_rec in box_status_list) + or (boxstatus.has_category_mfd_and_get_mfr in box_status_list and boxstatus.has_category_rec_without_rec not in box_status_list) + or (boxstatus.has_category_rec_and_get_rec in box_status_list and boxstatus.has_category_mfd_without_mfr not in box_status_list)): + return page_status.layout_complete_and_ocr_finished + elif (boxstatus.has_category_mfd_and_get_mfr in box_status_list and boxstatus.has_category_rec_without_rec in box_status_list): + return page_status.layout_complete_and_ocr_only_for_rec + elif (boxstatus.has_category_rec_and_get_rec in box_status_list and boxstatus.has_category_mfd_without_mfr in box_status_list): + return page_status.layout_complete_and_ocr_only_for_mfd + +def judge_one_pdf_status_via_page_status(page_status_list): + page_status_list = set(page_status_list) + if page_status.only_have_layout in page_status_list: + return pdf_status.layout_not_complete + elif page_status.layout_complete in page_status_list: + return pdf_status.layout_has_complete + elif page_status.layout_complete_and_ocr_finished in page_status_list: + return pdf_status.layout_complete_and_ocr_finished + elif page_status.layout_complete_and_ocr_only_for_rec in page_status_list: + return pdf_status.layout_complete_without_ocr + elif page_status.layout_complete_and_ocr_only_for_mfd in page_status_list: + return pdf_status.layout_complete_without_ocr + else: + return pdf_status.layout_not_complete + +def judge_package_status_via_pdf_status(pdf_status_list): + pdf_status_list = set(pdf_status_list) + if pdf_status.layout_not_complete in pdf_status_list: + return packstatus.layout_not_complete + elif pdf_status.layout_has_complete in pdf_status_list: + return packstatus.whole_layout_complete + elif pdf_status.layout_complete_and_ocr_finished in pdf_status_list: + return packstatus.whole_ocr_complete + else: + return packstatus.better_redo + + + +client = build_client() +def process_file(metadata_file, args:StatusCheckConfig): + pdf_path_map_to_page_num = [] + if metadata_file.startswith("s3:"): + metadata_file = "opendata:"+metadata_file + metadata_file_name = os.path.basename(metadata_file) + page_information_file_path= get_page_info_path(metadata_file,args) + if args.use_candidate: + new_path = metadata_file.replace('result/','final_layout/') + if check_path_exists(new_path,client): + metadata_file = new_path + if args.use_patch: 
+ metadata_list = read_data_with_patch(metadata_file,client) + else: + metadata_list = read_json_from_path(metadata_file,client) + page_information= read_json_from_path(page_information_file_path,client) + if len(metadata_list) != len(page_information) and len(page_information) - len(metadata_list) > 500: + exit_reason = packstatus.better_redo + return exit_reason, metadata_file, [metadata_file,"fail due to unmatch pdf number origin is {}, page information is {}".format(len(metadata_list), len(page_information))] + + + page_information_map = {} + for pdf_path, pdf_page_num in page_information: + if pdf_page_num <= 0: + continue + pdf_path = pdf_path[len("opendata:"):] if pdf_path.startswith("opendata:") else pdf_path + page_information_map[pdf_path] = pdf_page_num + + if len(metadata_list) != len(page_information_map) and len(page_information_map) - len(metadata_list) <= 500: + if len(metadata_list) < len(page_information_map): + exit_reason = packstatus.better_addon + missing_part= [] + origin_data_path = get_origin_path(metadata_file,args) + print(f"pdf num in page information is {len(page_information_map)} from {len(page_information)}") + print(f"pdf num in result is {len(metadata_list)}") + unique_pdf_path = set([clean_pdf_path(metadata['path']) for metadata in metadata_list]) + print(f"pdf num in origin unique is {len(unique_pdf_path)}") + not_in_page_information = set(page_information_map.keys()) - unique_pdf_path + print(f"the missing pdf is {not_in_page_information}") + for pdf_path in not_in_page_information: + print(f"pdf_path is {pdf_path}=>{page_information_map[pdf_path]}") + + metadata_list=read_json_from_path(origin_data_path,client) + print(f"pdf num in origin is {len(metadata_list)}") + + for pdf_id, metadata in enumerate(metadata_list): + origin_path_path = clean_pdf_path(metadata['path']) + if origin_path_path in not_in_page_information: + missing_part.append(metadata) + return exit_reason, metadata_file, [metadata_file, json.dumps(missing_part)] + else: + exit_reason = packstatus.check_the_page_information + print(f"fail due to unmatch pdf number origin is {len(metadata_list)}, page information is {len(page_information_map)} from {len(page_information)}") + return exit_reason, metadata_file, [metadata_file,f"fail due to unmatch pdf number origin is {len(metadata_list)}, page information is {len(page_information_map)} from {len(page_information)}"] + + + status_all_pdf = [] + for pdf_id, metadata in enumerate(metadata_list): + doc_layout_result = metadata['doc_layout_result'] + origin_path_path = metadata['path'] + origin_path_path = origin_path_path[len("opendata:"):] if origin_path_path.startswith("opendata:") else origin_path_path + assert origin_path_path in page_information_map, f"pdf_id={pdf_id} origin_path_path {origin_path_path} not in page_information_map" + pdf_page_num = page_information_map[origin_path_path] + track_id = metadata['track_id'] + status_for_this_pdf= {t:page_status.none for t in range(pdf_page_num)} + for page_meta in doc_layout_result: + page_id = page_meta['page_id'] + ### now do parser check + page_id = page_meta["page_id"] + layout_dets = page_meta["layout_dets"] + if len(layout_dets)==0: + continue + #raise ValueError(f"pdf_id={pdf_id} page_id={page_id} page_meta={page_meta} is empty") + box_status_list = [get_box_status(box) for box in layout_dets] + status_for_this_pdf[page_id] = judge_one_page_status_via_box_status(box_status_list) + + status_for_this_pdf_list_format = [status_for_this_pdf[page_id] for page_id in range(pdf_page_num)] + 
pdf_status_for_this_pdf = judge_one_pdf_status_via_page_status(status_for_this_pdf_list_format) + status_all_pdf.append([track_id, pdf_status_for_this_pdf, status_for_this_pdf_list_format]) + + exit_reason = judge_package_status_via_pdf_status([pdf_status for track_id, pdf_status, status_for_this_pdf_list_format in status_all_pdf]) + + return exit_reason, metadata_file,status_all_pdf + +def process_one_file_wrapper(args): + arxiv_path, args = args + return process_file(arxiv_path,args) + + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_arguments(StatusCheckConfig, dest="config") + args = parser.parse_args() + args = args.config + args.task_name = "scan" + alread_processing_file_list = obtain_processed_filelist(args) + results = process_files(process_one_file_wrapper, alread_processing_file_list, args) + analysis = {} + for exit_reason, metadata_file, status_all_pdf in results: + if exit_reason not in analysis: analysis[exit_reason] = [] + analysis[exit_reason].append([metadata_file, status_all_pdf]) + + # if exit_reason not in analysis: analysis[exit_reason] = [] + # if exit_reason == "all_complete": + # analysis[exit_reason].append(metadata_file) + # elif exit_reason == "not_complete": + # analysis[exit_reason].append({'file':metadata_file, "status":status_all_pdf}) + # else: + # analysis[exit_reason].append(status_all_pdf) + + for key, val in analysis.items(): + print(f"{key}=>{len(val)}") + fold = os.path.join(args.logpath,f"{key.lower()}.filelist.split") + logpath = os.path.join(fold,f"{args.start_index}-{args.end_index}") + os.makedirs(fold, exist_ok=True) + if key == "all_complete": + with open(logpath, 'w') as f: + for line in val: + f.write(line+'\n') + elif key == "not_complete": + with open(logpath, 'w') as f: + for line in val: + f.write(json.dumps(line)+'\n') + else: + with open(logpath, 'w') as f: + for metadata_file, status_all_pdf in val: + f.write(f"{metadata_file} "+ json.dumps(status_all_pdf) +'\n') + + + \ No newline at end of file
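For reference, a sketch of the category-id conventions that `get_box_status` above relies on, reconstructed from the layout class list in `modules/layoutlmv3/model_init.py` and the `13 + cls` offset applied to MFD boxes in `pdf_extract.py`; treating 13 as inline and 14 as isolated formulas is an assumption:

```python
LAYOUT_CLASSES = ["title", "plain text", "abandon", "figure", "figure_caption",
                  "table", "table_caption", "table_footnote",
                  "isolate_formula", "formula_caption"]   # category ids 0-9
MFD_IDS = {13: "inline formula (assumed)", 14: "isolated formula (assumed)"}  # carry 'latex'
OCR_TEXT_ID = 15                                          # text lines, carry 'text'

for cid in (1, 13, 15):
    if cid < len(LAYOUT_CLASSES):
        print(cid, "layout:", LAYOUT_CLASSES[cid])
    elif cid in MFD_IDS:
        print(cid, "MFD:", MFD_IDS[cid])
    elif cid == OCR_TEXT_ID:
        print(cid, "OCR text line")
```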