codebasics · beladiyadarshan · Dec 15, 2020 · Dec 15, 2020 · Dec 15, 2020 · Dec 15, 2020
diff --git a/.gitignore b/.gitignore
@@ -8,7 +8,6 @@ __pycache__/
 
 # Distribution / packaging
 .Python
-build/
 develop-eggs/
 dist/
 downloads/
@@ -127,3 +126,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+.vscode
+
+.env
diff --git a/README.md b/README.md
@@ -1,2 +1,24 @@
 # python-simple-ocr-project
 A simple OCR Project using python with frontend in VueJS
+
+# Overview
+Repository for Medical-OCR.
+
+# Folder Structure
+1. **api** : backend python API
+
+2. **frontend**: frontend vue client
+
+# Setup Instructions For Local Environment
+1. Clone the repository
+```bash
+git clone git@github.com:beladiyadarshan/python-simple-ocr-project.git
+```
+
+2. [Set up the API](https://github.com/beladiyadarshan/python-simple-ocr-project/blob/main/api/README.md)
+
+3. [Set up the Frontend](https://github.com/beladiyadarshan/python-simple-ocr-project/blob/main/frontend/README.md)
+
+
+Note: You will need bash and git to install and get started with this project.
+1. Install Git or Git Bash (in case of Windows) ([Setup instructions](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git)).
diff --git a/api/.dockerignore b/api/.dockerignore
@@ -0,0 +1,5 @@
+development.yml
+Dockerfile
+production.yml
+docker-compose.yml
+docker-compose.yml.example
diff --git a/api/.gitignore b/api/.gitignore
@@ -0,0 +1,19 @@
+is_running
+docker-compose.yml
+.idea
+*.xlsm
+*.xlsx
+__pycache__
+logs
+*.log
+.vscode
+.vscode/*
+*.xls
+.cache/
+docker_data
+Backup_Reports
+docker_datagit
+Backup_Reports
+XDG_CACHE_HOME
+*.log.*
+*.pdf
diff --git a/api/Dockerfile b/api/Dockerfile
@@ -0,0 +1,29 @@
+FROM ubuntu:18.04
+
+# Run `apt-get update` in the same layer as every install: a separately
+# cached update layer would let later installs use a stale package index.
+RUN apt-get update --fix-missing \
+    && apt-get upgrade -y \
+    && apt-get install -y libsm6 libxext6 libxrender-dev libleptonica-dev liblept5 \
+    && apt-get -y install nginx \
+    && apt-get -y install python3-dev \
+    && apt-get -y install build-essential \
+    && apt-get -y install python3-pip \
+    && apt-get -y install software-properties-common \
+    && add-apt-repository -y ppa:alex-p/tesseract-ocr \
+    && apt-get -y update \
+    && apt-get -y install tesseract-ocr \
+    && apt-get -y install curl \
+    && apt-get -y install poppler-utils
+
+WORKDIR /project
+
+COPY ./requirements.txt /project/requirements.txt
+
+RUN pip3 install -r requirements.txt
+
+COPY ./ /project/
+
+COPY ./default /etc/nginx/sites-available/
+
+CMD ["gunicorn", "-b", "0.0.0.0:5001", "--workers=3", "--threads=3", "-t", "90", "--error-logfile", "/project/err.log", "--log-level=debug", "wsgi:app"]
diff --git a/api/README.md b/api/README.md
@@ -0,0 +1,37 @@
+# Setting up the API
+
+### With Docker
+1. Install Docker CE. ([Setup instructions](https://docs.docker.com/install/linux/docker-ce/ubuntu/))
+
+2. Install docker-compose. ([Setup instructions](https://docs.docker.com/compose/install/))
+
+3. Clone the project. (`https://github.com/beladiyadarshan/python-simple-ocr-project.git`)
+
+4. Copy docker-compose.yml.example and save it as docker-compose.yml.
+
+5. Build docker image
+```
+    docker build .
+```
+
+### Without Docker
+1. Install Python ([Setup instructions](https://wiki.python.org/moin/BeginnersGuide))
+
+2. Install tesseract ([Setup instructions](https://github.com/tesseract-ocr/tesseract#installing-tesseract))
+
+3. Install Python packages
+```
+pip3 install -r requirements.txt
+```
+
+# Running the API
+
+### With Docker
+```
+docker-compose up
+```
+
+### Without Docker
+```
+python3 app.py
+```
diff --git a/api/app.py b/api/app.py
@@ -0,0 +1,82 @@
+import os
+import sys
+import logging
+from flask import Flask, request, json
+from utils.generic_utils import get_random_string, allowed_file
+from parser.parser import parse
+from flask_cors import CORS
+ROOT_DIR = os.path.dirname(__file__)
+PARENT_DIR = os.path.dirname(__file__) + '/' + str(os.pardir)
+sys.path.append(PARENT_DIR)
+
+logging.basicConfig(level=logging.DEBUG)
+app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = "uploads"
+# Reject request bodies larger than 16 MiB.
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
+cors = CORS(app)
+
+
+@app.route('/ocr', methods=['POST'])
+def ocr():
+    """Run OCR on an uploaded file and return the extracted text and data.
+
+    Reads the ``format`` form field and the ``file`` upload from the request,
+    saves the upload under ``UPLOAD_FOLDER`` with a random ``.pdf`` name, and
+    passes the saved path and format to ``parse``.
+
+    Returns
+    -------
+    flask.Response
+        JSON ``{"text": ..., "data": ...}`` with status 200 on success, or
+        ``{"status": 0, "message": ..., "error": ...}`` with status 500 if
+        any step raises.
+    """
+    file_path = ''
+    try:
+        file_format = request.form['format']
+        file = request.files['file']
+
+        # save() fails if the upload directory is missing, so create it.
+        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+        file_path = app.config['UPLOAD_FOLDER'] + "/"\
+            + get_random_string(32) + ".pdf"
+        file.save(file_path)
+        text, data, error = parse(file_path, file_format) # noqa
+
+        app.logger.info("----------------------------------")
+        app.logger.info(f"Data: {data}")
+        app.logger.info("----------------------------------")
+        return app.response_class(
+            response=json.dumps({
+                "text": text,
+                "data": data
+            }),
+            status=200,
+            mimetype='application/json'
+        )
+
+    except Exception as e:
+        # Top-level request boundary: log the traceback, answer with JSON.
+        app.logger.exception("OCR request failed")
+        return app.response_class(
+            response=json.dumps({
+                "status": 0,
+                "message": "Some error occurred",
+                "error": str(e)
+            }),
+            status=500,
+            mimetype='application/json'
+        )
+
+    finally:
+        # Single cleanup point for both paths. The file is absent when the
+        # failure happened in save() or earlier, so removal must not raise.
+        if file_path and os.path.exists(file_path):
+            os.remove(file_path)
+
+
+if __name__ == "__main__":
+    # The Werkzeug debugger allows arbitrary code execution, and this server
+    # binds to all interfaces, so debug mode is opt-in via FLASK_DEBUG=1.
+    app.run(host='0.0.0.0', port=5001, debug=os.getenv('FLASK_DEBUG') == '1')
diff --git a/api/default b/api/default
@@ -0,0 +1,41 @@
+##
+# You should look at the following URL's in order to grasp a solid understanding
+# of Nginx configuration files in order to fully unleash the power of Nginx.
+# http://wiki.nginx.org/Pitfalls
+# http://wiki.nginx.org/QuickStart
+# http://wiki.nginx.org/Configuration
+#
+# Generally, you will want to move this file somewhere, and start with a clean
+# file but keep this around for reference. Or just disable in sites-enabled.
+#
+# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples.
+##
+
+# Default server configuration
+#
+server {
+	listen 80 default_server;
+	listen [::]:80 default_server;
+
+	# SSL configuration
+	#
+	# listen 443 ssl default_server;
+	# listen [::]:443 ssl default_server;
+	#
+	# Self signed certs generated by the ssl-cert package
+	# Don't use them in a production server!
+	#
+	# include snippets/snakeoil.conf;
+
+	root /var/www/html;
+
+	# Add index.php to the list if you are using PHP
+	index index.html index.htm index.nginx-debian.html;
+
+	server_name _;
+
+	location / {
+		include proxy_params;
+		proxy_pass http://ocr-api:5001;
+	}
+}
diff --git a/api/docker-compose.yml.example b/api/docker-compose.yml.example
@@ -0,0 +1,18 @@
+version: '3'
+
+services:
+  app:
+    container_name: ocr-api
+    build: .
+    volumes:
+      - './:/project/'
+    restart: always
+  web:
+    image: nginx
+    volumes:
+      - './nginx.conf:/etc/nginx/conf.d/default.conf'
+    ports:
+      - '5001:80'
+    links:
+      - app
+    restart: always
diff --git a/api/extractor/__init__.py b/api/extractor/__init__.py
@@ -0,0 +1,6 @@
+from extractor import patient_details
+
+
+FUNCTIONS = {
+    'patient_details': patient_details
+}
diff --git a/api/extractor/extract_details.py b/api/extractor/extract_details.py
@@ -0,0 +1,29 @@
+from flask import current_app as app
+from utils.image_utils import get_text_from_image_list
+from . import FUNCTIONS
+
+
+def extract_details(page_list, file_format):
+    """Extract the text and the structured details from a list of pages.
+
+    The text is logged, then passed to FUNCTIONS[file_format].extract_details.
+
+    Parameters
+    ----------
+    page_list : list(np.ndarray)
+        list of pages extracted from pdf
+    file_format : str
+        key into FUNCTIONS selecting the extractor module to use
+
+    Returns
+    -------
+    text: str
+        text as extracted from the set of images
+    data: list(tuple)
+        data stored as list of tuples
+    """
+
+    text = get_text_from_image_list(page_list)
+    app.logger.info(text)
+    data = FUNCTIONS[file_format].extract_details(text)  # KeyError if unknown
+    return text, data