This repository has been archived by the owner on Jul 27, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbackend.py
129 lines (108 loc) · 3.53 KB
/
backend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from flask import Flask, request, jsonify
import time
import json
import regex
# Flask application object that the route decorators in this file register against.
app = Flask(__name__)

# In-memory rule store: ruleset name -> {field name -> pattern string}.
# Written by add_rule() and read by extract(); nothing in this file persists it.
regexps = {}
@app.route("/")
def echo():
    """Report that the service is running.

    Returns:
        The JSON text of the object {"started": "true"}.
    """
    status = {"started": "true"}
    return json.dumps(status)
# (probe regex, character-class fragment) pairs used to describe sample data.
_CLASS_FRAGMENTS = (
    (r'[a-z]', 'a-z'),
    (r'[A-Z]', 'A-Z'),
    (r'[0-9]', '0-9'),
    (r'\s', r'\s'),
)


def _data_group(data):
    """Return a capturing group matching text shaped like the sample *data*."""
    allowed = ''.join(
        fragment for probe, fragment in _CLASS_FRAGMENTS
        if regex.search(probe, data)
    )
    punctuation = set(regex.findall(r'[^a-zA-Z0-9\s]', data))
    if punctuation:
        # Inside [...] a '.' is a literal dot, not "any character", so the
        # punctuation actually seen in the sample has to be listed explicitly.
        # The dot is kept so every string the old pattern accepted still matches.
        punctuation.add('.')
        allowed += ''.join(regex.escape(char) for char in sorted(punctuation))
    # Allow values up to 20 characters longer than the sample.
    return '([' + allowed + ']{0,' + str(len(data) + 20) + '})'


def _field_pattern(data, context):
    """Return *context* as a literal pattern with each *data* occurrence generalized."""
    # Escape the surrounding text so metacharacters in it are matched literally
    # and cannot add capture groups that would shift the data group's index.
    literal_parts = (regex.escape(part) for part in context.split(data))
    return _data_group(data).join(literal_parts)


@app.route("/add_rule", methods=['POST'])
def add_rule():
    """Register the extraction rules for one document type.

    Each tag is turned into a pattern string: the sample data inside the
    context is replaced by a capturing group describing the kinds of
    characters seen in the sample, and the rest of the context is matched
    literally. The patterns are stored in the module-level ``regexps`` dict
    under the ruleset name, replacing any rules stored there before.

    JSON Args:
        ruleset: Name of the document type eg. Cargo-Slips, Insurance-Documents
        tags: a list of objects, each with:
            field: Name of the field. eg. "Insured"
            data: Info of interest
            context: Context around the data (including the data).
                eg. "Insured: ABC, Inc and/or Associated"

    Demo data:
        {
            "ruleset":"Cargo Slip",
            "tags":[
                {
                    "field":"Company-Name",
                    "data":"Lloyd & Partners Pvt. Ltd.",
                    "context":"are Lloyd & Partners Pvt. Ltd. not"
                },
                {
                    "field":"Balance-Paid",
                    "data":"1,500,000",
                    "context":"USD 1,500,000 any"
                },
                {
                    "field":"Policy-ID",
                    "data":"B0000DC1234567000",
                    "context":"Reference: B0000DC1234567000 Attaching"
                }
            ]
        }

    Returns:
        The string "200" when every tag was accepted, or the string
        'Context should contain data.' when a tag's data is empty or does not
        occur in its context. In the rejected case nothing is stored.

    Raises:
        KeyError: If 'ruleset', 'tags', or a tag's 'field', 'data' or
            'context' key is missing from the request JSON.
    """
    ruleset = request.json['ruleset']
    # Collect into a local dict first: a rejected request must not wipe or
    # half-overwrite a ruleset that was registered earlier.
    patterns = dict()
    for field_dict in request.json['tags']:
        context = field_dict['context']
        field = field_dict['field']
        data = field_dict['data']
        # An empty string is "in" every context but cannot be generalized
        # into a valid character class, so it is rejected as well.
        if not data or data not in context:
            # NOTE(review): this is sent with the default success status;
            # consider a 4xx status once clients are known to tolerate it.
            return 'Context should contain data.'
        patterns[field] = _field_pattern(data, context)
    regexps[ruleset] = patterns
    return "200"
@app.route("/extract", methods=['POST'])
def extract():
    """Apply a stored ruleset to the text of one or more files.

    Every pattern stored for the ruleset is searched for in each file's
    content, wrapped as "(?b)(<pattern>){e<=7}" (the regex module's
    best-match flag with a fuzzy limit of at most 7 errors).

    JSON Args:
        ruleset: Name of the document type eg. Cargo-Slips, Insurance-Documents
        files: a list of objects, each with:
            name: Identifier of the file, used as the key in the response.
            content: Text extracted from the document through OCR.

    Demo Data:
        {
            "ruleset":"Cargo Slip",
            "files":[
                {
                    "name": "/home/akshay/Downloads/SOV_2.pdf",
                    "content": "are Google Inc. not USD 20,000 any Reference: F0000DC1234567000 Attaching"
                },
                {
                    "name": "/home/akshay/Downloads/Sample_2_IIT.pdf",
                    "content": "are Facebook LLC. not USD 100,000 any Reference: F0000FF1234567000 Attaching"
                }
            ]
        }

    Returns:
        A JSON object mapping each file name to a list of
        {"key": <field>, "val": <captured text>} entries, one per field of
        the ruleset. "val" is a single space when the pattern did not match
        or the match has no second group. A ruleset that was never
        registered produces an empty list for every file.
    """
    wanted_ruleset = request.json['ruleset']
    extracted = {}
    for document in request.json['files']:
        doc_name, doc_text = document['name'], document['content']
        extracted[doc_name] = []
        field_patterns = regexps.get(wanted_ruleset, {})
        for field_name, field_pattern in field_patterns.items():
            found = regex.search("(?b)(%s){e<=7}" % field_pattern, doc_text)
            # Group 1 is the wrapper added above; the data capture is group 2.
            captured = found.groups() if found is not None else ()
            value = captured[1] if len(captured) > 1 else " "
            extracted[doc_name].append({"key": field_name, "val": value})
    return jsonify(extracted)
if __name__ == "__main__":
    # Started directly as a script: serve on the loopback interface only.
    app.run(port=5122, host='127.0.0.1')