app.py
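"""Flask service for the BakingUp receipt scanner.

Preprocesses an uploaded receipt photo with OpenCV (orientation, edge
detection, tilt correction, cropping, and text enhancement), then asks
Gemini to extract ingredient names, prices, and quantities as JSON.
"""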
from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import google.generativeai as genai
from dotenv import load_dotenv
from werkzeug.utils import secure_filename
import json
import imutils
import cv2
import numpy as np
import matplotlib.pyplot as plt

load_dotenv()

app = Flask(__name__)
CORS(app)

api_key = os.environ.get('GEMINI_API_KEY')
if not api_key:
    raise RuntimeError("GEMINI_API_KEY environment variable is not set")
genai.configure(api_key=api_key)

def orient_vertical(img):
    # Rotate landscape images to portrait so the receipt runs top to bottom.
    height, width = img.shape[:2]
    if width > height:
        rotated = imutils.rotate(img, angle=270)
    else:
        rotated = img.copy()
    return rotated

def sharpen_edge(img):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (15, 15), 0)
    rectKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (11, 11))
    dilated = cv2.dilate(blurred, rectKernel, iterations=2)
    edged = cv2.Canny(dilated, 75, 200, apertureSize=3)
    return edged

def binarize(img, threshold):
    # Note: the passed-in threshold is overridden by the image's mean pixel value.
    threshold = np.mean(img)
    thresh, binary = cv2.threshold(img, threshold, 255, cv2.THRESH_BINARY)
    rectKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilated = cv2.dilate(binary, rectKernel, iterations=2)
    return dilated

def find_receipt_bounding_box(binary, img):
    global rect
    contours, hierarchy = cv2.findContours(
        binary, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    largest_cnt = max(contours, key=cv2.contourArea)
    rect = cv2.minAreaRect(largest_cnt)
    box = np.intp(cv2.boxPoints(rect))
    boxed = cv2.drawContours(img.copy(), [box], 0, (0, 255, 0), 20)
    return boxed, largest_cnt

def find_tilt_angle(largest_contour):
    angle = rect[2]  # Angle of the minimum-area rectangle found above
    print("Angle_0 = ", round(angle, 1))
    if angle < -45:
        angle += 90
        print("Angle_1:", round(angle, 1))
    uniform_angle = abs(angle)
    print("Uniform angle = ", round(uniform_angle, 1))
    return rect, uniform_angle

def adjust_tilt(img, angle):
    if 5 <= angle < 80:
        rotated_angle = 0
    elif angle < 5:
        rotated_angle = angle
    else:
        rotated_angle = 270 + angle
    tilt_adjusted = imutils.rotate(img, rotated_angle)
    delta = 360 - rotated_angle
    return tilt_adjusted, delta

def crop(img, largest_contour):
    x, y, w, h = cv2.boundingRect(largest_contour)
    cropped = img[y:y + h, x:x + w]
    return cropped

def enhance_txt(img):
    h, w = img.shape[:2]
    w1 = int(w * 0.05)
    w2 = int(w * 0.95)
    h1 = int(h * 0.05)
    h2 = int(h * 0.95)
    ROI = img[h1:h2, w1:w2]  # Central 90% of the image (5% margin on each side)
    threshold = np.mean(ROI) * 0.98  # Percentage of the average brightness
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (1, 1), 0)
    edged = 255 - cv2.Canny(blurred, 100, 150, apertureSize=7)  # Not used downstream
    thresh, binary = cv2.threshold(blurred, threshold, 255, cv2.THRESH_BINARY)
    return binary

def preprocess_image(image_path):
    raw_img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    raw_rgb = cv2.cvtColor(raw_img, cv2.COLOR_BGR2RGB)
    rotated = orient_vertical(raw_rgb)
    edged = sharpen_edge(rotated)
    binary = binarize(edged, 100)
    try:
        # Try to find the bounding box
        boxed, largest_cnt = find_receipt_bounding_box(binary, rotated)
        rect, angle = find_tilt_angle(largest_cnt)
        tilted, delta = adjust_tilt(boxed, angle)
        # Check whether cropping is needed
        x, y, w, h = cv2.boundingRect(largest_cnt)
        img_height, img_width = tilted.shape[:2]
        if w > img_width * 0.5 and h < img_height * 0.5:
            # Crop if the bounding box is significantly smaller than the image
            cropped = crop(tilted, largest_cnt)
        else:
            # Skip cropping and use the rotated image
            cropped = rotated
    except Exception as e:
        print(f"Error in find_receipt_bounding_box: {e}")
        # Skip the bounding box and use the rotated image directly
        cropped = rotated

    # Enhance text in the cropped or rotated image
    enhanced = enhance_txt(cropped)
    # enhance_txt returns a single-channel binary image, so convert from grayscale
    enhanced_rgb = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB)

    # Save the enhanced image to a temporary file
    filename = secure_filename(os.path.basename(image_path + "_enhanced.jpg"))
    temp_path = os.path.join('/tmp', filename)
    plt.imsave(temp_path, enhanced_rgb)
    return temp_path

def process_receipt(image_file):
    # Save the uploaded file to a temporary location
    filename = secure_filename(image_file.filename)
    temp_path = os.path.join('/tmp', filename)
    image_file.save(temp_path)

    # Preprocess the image
    processed_img_temp_path = preprocess_image(temp_path)

    # Upload the image to Gemini
    uploaded_image = genai.upload_file(processed_img_temp_path)

    # Create a Gemini model instance that responds with JSON
    model = genai.GenerativeModel(
        'gemini-1.5-flash',
        generation_config={"response_mime_type": "application/json"}
    )

    prompt = """
    Analyze the following image of a grocery store receipt, which may be in Thai or English:
    [Image]

    **Note:** The receipt image has been preprocessed to improve readability, including enhancing text contrast and reducing noise. Any faded text or low-contrast areas have likely been adjusted for better clarity.

    **Focus on identifying and extracting information for items that are likely ingredients.** For each ingredient:
    1. **Ingredient Name:** The name of the product or ingredient.
    2. **Price:** The price of the item in Thai Baht (฿).
       - **If the price is in a foreign currency (e.g., $, €, £):**
         1. Extract the numerical value.
         2. Use a reliable currency conversion API to get the current exchange rate.
         3. Convert the numerical value to Thai Baht.
         4. Format the result as a string containing only the number.
       - **If the price is in Thai Baht or has no currency symbol:**
         1. Extract the numerical value.
         2. Format the result as a string containing only the number.
    3. **Quantity:** The quantity purchased, as an integer string. If not explicitly stated, estimate the quantity based on image analysis or textual cues.

    **Exclude non-ingredient items** such as taxes, subtotal, total, store name, date, etc.

    Present the information in a structured JSON format, with each ingredient represented as an object containing the three fields: `ingredientName`, `price`, and `quantity`.
    """

    # Generate text based on the image
    result = model.generate_content([uploaded_image, "\n\n", prompt])

    # Remove the temporary files after the upload
    os.remove(processed_img_temp_path)
    os.remove(temp_path)

    # Convert the result text to JSON
    result_json = json.loads(result.text)

    # Return the parsed JSON
    return result_json

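# Illustrative only: the prompt above asks Gemini for a JSON structure along
# these lines (field names come from the prompt; the values are made up):
#
#   [
#       {"ingredientName": "Unsalted Butter", "price": "185", "quantity": "2"},
#       {"ingredientName": "All-Purpose Flour", "price": "42", "quantity": "1"}
#   ]
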
@app.route("/")
def hello_world():
return "<p>Welcome to BakingUp Receipt Scanner</p>"
@app.route('/scan_receipt', methods=['POST'])
def scan_receipt():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    # Process the uploaded file and return the extracted ingredients as JSON
    extracted_text = process_receipt(file)
    return jsonify(extracted_text)

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=8001)
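
# Example request (a sketch, assuming the server is running locally on port 8001
# and "receipt.jpg" is a receipt photo on disk):
#
#   import requests
#   with open("receipt.jpg", "rb") as f:
#       resp = requests.post("http://localhost:8001/scan_receipt", files={"file": f})
#   print(resp.json())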