-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
145 lines (111 loc) · 5.05 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from flask import Flask, render_template, request # Flask Library
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # sklearn Library
import pickle # Read Pickle File
import filetype
import filehandle # Import filehandle.py -- Read PDF & WORD
import textprocess_ML # Import textprocess_ML.py -- Preprocess clean text list
import magic # Checks for file Format
app = Flask(__name__)
from datetime import datetime
# Landing page: GET /
@app.route('/')
def index():
    """Render the `index.html` template with no extra context."""
    landing_page = render_template('index.html')
    return landing_page
def _render_prediction(filename, reader, started, success_format, error_format, error_message):
    """Extract text with *reader*, classify it and render `index.html`.

    Args:
        filename: Name of the uploaded document, passed to *reader* as the
            `filename` keyword and echoed back to the template.
        reader: Callable from `filehandle` (`wordtolist` or `pdftolist`)
            that turns the stored document into a list of text.
        started: `datetime` taken at the start of the request; forwarded to
            `predict` so it can compute the elapsed time.
        success_format: Value of the `format` template variable on success.
        error_format: Value of the `format` template variable on failure.
        error_message: Value of the `err` template variable on failure.

    Returns:
        The rendered `index.html` page, with either the two predicted tags
        and the duration, or the error message.
    """
    try:
        tags = predict(reader(filename=filename), started)
    except Exception:
        # Reading or classifying the document failed; keep the original
        # best-effort behaviour (show an error page) but leave a trace in
        # the log instead of swallowing the exception silently.
        app.logger.exception("Could not classify %r (%s)", filename, error_format)
        return render_template('index.html', err=error_message, filename=filename, format=error_format)
    # predict() returns [[naive_bayes_tag, decision_tree_tag], duration].
    return render_template('index.html', tag1=tags[0][0], tag2=tags[0][1], filename=filename, format=success_format, duration=tags[1])


# GET | POST Predict Route
@app.route('/predict', methods=['GET', 'POST'])
def file():
    """Handle a document upload and show the predicted tags.

    On POST the uploaded file is stored under `Documents_Uploaded/`, its
    type is detected with `magic.from_file`, and the matching reader from
    `filehandle` is used to extract the text that `predict` classifies.
    On GET the plain upload page is rendered.

    Returns:
        The rendered `index.html` page (result, error message, or the
        empty form for GET).
    """
    import os  # function-scope import: only needed to sanitise the file name

    if request.method != 'POST':
        # The route is registered for GET as well; returning None (as the
        # code did before) makes Flask fail with an invalid-response error.
        return render_template('index.html')

    # Start Time
    now = datetime.now()
    print("Current Time =", now.strftime("%H:%M:%S"))

    file_error = "Not a valid document type \n Please select PDF or WORD"

    # File Request
    upload = request.files.get('file')
    raw_name = upload.filename if upload is not None and upload.filename else ""
    # The file name is client-controlled: drop any directory components
    # (both separator styles) so it cannot escape Documents_Uploaded/.
    safe_name = os.path.basename(raw_name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        # Nothing was uploaded, or the name is not a usable file name.
        return render_template('index.html', err=file_error, filename=raw_name)

    saved_path = "Documents_Uploaded/" + safe_name
    # Error Check for Documents_Uploaded Folder
    try:
        upload.save(saved_path)
        file_type = magic.from_file(saved_path)
    except Exception:
        app.logger.exception("Could not store or inspect upload %r", safe_name)
        return render_template('index.html', err=file_error, filename=safe_name)

    # File Format Check: look for whole words in the libmagic description.
    # NOTE(review): the filehandle readers receive only the file name and
    # presumably resolve it inside Documents_Uploaded/ -- confirm.
    type_tokens = file_type.split(" ")
    # Format: WORD
    if "Word" in type_tokens:
        return _render_prediction(safe_name, filehandle.wordtolist, now, "WORD", "WORD", file_error)
    # Format: PDF
    if "PDF" in type_tokens:
        return _render_prediction(safe_name, filehandle.pdftolist, now, "PDF", "PDF", "This PDF file has no content in it")
    # Format: ZIP -- certain Word file formats are reported as ZIP
    if "Zip" in type_tokens:
        return _render_prediction(safe_name, filehandle.wordtolist, now, "WORD", "ZIP", file_error)
    # Format: Unknown -- still try to read it as a Word document
    return _render_prediction(safe_name, filehandle.wordtolist, now, "WORD", "Unknow Format", file_error)
# Classify extracted text with both pickled models
def predict(textlist,now):
    """Predict the tag of a document with the Naive Bayes and Decision Tree models.

    The vectorizer and both models are unpickled from files named
    `vectorizer.pickle`, `1NB_model` and `2DT_model`, opened by relative
    path. The text list is cleaned with `textprocess_ML.clean_textlist`,
    vectorized, and each model's predictions are reduced to one tag by
    `add_freq_element`.

    Args:
        textlist: Text list extracted from the uploaded document.
        now: `datetime` marking when processing started.

    Returns:
        `[[naive_bayes_tag, decision_tree_tag], duration]`, where
        `duration` is the elapsed time since `now` as a string with any
        fractional seconds cut off.
    """
    def _unpickle(path):
        # Load one pickled object, closing the file right afterwards.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Vectorizer first, then clean the text and convert it to vector form
    vectorizer = _unpickle('vectorizer.pickle')
    features = vectorizer.transform(textprocess_ML.clean_textlist(textlist))

    # Both classifiers are loaded before either one is asked to predict
    nb_model = _unpickle('1NB_model')
    dt_model = _unpickle('2DT_model')

    nb_tag = add_freq_element(nb_model.predict(features).tolist(), "Naive Bayes")
    dt_tag = add_freq_element(dt_model.predict(features).tolist(), "Decision Tree")

    # Time taken to complete the prediction, without the fractional part
    elapsed = str(datetime.now() - now).partition(".")[0]

    return [[nb_tag, dt_tag], elapsed]
def add_freq_element(predict_list,model_name):
    """Return the tag name of the most frequent prediction.

    Args:
        predict_list: List of predicted class indexes (0-4), one per text
            item. Ties are resolved in favour of the value that appears
            first in the list.
        model_name: Label of the model, used only in the progress message.

    Returns:
        One of 'Business', 'Entertainment', 'Politics', 'Sport',
        'Technology'.

    Raises:
        ValueError: If `predict_list` is empty.
        IndexError: If the most frequent value is not a valid tag index.
    """
    from collections import Counter  # function-scope import keeps the block self-contained

    if not predict_list:
        raise ValueError("predict_list is empty; no tag can be chosen")

    # Count once (O(n)) instead of calling list.count for every element.
    # Counter keeps first-occurrence order, so max() breaks ties the same
    # way the previous max(list, key=list.count) did.
    counts = Counter(predict_list)
    tag_value = max(counts, key=counts.__getitem__)

    tag_list = ('Business', 'Entertainment', 'Politics', 'Sport', 'Technology')
    tags = tag_list[tag_value]
    # Completion of Prediction Model
    print(model_name,"Tag Count Ready")
    return tags
# Script entry point: start Flask's built-in development server.
# NOTE(review): debug=True turns on the interactive debugger and the
# reloader; this must not be used when the app is reachable by untrusted
# clients -- confirm this entry point is for local development only.
if __name__ == "__main__":
    app.run(debug=True)