-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
760 lines (623 loc) · 32.5 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
from flask import Flask, render_template, request, jsonify
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from sqlalchemy import inspect
from sqlalchemy import Table, MetaData
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sqlalchemy.ext.declarative import declarative_base
from database_utils import *
import pandas as pd
import pymysql
import json
import random
import requests
from datetime import datetime
from rules import apply_rules, apply_rules_to_database
from flask_cors import CORS
from relationships import define_relationships
from sqlalchemy import inspect
import logging
from dateutil.parser import parse
# Setup basic logging: DEBUG level, each record prefixed with time and level.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Register PyMySQL under the MySQLdb module name.
pymysql.install_as_MySQLdb()
# NOTE(review): not defined in this file; presumably supplied by the
# `from database_utils import *` star import -- confirm.
create_clean_table_if_not_exists()
# Table names used throughout this module.
TABLE_NAME = 'db_messy'
CLEAN_TABLE_NAME = 'db_clean'
define_relationships(config_file_path='relationships_config.json')
# At the start of your app to create tables and apply rules
# NOTE(review): `app`, `db` and `create_model_class` are not defined in the
# visible part of this file; presumably they also come from database_utils.
with app.app_context():
    # Module-level model for the messy table; the routes below read it.
    model_class = create_model_class(TABLE_NAME)
    # checkfirst=True: only issue CREATE TABLE when the table is absent.
    model_class.__table__.create(bind=db.engine, checkfirst=True)
    apply_rules_to_database(TABLE_NAME)  # Assuming this function now takes a table name
    clean_model_class = create_model_class(CLEAN_TABLE_NAME)
# Load relationships configuration from JSON file
def load_relationships_config():
    """Return the ``relationships`` list from ``relationships_config.json``.

    The file is looked up relative to the current working directory and is
    decoded as UTF-8 rather than with the platform's default encoding, so
    non-ASCII column names load the same way on every machine.

    Returns:
        The value stored under the top-level ``relationships`` key, or an
        empty list when that key is absent.
    """
    with open('relationships_config.json', 'r', encoding='utf-8') as file:
        config = json.load(file)
    return config.get('relationships', [])
def fetch_data_with_primary_key():
    """Return all rows queried from the module-level ``model_class``."""
    # Fetch data including primary key
    return model_class.query.all()
# Find related column based on relationships configuration
def find_related_column(column, relationships):
    """Look up the column related to ``column``.

    Args:
        column: Name to match against each entry's ``main_column``.
        relationships: Iterable of mappings with ``main_column`` and
            ``related_column`` keys.

    Returns:
        The ``related_column`` of the first matching entry, or ``None``
        when no entry matches.
    """
    candidates = (
        entry['related_column']
        for entry in relationships
        if column == entry['main_column']
    )
    return next(candidates, None)
def update_database_with_cleaned_data(cleaned_dataframe, model_class, primary_key_variations):
    """Write the rows of ``cleaned_dataframe`` back to ``model_class``'s table.

    Args:
        cleaned_dataframe: DataFrame whose rows carry the new column values.
        model_class: Mapped class whose rows are updated.
        primary_key_variations: Candidate primary-key column names; the
            first one present in the DataFrame is used to locate each row.

    All row updates are committed together after the loop. Any failure is
    reported on stdout (not re-raised) and the session is rolled back so
    that it remains usable for later requests.
    """
    try:
        # Every row shares the DataFrame's columns, so the key column only
        # needs to be resolved once instead of once per row.
        primary_key_column = next(
            (col for col in primary_key_variations if col in cleaned_dataframe.columns),
            None,
        )
        if primary_key_column:
            key_attribute = getattr(model_class, primary_key_column)
            for _, row in cleaned_dataframe.iterrows():
                # Update the row in the database based on the found primary key column
                primary_key_value = row[primary_key_column]
                print(f"Updating row with primary key {primary_key_column}={primary_key_value} in the database")
                db.session.query(model_class).filter(key_attribute == primary_key_value).update(row.to_dict())
        db.session.commit()
    except Exception as e:
        # Without a rollback the session stays in a failed state.
        db.session.rollback()
        print(f"Error committing changes to the database: {e}")
# Function to check correction needed
def check_correction_needed(row):
    """
    Report whether a row needs correction.

    A row needs correction when ``pd.isnull`` flags at least one of its
    values. Returns 1 in that case and 0 otherwise.
    """
    has_missing_value = any(pd.isnull(row))
    return 1 if has_missing_value else 0
# Function to check existing data against rules
def check_existing_data(dataframe):
    """Collect messages for ``Year`` values that cannot be read as integers.

    Args:
        dataframe: DataFrame with a ``Year`` column.

    Returns:
        A list with one message per row whose ``Year`` value makes
        ``int()`` fail. Values such as ``None`` raise ``TypeError`` rather
        than ``ValueError``; both are reported instead of crashing.
    """
    existing_data_errors = []
    # Example: Check 'Year' column for non-integer values
    for _, row in dataframe.iterrows():
        year = row['Year']
        try:
            int(year)
        except (TypeError, ValueError):
            existing_data_errors.append(
                f"Existing data error: The 'Year' data for the year {year} is not an integer."
            )
    # Add more checks for other columns later
    return existing_data_errors
# Function to train a classification model
def train_classification_model(dataframe, relationships_config):
    """Train a random-forest model that predicts ``Correction_Needed``.

    Each row is turned into one text document (its values joined by
    spaces), vectorised with TF-IDF and used to fit a
    ``RandomForestClassifier``. The label is 1 when the row contains a
    missing value (see ``check_correction_needed``) and 0 otherwise.

    Side effects:
        Adds or overwrites the ``Correction_Needed`` column on
        ``dataframe`` and prints the accuracy on the 20% held-out split.

    Args:
        dataframe: Data to train on; modified in place as described above.
        relationships_config: Accepted for interface compatibility; not
            used by the training itself.

    Returns:
        ``(classifier, vectorizer)`` -- the fitted model and the fitted
        TF-IDF vectorizer.
    """
    # Leave the label out of the feature text; previously it was appended
    # to every document because it had already been added to the frame.
    feature_columns = [column for column in dataframe.columns if column != 'Correction_Needed']
    source = dataframe[feature_columns]
    # Feature extraction
    features = source.apply(lambda row: ' '.join(str(value) for value in row), axis=1)
    # 'Correction_Needed' indicates whether correction is needed (1) or not (0)
    dataframe['Correction_Needed'] = source.apply(check_correction_needed, axis=1)
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, dataframe['Correction_Needed'], test_size=0.2, random_state=42
    )
    # Use TF-IDF Vectorizer to convert text data into numerical features
    vectorizer = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    # Train a Random Forest Classifier
    classifier = RandomForestClassifier()
    classifier.fit(X_train_vec, y_train)
    # Evaluate the model on the test set
    y_pred = classifier.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy}")
    return classifier, vectorizer
# Function to generate questions using the machine learning model
def generate_questions_ml(dataframe, relationships_config, model_class):
    """Build one confirmation question per row and configured relationship.

    Args:
        dataframe: Raw table contents; also used to infer column datatypes.
        relationships_config: List of mappings with ``main_column`` and
            ``related_column`` keys.
        model_class: Mapped class passed on to ``apply_rules``.

    Returns:
        A list of question dicts with the keys ``type``, ``name``,
        ``message``, ``default`` and ``data``. ``data`` carries the row ID,
        both column names, both values and both inferred datatypes.

    Side effects:
        Runs ``apply_rules`` and ``train_classification_model`` on the data
        and prints the number of generated questions.
    """
    # Apply rules; only the cleaned frame is needed here.
    cleaned_dataframe, _flagged_values, _primary_keys, _column_datatypes = apply_rules(dataframe, model_class)
    # The model is still trained (it prints its accuracy and adds the
    # 'Correction_Needed' column). The per-row prediction that used to
    # follow was dropped: it vectorised each column as a separate document,
    # which does not match the training features, and its result was never
    # used.
    train_classification_model(cleaned_dataframe, relationships_config)

    # determine_column_datatype scans a whole column, so compute it at most
    # once per column, and only when a question actually needs it.
    datatype_cache = {}

    def column_datatype(column_name):
        if column_name not in datatype_cache:
            datatype_cache[column_name] = determine_column_datatype(dataframe[column_name])
        return datatype_cache[column_name]

    questions = []
    # Keep track of rows to avoid duplicate questions
    processed_rows = set()
    for index, row in cleaned_dataframe.iterrows():
        if index in processed_rows:
            continue
        processed_rows.add(index)
        # The primary key column is assumed to be named 'ID'.
        row_id = row['ID']
        # Generate a question for each relationship
        for relationship in relationships_config:
            main_column = relationship['main_column']
            related_column = relationship['related_column']
            main_value = row[main_column]
            related_value = row[related_column]
            question_data = {
                'row_id': row_id,
                'mainColumn': main_column,
                'mainValue': main_value,
                'relatedColumn': related_column,
                'relatedValue': related_value,
                'mainColumnDatatype': column_datatype(main_column),
                'relatedColumnDatatype': column_datatype(related_column),
            }
            # Check if either main or related value is missing or empty
            if pd.isnull(main_value) or main_value == '' or pd.isnull(related_value) or related_value == '':
                question_text = f"The {related_column}: {related_value}_____ information for the {main_column}: {main_value} is missing. Please provide the missing answer"
            else:
                question_text = f"The {main_column}: {main_value} has {related_value} as {related_column}. Do you want to modify the data?"
            questions.append({
                'type': 'confirm',
                'name': f'q_{index}_ml_correct_{main_column}_{related_column}',
                'message': question_text,
                'default': True,
                'data': question_data,
            })
    print(f"Total questions generated: {len(questions)}")
    return questions
def generate_question_for_relationship(row, relationship, relationships_config, prediction):
    """Return the question text for one row and one relationship.

    Args:
        row: Mapping-like object indexed by column name.
        relationship: Mapping with ``main_column`` and ``related_column``.
        relationships_config: Unused; kept for interface compatibility.
        prediction: Unused; kept for interface compatibility.

    Returns:
        A "missing" prompt when either value is null or an empty string,
        otherwise a prompt asking whether the pair should be modified.
        Empty strings count as missing here, in line with the other
        question generators in this module.
    """
    # Get main column and related column from the relationship
    main_column = relationship['main_column']
    related_column = relationship['related_column']
    # Get values for the selected columns
    main_value = row[main_column]
    related_value = row[related_column]

    def is_missing(value):
        return pd.isnull(value) or value == ''

    if is_missing(main_value) or is_missing(related_value):
        # If missing, generate a question explicitly mentioning "missing"
        return f"The {related_column}: {related_value}_____ information for the {main_column}: {main_value} is missing. Please provide the missing answer"
    return f"The {main_column}: {main_value} has {related_value} as {related_column}. Do you want to modify the data?"
@app.route('/')
def index():
    """Render ``index.html`` with the ML-generated questions.

    Loads every row of the module-level ``model_class`` into a DataFrame,
    generates the questions, stores the DataFrame under
    ``app.config['DATAFRAME']`` and passes the questions to the template as
    a JSON string, together with the row ID of the first question.
    """
    # Access dynamically created model
    data = model_class.query.all()
    # Extract relevant columns from the query result
    columns_to_include = [column.key for column in model_class.__table__.columns]
    data_dict_list = [{col: getattr(row, col) for col in columns_to_include} for row in data]
    # Convert the list of dictionaries to a DataFrame
    dataframe = pd.DataFrame(data_dict_list)
    # Load relationships configuration from JSON file
    relationships_config = load_relationships_config()
    # Generate questions using machine learning model
    questions_ml = generate_questions_ml(dataframe, relationships_config, model_class)
    # Store the dataframe in the Flask app context for access in other routes
    app.config['DATAFRAME'] = dataframe
    # The first question supplies the primary key (ID) shown initially.
    current_row_id = questions_ml[0]['data']['row_id'] if questions_ml else None

    def to_jsonable(value):
        """Fallback encoder for values ``json`` cannot serialise natively."""
        if hasattr(value, 'item'):
            # numpy scalars (e.g. int64 coming out of a DataFrame row)
            return value.item()
        if hasattr(value, 'isoformat'):
            # date / datetime / pandas Timestamp
            return value.isoformat()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    data_keys = (
        'row_id',
        'mainColumn',
        'mainValue',
        'relatedColumn',
        'relatedValue',
        'mainColumnDatatype',
        'relatedColumnDatatype',
    )
    # Convert the Python list to JSON using json.dumps with double quotes
    pre_rendered_questions = json.dumps(
        [
            {
                "name": question["name"],
                "message": question["message"],
                **{key: question['data'][key] for key in data_keys},
            }
            for question in questions_ml
        ],
        ensure_ascii=False,
        default=to_jsonable,
    )
    if current_row_id is not None:
        logging.debug("Current row ID: %s", current_row_id)
    return render_template('index.html', pre_rendered_questions=pre_rendered_questions, current_row_id=current_row_id)
@app.route('/update_dialog_values', methods=['POST'])
def update_dialog_values():
    """Insert or update one record of the clean table from a JSON request.

    Expected JSON keys: ``rowId``, ``mainColumn``, ``relatedColumn`` and,
    optionally, ``mainValue`` / ``relatedValue`` (values that are absent or
    ``None`` leave the column untouched).

    Returns:
        200 with the stored values on success, 400 when a column name is
        not a column of the clean table, 500 on any other failure (the
        session is rolled back).
    """
    logging.debug("Received request to '/update_dialog_values'")
    data = request.get_json()
    logging.debug(f"Request data: {data}")
    # Bound before the try block so the error handler can always use it,
    # even when reading the request payload itself fails.
    table_name = CLEAN_TABLE_NAME
    try:
        row_id = data['rowId']
        main_column_name = data['mainColumn']
        related_column_name = data['relatedColumn']
        clean_model = create_model_class(table_name)
        # Column names come from the client; only real table columns may be
        # handed to setattr().
        valid_columns = set(clean_model.__table__.columns.keys())
        if main_column_name not in valid_columns or related_column_name not in valid_columns:
            logging.error(f"Invalid column name(s): {main_column_name}, {related_column_name}")
            return jsonify({"status": "error", "message": "Invalid column name(s)"}), 400
        # Insert or update the record in db_clean
        record_clean = db.session.query(clean_model).filter_by(ID=row_id).first()
        if record_clean is None:
            record_clean = clean_model()
            setattr(record_clean, 'ID', row_id)  # Assuming 'ID' is the primary key column name
        # Update only the main and related columns in db_clean
        if data.get('mainValue') is not None:
            setattr(record_clean, main_column_name, data['mainValue'])
        if data.get('relatedValue') is not None:
            setattr(record_clean, related_column_name, data['relatedValue'])
        # Metadata for the clean table. These used to be assigned the
        # Ellipsis placeholder (`...`); the values below are the defaults
        # the /update_clean_data route already uses.
        record_clean.ValidationScore = 1.0
        record_clean.ValidatedAt = datetime.now()
        record_clean.ValidatorId = 1
        db.session.add(record_clean)  # This will insert or update the record
        db.session.commit()
        logging.info(f"Successfully updated row ID: {row_id} in {table_name}")
        response_data = {
            "status": "success",
            "message": f"Values updated successfully in {table_name}",
            "updatedValues": {
                "mainValue": data.get('mainValue'),
                "relatedValue": data.get('relatedValue'),
                "rowId": row_id
            }
        }
        logging.debug(f"Response data: {response_data}")
        return jsonify(response_data), 200
    except Exception as e:
        logging.error(f"Error updating values in {table_name}: {e}", exc_info=True)
        db.session.rollback()
        return jsonify({"status": "error", "message": "Failed to update values"}), 500
def update_row_in_database(row_id, main_value, related_value,
                           main_column='main_column', related_column='related_column'):
    """Update the main and related values of the row with ID ``row_id``.

    Args:
        row_id: Value matched against the ``ID`` column of the module-level
            ``model_class``.
        main_value: New value for the main column.
        related_value: New value for the related column.
        main_column: Attribute name that receives ``main_value``. The
            default keeps the earlier behaviour of writing to an attribute
            literally called ``main_column``; pass the real column name.
        related_column: Attribute name that receives ``related_value``
            (same remark as for ``main_column``).

    Prints a message and changes nothing when no row has that ID.
    """
    record = model_class.query.filter_by(ID=row_id).first()
    if not record:
        print(f"No record found with ID: {row_id}")
        return
    setattr(record, main_column, main_value)
    setattr(record, related_column, related_value)
    db.session.commit()
@app.route('/find_missing_values')
def find_missing_values():
    """Render ``index.html`` with questions about missing related values.

    Reads the messy table into a DataFrame, optionally restricts the
    questions to the ``current_row_id`` query parameter and hands the
    questions to the template as a JSON string.
    """
    messy_model = create_model_class(TABLE_NAME)
    records = messy_model.query.all()
    dataframe = pd.DataFrame([record.__dict__ for record in records])
    relationships_config = load_relationships_config()
    # Get the current row ID from the URL parameters
    current_row_id = request.args.get('current_row_id', None)
    # Print the current row ID to the console
    if current_row_id is not None:
        print(f"From App Find Missing Function: {current_row_id}")
    missing_questions = generate_missing_data_questions(dataframe, relationships_config, current_row_id)

    serialisable = []
    for question in missing_questions:
        details = question.get('data', {})
        serialisable.append({
            "name": question["name"],
            # Escape double quotes
            "message": question["message"].strip().replace('"', '\\"'),
            "data": {
                "row_id": details.get('row_id', None),
                "datatype": details.get('datatype', 'appropriate'),
                "mainColumn": details.get('mainColumn', None),
                "relatedColumn": details.get('relatedColumn', None),
            },
        })
    # Convert the Python list to JSON using json.dumps with double quotes
    pre_rendered_questions = json.dumps(serialisable, ensure_ascii=False)
    return render_template('index.html', pre_rendered_questions=pre_rendered_questions, current_row_id=current_row_id)
# Row ID of the most recent row for which a "missing" question was generated.
displayed_row_id_missing = None


def generate_missing_data_questions(dataframe, relationships_config, current_row_id):
    """Build a question for every missing related value.

    Args:
        dataframe: Table contents; must contain an ``ID`` column.
        relationships_config: List of mappings with ``main_column`` and
            ``related_column`` keys.
        current_row_id: When not ``None``, only the row with this ID is
            considered. The ID may arrive as a string (for example from a
            URL query parameter) and is compared by numeric value when
            possible, so ``"5"`` matches an integer ID ``5``.

    Returns:
        A list of question dicts (``type``, ``name``, ``message``,
        ``default``, ``data``), one per relationship whose related value is
        null or an empty string.

    Side effects:
        Stores the ID of the last row that produced a question in the
        module-level ``displayed_row_id_missing``.
    """
    global displayed_row_id_missing

    def same_row_id(row_id, wanted):
        if row_id == wanted:
            return True
        try:
            return float(row_id) == float(wanted)
        except (TypeError, ValueError):
            return str(row_id) == str(wanted)

    # determine_column_datatype scans the whole column; do it once per
    # column and only when a question is actually produced.
    datatype_cache = {}

    def column_datatype(column_name):
        if column_name not in datatype_cache:
            datatype_cache[column_name] = determine_column_datatype(dataframe[column_name])
        return datatype_cache[column_name]

    missing_questions = []
    for index, row in dataframe.iterrows():
        # Check if the row ID matches the current_row_id, if provided
        if current_row_id is not None and not same_row_id(row['ID'], current_row_id):
            continue
        for relationship in relationships_config:
            main_column = relationship['main_column']
            related_column = relationship['related_column']
            main_value = row[main_column]
            related_value = row[related_column]
            if not (pd.isnull(related_value) or related_value == ''):
                continue
            # The primary key column is assumed to be named 'ID'.
            row_id = row['ID']
            missing_questions.append({
                'type': 'confirm',
                'name': f'missing_{index}_{main_column}_{related_column}',
                'message': f"The {related_column}: _____ information for the {main_column}: {main_value} is missing. Please provide the missing answer.",
                'default': True,
                'data': {
                    'row_id': row_id,
                    'mainColumn': main_column,
                    'relatedColumn': related_column,
                    'datatype': column_datatype(related_column),
                },
            })
            # Store the displayed row ID
            displayed_row_id_missing = row_id
    return missing_questions
def is_date(string):
    """Return True when ``dateutil``'s ``parse`` accepts ``string`` as a date.

    Besides ``ValueError`` (unparseable text), ``parse`` can raise
    ``OverflowError`` for out-of-range numbers and ``TypeError`` for
    non-string input; all three mean "not a date" here.
    """
    try:
        parse(string)
    except (ValueError, OverflowError, TypeError):
        return False
    return True
def determine_column_datatype(column):
    """Guess the datatype of a column from its values.

    Args:
        column: pandas Series (anything with ``len()`` and ``dropna()``).

    Returns:
        ``'integer'``, ``'float'`` or ``'date'`` when more than 80% of all
        entries (missing ones included in the total) look like that type,
        checked in that order; otherwise ``'text'``. An empty column is
        ``'text'``.
    """
    total_count = len(column)
    if total_count == 0:
        # Nothing to inspect; also avoids dividing by zero below.
        return 'text'
    int_count = 0
    float_count = 0
    date_count = 0
    for value in column.dropna():  # Exclude NaN values
        if str(value).isdigit():
            int_count += 1
            continue
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError: objects such as dates that float() cannot take.
            pass
        else:
            float_count += 1
        if is_date(str(value)):
            date_count += 1
    if int_count / total_count > 0.8:
        return 'integer'
    if float_count / total_count > 0.8:
        return 'float'
    if date_count / total_count > 0.8:
        return 'date'
    return 'text'
def generate_flagged_value_questions(flagged_values, primary_keys, model_class, column_datatypes, relationships_config, dataframe):
    """Build correction questions for values flagged by the rules.

    Args:
        flagged_values: Flagged values; paired by position with
            ``primary_keys``.
        primary_keys: ``ID`` of the row each flagged value belongs to.
        model_class: Mapped class used to load the rows.
        column_datatypes: Mapping of column name to expected datatype.
        relationships_config: List of mappings with ``main_column`` and
            ``related_column`` keys.
        dataframe: Table contents, used to infer relationship datatypes.

    Returns:
        A list of dicts, one per row attribute equal to a flagged value.
        The relationship fields (``mainColumn`` ... ``relatedColumnDatatype``)
        describe the last configured relationship whose two columns exist
        on the row; they are ``None`` when there is no such relationship.
    """
    relationship_keys = (
        'mainColumn',
        'mainValue',
        'relatedColumn',
        'relatedValue',
        'mainColumnDatatype',
        'relatedColumnDatatype',
    )
    # determine_column_datatype scans a whole column; cache per column.
    datatype_cache = {}

    def column_datatype(column_name):
        if column_name not in datatype_cache:
            datatype_cache[column_name] = determine_column_datatype(dataframe[column_name])
        return datatype_cache[column_name]

    questions = []
    for flagged_value, primary_key in zip(flagged_values, primary_keys):
        row = model_class.query.filter_by(ID=primary_key).first()
        if not row:
            continue
        # Reset for every row so that nothing carries over from the
        # previous row and the fields are always defined.
        relationship_info = dict.fromkeys(relationship_keys)
        for relationship in relationships_config:
            main_column = relationship['main_column']
            related_column = relationship['related_column']
            if main_column in row.__dict__ and related_column in row.__dict__:
                relationship_info = {
                    'mainColumn': main_column,
                    'mainValue': getattr(row, main_column),
                    'relatedColumn': related_column,
                    'relatedValue': getattr(row, related_column),
                    'mainColumnDatatype': column_datatype(main_column),
                    'relatedColumnDatatype': column_datatype(related_column),
                }
        # Process flagged values
        for col_name, col_value in row.__dict__.items():
            if col_value != flagged_value:
                continue
            datatype = column_datatypes.get(col_name, 'unknown datatype')
            questions.append({
                "row_id": primary_key,
                "question": f"The value '{flagged_value}' in the '{col_name}' column is marked as flagged. Please provide an accurate value.",
                "rule_violation": f"This value is violating the rules of the database; the expected value should be of {datatype} datatype.",
                "col_name": col_name,
                "flagged_value": flagged_value,
                "datatype": datatype,
                **relationship_info,
            })
    return questions
@app.route('/show_flagged_values_questions')
def show_flagged_values_questions():
    """Return, as JSON, the questions for values flagged by the rules.

    Loads the messy table into a DataFrame, re-applies the rules to obtain
    the flagged values and responds with ``{"questions": [...]}``.
    """
    # Access dynamically created model
    records = model_class.query.all()
    # Extract relevant columns from the query result
    column_keys = [column.key for column in model_class.__table__.columns]
    as_dicts = []
    for record in records:
        as_dicts.append({key: getattr(record, key) for key in column_keys})
    # Convert the list of dictionaries to a DataFrame
    dataframe = pd.DataFrame(as_dicts)
    # Load relationships configuration from JSON file
    relationships_config = load_relationships_config()
    rule_results = apply_rules_to_database(TABLE_NAME)
    flagged_values, primary_keys, column_datatypes = rule_results
    print(f"Checking the values of flagged_values: {flagged_values} then we checked primary_keys: {primary_keys} after that we went to far and checked column_datatypes: {column_datatypes} ")
    questions = generate_flagged_value_questions(
        flagged_values,
        primary_keys,
        model_class,
        column_datatypes,
        relationships_config,
        dataframe,
    )
    print(f"HERE IS THE CHECK POINT: {flagged_values}")
    return jsonify({'questions': questions})
@app.route('/flagged_update_dialog_values', methods=['POST'])
def flagged_update_dialog_values():
    """Overwrite one flagged column value of a row in the messy table.

    Expected JSON keys: ``row_id``, ``flaggedColumn`` and ``flaggedValue``.

    Returns:
        200 on success, 400 for a missing/invalid row ID, a non-object
        body or an unknown column, 404 when the row does not exist, 500 on
        any other failure (the session is rolled back).
    """
    data = request.get_json()  # Extract data from the POST request
    print("Received data:", data)  # Debugging: Log the received data
    if not isinstance(data, dict):
        return jsonify({"status": "error", "message": "Request body must be a JSON object"}), 400
    row_id = data.get('row_id')
    if row_id is None or row_id == '':
        print("rowId is missing or empty")
        return jsonify({"status": "error", "message": "rowId is missing or invalid"}), 400
    try:
        row_id = int(row_id)
    except (TypeError, ValueError):
        # A malformed ID is a client error, not a server failure.
        print(f"Invalid row ID format: {row_id}")
        return jsonify({"status": "error", "message": "rowId is missing or invalid"}), 400
    column_name = data.get('flaggedColumn')  # Column that needs to be updated
    new_value = data.get('flaggedValue')  # New value for the flagged column
    # Debugging: Log the extracted values
    print("Row ID:", row_id)
    print("Column Name:", column_name)
    print("New Value:", new_value)
    # The column name comes from the client; only real table columns may be
    # handed to setattr().
    if column_name not in set(model_class.__table__.columns.keys()):
        print(f"Invalid column name: {column_name}")
        return jsonify({"status": "error", "message": "Invalid column name"}), 400
    try:
        # Fetch the record to update from the database using the row ID
        record = db.session.query(model_class).filter_by(ID=row_id).first()
        if not record:
            print(f"Record not found for Row ID: {row_id}")
            return jsonify({"status": "error", "message": "Record not found"}), 404
        # Dynamically update the specified column with the new value
        setattr(record, column_name, new_value)
        db.session.commit()  # Commit the changes to the database
        print(f"Successfully updated row {row_id}, column {column_name} with value {new_value}")
        return jsonify({"status": "success", "message": "Value updated successfully"}), 200
    except Exception as e:
        # Leave the session usable for the next request.
        db.session.rollback()
        print(f"Error updating values: {e}")
        return jsonify({"status": "error", "message": str(e)}), 500
@app.route('/log_current_row_id', methods=['POST'])
def log_current_row_id():
    """Log the ``row_id`` sent in the JSON body and acknowledge the request.

    Always responds with ``{"success": true}``; a body that is not a JSON
    object, or that has no ``row_id``, is simply not logged.
    """
    data = request.get_json()
    row_id = data.get('row_id', None) if isinstance(data, dict) else None
    if row_id is not None:
        # Log the row ID to the server console
        logging.debug("Current row ID: %s", row_id)
    return jsonify({'success': True})
# find_missing_values dialog box
@app.route('/process_answers', methods=['POST'])
def process_answers():
    """Store the user's answer for a missing related value.

    Expected JSON keys: ``rowId``, ``mainColumn``, ``relatedColumn``,
    ``answer`` and, optionally, ``datatype`` (defaults to ``'text'``).

    Returns:
        The response of ``updateDatabaseWithAnswer`` when the input is
        valid; 400 for a non-object body, a missing field, an invalid row
        ID, an unknown column or an answer that is not an integer although
        one is expected; 500 when the update itself raises.
    """
    data = request.get_json()  # This will parse JSON data from the request
    print(f"Missing Received data: {data}")
    if not isinstance(data, dict):
        return jsonify({"status": "error", "message": "Request body must be a JSON object"}), 400
    # Validate the presence of required fields
    required_fields = ['rowId', 'mainColumn', 'relatedColumn', 'answer']
    for field in required_fields:
        if field not in data:
            print(f"{field} is missing in the request data")
            return jsonify({"status": "error", "message": f"{field} is missing"}), 400
    try:
        row_id = int(data['rowId'])  # Convert row_id to int
    except (TypeError, ValueError):
        # TypeError covers JSON null / lists / objects sent as rowId.
        print(f"Invalid row ID format: {data['rowId']}")
        return jsonify({"status": "error", "message": "Invalid row ID format"}), 400
    main_column = data['mainColumn']
    related_column = data['relatedColumn']
    user_answer = data['answer']
    # Ensure columns are valid
    valid_columns = {column.name for column in inspect(model_class).columns}
    if main_column not in valid_columns or related_column not in valid_columns:
        print(f"Invalid column name(s): {main_column}, {related_column}")
        return jsonify({"status": "error", "message": "Invalid column name(s)"}), 400
    # Determine expected datatype for the related column
    expected_datatype = data.get('datatype', 'text')  # Default to 'text' if not specified
    print(f"Expected datatype: {expected_datatype}, Received answer: {user_answer}")
    # Convert user_answer to the expected data type
    if expected_datatype == 'integer':
        try:
            user_answer = int(user_answer)
        except (TypeError, ValueError):
            return jsonify({"status": "error", "message": "Invalid data type for answer. Expected integer."}), 400
    # Add additional checks and conversions for other datatypes as necessary.
    # Attempt to update the database with the converted user_answer
    try:
        return updateDatabaseWithAnswer(
            userQuery=user_answer,  # Use the converted user_answer
            model_class=model_class,
            rowId=row_id,  # Use the converted row_id
            relatedColumn=related_column,
            mainColumn=main_column,
            expected_datatype=expected_datatype
        )
    except Exception as e:
        print(f"Error updating the database: {e}")
        return jsonify({"status": "error", "message": str(e)}), 500  # Internal Server Error
def is_valid_integer(value):
    """Return True when ``int(value)`` succeeds, False otherwise.

    Values that ``int()`` rejects with ``TypeError`` (``None``, lists, ...)
    are reported as invalid instead of raising.
    """
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True
# Function to update the database with the provided answer
def updateDatabaseWithAnswer(userQuery, model_class, rowId, relatedColumn, mainColumn, expected_datatype):
    """Write ``userQuery`` into ``relatedColumn`` of the row ``rowId``.

    Args:
        userQuery: The answer to store.
        model_class: Mapped class of the table to update.
        rowId: Primary key of the row; converted with ``int()``.
        relatedColumn: Name of the attribute that receives the answer.
        mainColumn: Unused; kept for interface compatibility.
        expected_datatype: When ``'integer'`` the answer must convert to
            ``int`` and is stored as such.

    Returns:
        A ``(response, status)`` tuple: 200 on success, 400 when an integer
        was expected but not supplied, 404 when the table has no ``id``
        column (any casing) or the row does not exist, 500 on any other
        error.
    """
    try:
        # Convert rowId to integer if it's not already
        rowId = int(rowId)
        # Find the primary key column, whatever the casing of 'id' is
        primary_key_column = next((col for col in model_class.__table__.columns.keys() if col.lower() == 'id'), None)
        if not primary_key_column:
            print("Primary key column not found in the table.")
            return jsonify({"status": "error", "message": "Primary key column not found"}), 404
        # Attempt to convert userQuery to the correct datatype if necessary
        if expected_datatype == 'integer':
            if not is_valid_integer(userQuery):
                print(f"Value conversion error: {userQuery} is not a valid integer")
                return jsonify({"status": "error", "message": "Value must be an integer"}), 400
            userQuery = int(userQuery)
        # Add additional checks and conversions for other datatypes (float, date, etc.) as necessary.
        # Fetch the record through the column that was actually found,
        # rather than a hard-coded 'ID' attribute.
        record = model_class.query.filter(getattr(model_class, primary_key_column) == rowId).first()
        if not record:
            print(f"No record found for ID: {rowId}")
            return jsonify({"status": "error", "message": "Record not found"}), 404
        # Set the attribute value
        setattr(record, relatedColumn, userQuery)
        db.session.commit()
        print(f"Record updated: {rowId}, {relatedColumn}, {userQuery}")
        return jsonify({"status": "success"}), 200
    except ValueError as e:
        print(f"Value conversion error: {e}")
        return jsonify({"status": "error", "message": str(e)}), 500
    except Exception as e:
        # Leave the session usable after a failed flush/commit.
        db.session.rollback()
        print(f"Error updating the database: {e}")
        return jsonify({"status": "error", "message": str(e)}), 500
@app.route('/perform_database_operations')
def perform_database_operations():
    """Train the classification model on the current table contents.

    Returns a fixed confirmation string once training has finished.
    """
    with app.app_context():
        # Use the SQLAlchemy session from the 'db' instance
        data = model_class.query.all()
        # Read the mapped columns only. ``row.__dict__`` also contains
        # SQLAlchemy's ``_sa_instance_state`` bookkeeping object, which
        # would otherwise end up in the training text.
        column_keys = [column.key for column in model_class.__table__.columns]
        dataframe = pd.DataFrame([{key: getattr(row, key) for key in column_keys} for row in data])
        relationships_config = load_relationships_config()
        train_classification_model(dataframe, relationships_config)
    return 'Database operations performed successfully!'
@app.route('/update_clean_data', methods=['POST'])
def update_clean_data():
    """Store a validated main/related value pair in the clean table.

    Expected JSON keys: ``rowId``, ``mainColumn``, ``mainValue``,
    ``relatedColumn`` and ``relatedValue``. The record with that ID is
    created when it does not exist yet.

    Returns:
        200 with the stored values on success, 400 when a column name is
        not a column of the clean table, 500 on any other failure. Errors
        keep the same JSON body as before but now carry an error status
        code instead of 200, like the other update routes.
    """
    clean_model_class = create_model_class(CLEAN_TABLE_NAME)
    data = request.get_json()
    print("Received update request:", data)
    try:
        # Column names come from the client; only real table columns may be
        # handed to setattr().
        valid_columns = set(clean_model_class.__table__.columns.keys())
        if data['mainColumn'] not in valid_columns or data['relatedColumn'] not in valid_columns:
            return jsonify({'status': 'error', 'message': 'Invalid column name(s)'}), 400
        # Retrieve or create a new record for the clean table
        record = db.session.execute(
            db.select(clean_model_class).where(clean_model_class.ID == data['rowId'])
        ).scalar_one_or_none()
        if not record:
            record = clean_model_class(ID=data['rowId'])
        # Set the main and related column values
        setattr(record, data['mainColumn'], data['mainValue'])
        setattr(record, data['relatedColumn'], data['relatedValue'])
        # Set additional metadata columns
        record.ValidationScore = 1.0  # Example default value
        record.ValidatedAt = datetime.now()  # Current timestamp
        record.ValidatorId = 1  # Example validator ID
        db.session.add(record)
        db.session.commit()
        response_data = {
            'status': 'success',
            'message': 'Data stored successfully in the clean table',
            'mainValue': data['mainValue'],
            'relatedValue': data['relatedValue']
        }
        # Print the response data right before returning it
        print("**Another one is Sending response:", response_data)
        return jsonify(response_data)
    except Exception as e:
        db.session.rollback()
        return jsonify({'status': 'error', 'message': str(e)}), 500
if __name__ == "__main__":
    # Apply rules to the database during application startup
    with app.app_context():
        flagged_values, primary_keys, column_datatypes = apply_rules_to_database(TABLE_NAME)
        # Store flagged values, primary keys, and column datatypes in the Flask application config
        app.config['FLAGGED_VALUES'] = flagged_values
        app.config['PRIMARY_KEYS'] = primary_keys
        app.config['COLUMN_DATATYPES'] = column_datatypes
        # Print the flagged values with primary keys
        print("\nShowing From App.py Flagged Values with their Primary Keys:")
        # The two lists are parallel: entry i of one belongs to entry i of
        # the other.
        for value, primary_key in zip(flagged_values, primary_keys):
            print(f"ID: '{primary_key}' Value: '{value}'")
    # Run the Flask app
    app.run(debug=False, port=5001)