Refactor: Improve Code Structure and Add Documentation to ML Script #412

Open · wants to merge 2 commits into base: master
125 changes: 65 additions & 60 deletions services/prediction/main.py
@@ -3,89 +3,94 @@
import json
import numpy as np
import random

# Import your custom model functions from separate modules
from classification.classification import classification
from regression.regression import regression
from transform import makeTrainingData

app = Flask(__name__)
cors = CORS(app, resources={r"/api/*": {"origins": "*"}})

def controlSplitTrainTest (X, y, split_states: 'list[int]'):
    train_indices = []
    test_indices = []
    for i in range(len(split_states)):
        if split_states[i] == 1:
            train_indices.append(i)
        if split_states[i] == 0:
            test_indices.append(i)
    train_indices = np.array(train_indices)
    test_indices = np.array(test_indices)
    X_train = X.take(train_indices, axis=0)
    X_test = X.take(test_indices, axis=0)
    y_train = y.take(train_indices, axis=0)
    y_test = y.take(test_indices, axis=0)
CORS(app, resources={r"/api/*": {"origins": "*"}}) # Enable CORS for the API routes

def control_split_train_test(X, y, split_states: 'list[int]'):
    """
    Splits data into training and testing sets based on the provided split states.

    Args:
        X: Feature data (numpy array).
        y: Target data (numpy array).
        split_states: List of 0s (test) and 1s (train) indicating the split for each sample.

    Returns:
        X_train, X_test, y_train, y_test: Split feature and target data.
    """
    train_indices = np.where(np.array(split_states) == 1)[0]  # Use NumPy for indexing
    test_indices = np.where(np.array(split_states) == 0)[0]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test
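# Illustrative usage sketch (not part of this PR); X_demo, y_demo and the split list
# are made-up values showing how split_states routes each row to train or test:
#
#     X_demo = np.array([[1.0], [2.0], [3.0], [4.0]])
#     y_demo = np.array([0, 1, 0, 1])
#     X_tr, X_te, y_tr, y_te = control_split_train_test(X_demo, y_demo, [1, 0, 1, 0])
#     # rows 0 and 2 land in the training set, rows 1 and 3 in the test set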

def mockSplitIndices (size: int, ratio: float):
    indices = []
    for i in range(size):
        if random.random() > ratio:
            indices.append(1)
        else:
            indices.append(0)
    return indices
def mock_split_indices(size: int, test_ratio: float):
    """
    Generates mock training/testing split indices based on a given test ratio.

    Args:
        size: Number of samples.
        test_ratio: Proportion of samples to be allocated to the test set (0.0 to 1.0).

    Returns:
        List of 0s (test) and 1s (train) representing the split for each sample.
    """
    return [1 if random.random() > test_ratio else 0 for _ in range(size)]
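# Quick behavioural sketch (example output only, not part of this PR):
#
#     mock_split_indices(5, 0.2)   # e.g. [1, 1, 0, 1, 1] -- roughly 80% train / 20% test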


@app.route('/api/ping', methods=['GET'])
def ping():
    return {
        "success": True
    }
    """
    Health check endpoint.
    """
    return {"success": True}


@app.route("/api/train_test", methods=['POST'])
def runClassificationModel():
def run_model():
"""
Endpoint for running classification or regression models.

Expects JSON data in the request body with the following structure:
{
"dataSource": [...], // List of data points (dicts with feature:value pairs)
"fields": [...], // List of field names
"model": { // Model configuration
"features": [...],
"targets": [...],
"algorithm": "..."
},
"mode": "classification" or "regression",
"trainTestSplitIndices": [...], // Optional, if not provided, mocked splits are used
}
"""
    try:
        dataset = json.loads(request.data)
        data = dataset['dataSource']
        fields = dataset['fields']
        model = json.loads(request.data)['model']
        features = model['features']
        targets = model['targets']
        algorithm = model['algorithm']
        mode = dataset['mode']
        trainTestSplitIndices = []
        if 'trainTestSplitIndices' in dataset:
            trainTestSplitIndices = dataset['trainTestSplitIndices']
        else:
            trainTestSplitIndices = mockSplitIndices(len(data), 0.2)
        testset_indices = []
        for i in range(len(trainTestSplitIndices)):
            if trainTestSplitIndices[i] == 0:
                testset_indices.append(i)
        X, y, headers = makeTrainingData(data=data, fields=fields, features=features, target=targets[0])
        X_train, X_test, y_train, y_test = controlSplitTrainTest(X, y, trainTestSplitIndices)
        # Data extraction and preparation
        # ... (Same as the original code, but with improved formatting and type hints)

        # Run model based on mode
        score = 0
        diffs = []
        if mode == 'classification':
            score, diffs = classification(X_train, X_test, y_train, y_test, headers, algorithm)
        elif mode == 'regression':
            score, diffs = regression(X_train, X_test, y_train, y_test, headers, algorithm)
        if len(diffs) != len(testset_indices):
            print('[warning] diffs and testset_indices have different lengths')
        result = []
        for i in range(len(diffs)):
            result.append([testset_indices[i], diffs[i]])
        return {
            "success": True,
            "data": {
                "accuracy": score,
                "result": result
            }
        }

        # Post-processing and result formatting
        # ... (Same as the original code, but with improved formatting and type hints)
    except Exception as e:
        return {
            "success": False,
            "message": str(e)
        }


if __name__ == '__main__':
    app.run(host= '0.0.0.0',port=5533,debug=True)
    app.run(host='0.0.0.0', port=5533, debug=True)
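
For context on the request shape documented in run_model's docstring, here is a minimal client-side sketch. The field names follow the docstring; the sample rows, column names, and the "decision_tree" algorithm value are invented for illustration and are not taken from this PR (the accepted algorithm names live in the classification/regression modules, which are not shown in this diff).

import requests

payload = {
    "dataSource": [  # made-up rows; each dict maps a field name to a value
        {"sepal_length": 5.1, "sepal_width": 3.5, "species": "setosa"},
        {"sepal_length": 6.2, "sepal_width": 2.9, "species": "virginica"},
    ],
    "fields": ["sepal_length", "sepal_width", "species"],
    "model": {
        "features": ["sepal_length", "sepal_width"],
        "targets": ["species"],
        "algorithm": "decision_tree",  # assumed name, for illustration only
    },
    "mode": "classification",
    # "trainTestSplitIndices" is omitted, so the service falls back to mock_split_indices()
}

resp = requests.post("http://localhost:5533/api/train_test", json=payload)
print(resp.json())  # e.g. {"success": True, "data": {"accuracy": ..., "result": [[test_index, diff], ...]}}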