Merging with main

rawanmahdi committed Apr 5, 2024
2 parents 8098c38 + 2ef50d6 commit 0b1fa25
Showing 15 changed files with 6,125 additions and 769 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.html linguist-vendored
9 changes: 6 additions & 3 deletions .github/workflows/docker.yml
@@ -16,16 +16,19 @@ jobs:

- id: 'auth'
name: Authenticate to Google Cloud
uses: google-github-actions/auth@v0
uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.OWNER_SA_KEY }}

- name: Decode credentials.json
run: echo "${{ secrets.B64_ENCODED_KEY }}" | base64 --decode > credentials.json
run: echo "${{ secrets.B64_ENCODED_KEY }}" | base64 --decode > ./backend/credentials.json

- name: Install GCloud CLI
uses: google-github-actions/setup-gcloud@v0
uses: google-github-actions/setup-gcloud@v1

- name: List files in the backend directory
run: ls -l ./backend

- name: Build and Push Backend Docker Image
env:
GOOGLE_PROJECT: ${{ secrets.PROJECT_ID }}
8 changes: 4 additions & 4 deletions DEPLOYMENT_GUIDE.md
@@ -26,12 +26,12 @@ To deploy the application on the cloud using GKE, follow these steps:
2. `docker.yml` which builds the frontend & backend Docker image for the application and pushes it to the artifact registry on gcp.
3. `kubernetes.yml` which creates a kubernetes deployment and service for the application on GKE using the files in `/cloud-infra/k8s` dir.

3. The application will be deployed on GKE and accessible through the external IP provided by the Load Balancer service.
3. The application will be deployed **(might take some time for the status to turn green)** on GKE and accessible through the external IP provided by the Load Balancer service.

4. Go to [link](https://console.cloud.google.com/kubernetes/deployment/us-east1-b/automl-cluster/default/backend-deployment/overview?project=automateml&supportedpurview=project) and scroll down to view the external IP of the Load Balancer service.
4. Go to [link](https://console.cloud.google.com/kubernetes/deployment/us-east1-b/automl-cluster/default/backend-deployment/overview?project=automateml&supportedpurview=project) and scroll down to view the external IP of the Load Balancer service. (see image below 👇)

![alt text](image.png)

>Please note that it will take some time for the application to be fully functional.
>If you see any errors, try refreshing the page after some time.
>[!NOTE]
> You might have to set up some gcloud configurations to get this to work.
45 changes: 31 additions & 14 deletions backend/Dockerfile
@@ -1,14 +1,11 @@
# argument to be passed while running the docker build command
ARG CREDENTIALS_JSON

# Use the official Python 3.11 image from Docker Hub
FROM python:3.11.6-slim
# Stage 1: Install dependencies and build the app
FROM python:3.11.8-slim AS builder

# Set the working directory in the container
WORKDIR /app

# Install build dependencies
RUN apt-get update && apt-get install -y gcc musl-dev python3-dev
RUN apt-get update && apt-get install -y gcc g++ musl-dev python3-dev

# Install and cleanup
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -18,19 +15,39 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Copy the pyproject.toml and poetry.lock files into the container
COPY pyproject.toml poetry.lock ./
# Copy the pyproject.toml & poetry.lock files into the container (app dir)
COPY ./pyproject.toml ./poetry.lock* ./

# Create the requirements.txt file from the pyproject.toml file
RUN poetry export -f requirements.txt --output requirements.txt --without-hashes

# Install the dependencies
RUN poetry config virtualenvs.create false && poetry install --no-dev
# Install the Python packages
RUN pip install --no-cache-dir --upgrade -r ./requirements.txt

# Stage 2: Run the app
FROM python:3.11.8-slim AS runner

# argument to be passed while running the docker build command
ARG CREDENTIALS_JSON

# Copy the creds file and compute dir into the container
WORKDIR /app

# Copy the installed Python packages from the builder stage
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages

RUN pip uninstall -y uvicorn
RUN pip install uvicorn[standard]==0.23.2

# Copy the required files & dir into the container
COPY compute ./compute
COPY ${CREDENTIALS_JSON} ./
COPY big_query.py ./
COPY ${CREDENTIALS_JSON} ./

# create dir for the HTML files created during profiling
RUN mkdir tempHTML

# Copy the FastAPI application into the container
COPY big_query.py ./
COPY main.py ./

# Specify the command to run the FastAPI application using uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
Empty file removed backend/backend/__init__.py
Empty file.
32 changes: 11 additions & 21 deletions backend/compute/autoEDA.py
@@ -4,23 +4,28 @@
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

# main function to generate the EDA report
def profile(file_path):
df = pd.read_csv(file_path)
profile = ProfileReport(df, title="Profiling Report")
unique_filename = f"your_report_{uuid.uuid4()}.html"
profile.to_file(f"tempHTML/{unique_filename}")
return unique_filename

# legacy - not called by backend
def generate_eda_plots(df):
# Create a correlation matrix
corr_matrix = df.corr()

# Create variable distributions
variable_distributions = sns.pairplot(df)

return corr_matrix, variable_distributions
return corr_matrix


def analyze_csv_and_plot(file_path):
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Generate EDA plots
corr_matrix, variable_distributions = generate_eda_plots(df)
corr_matrix = generate_eda_plots(df)

# Display the plots (or save them to files if needed)
plt.show()
@@ -42,23 +47,8 @@ def generate_eda(file_path):

# Create variable distributions (pairplot) and save it as a PNG
plt.figure(figsize=(10, 10))
variable_distributions = sns.pairplot(df)

unique_filename = f"variable_distributions_{uuid.uuid4()}.png"
variable_distributions.savefig(f"tempImages/{unique_filename}")
plt.close()

return corr_matrix, unique_filename


# # Example: Provide the path to your CSV file
csv_file_path = (
r"C:/Users/Rawan Alamily/Downloads/GDSC/Auto-ML/backend/data/electric_cars1.csv"
)

# analyze_csv_and_plot(csv_file_path)

# generate_eda(r"/Users/abedm/Documents/repos/Auto-ML/backend/compute/data.csv")
return corr_matrix


def get_nulls(file_path):
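For reference, a minimal usage sketch (not part of this commit) of the new `profile()` helper added above in `backend/compute/autoEDA.py`. The import path, the example CSV path, and the local `tempHTML` directory handling are assumptions for illustration only.

```python
import os

from compute.autoEDA import profile  # as defined in this diff (assumed package layout)

# profile() writes its HTML report into tempHTML/, so the directory
# must exist before the call (the backend Dockerfile creates it with mkdir)
os.makedirs("tempHTML", exist_ok=True)

report_name = profile("data/example.csv")  # hypothetical input file
print(f"Profiling report written to tempHTML/{report_name}")
```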
43 changes: 35 additions & 8 deletions backend/compute/autoML.py
@@ -1,31 +1,59 @@
# from pycaret.classification import setup as classification_setup, compare_models as classification_compare_models, finalize_model
# from pycaret.regression import setup as regression_setup, compare_models as regression_compare_models, finalize_model

import pycaret.classification as pycaret_cl
from pycaret.classification import *
import pycaret.regression as pycaret_rg

import pandas as pd
import joblib
import uuid
import matplotlib.pyplot as plt
import os

def perform_classification(data, target_column):

pycaret_cl.setup(data = data, target = target_column)
clf1 = pycaret_cl.setup(data = data, target = target_column)

best_model = pycaret_cl.compare_models()

model_file_path = 'classification_model.pkl'
joblib.dump(best_model, model_file_path)

return best_model, model_file_path
#generating scoring/accuracy grid
dt = pycaret_cl.create_model('dt')
dt_results = pycaret_cl.pull()
scoring_grid_filename = os.path.join('tempData', f"scoring_grid_{uuid.uuid4()}.csv")

dt_results.to_csv(scoring_grid_filename, index=False)

#plotting model
lr = pycaret_cl.create_model('lr')
plot_filename = f"plot_{uuid.uuid4()}.png"
plot_model = pycaret_cl.plot_model(lr, plot='auc', save='tempData')

return best_model, model_file_path, scoring_grid_filename, plot_filename

def perform_regression(data, target_column):
#IMPLEMENT SAME FOR REGRESSION LATER

pycaret_rg.setup(data = data, target = target_column)
best_model = pycaret_rg.compare_models()

model_file_path = 'regression_model.pkl'
joblib.dump(best_model, model_file_path)

return best_model, model_file_path
#generating scoring/accuracy chart
dt = pycaret_rg.create_model('dt')
dt_results = pycaret_rg.pull()
scoring_grid_filename = f"scoring_grid_{uuid.uuid4()}.csv"
dt_results.to_csv(scoring_grid_filename, index=False)

#plotting model
lr = pycaret_rg.create_model('lr')
plot_filename = f"plot_{uuid.uuid4()}.png"
pycaret_rg.plot_model(lr, plot='auc', save=True, plot_name=plot_filename)

return best_model, model_file_path, scoring_grid_filename, plot_filename

def generate_model(data, target_column, task):

@@ -34,10 +62,9 @@ def generate_model(data, target_column, task):

if task == 'C':
perform_classification(df, target_column) # Call classification_setup() before classification_compare_models()
final_model, model_file_path = perform_classification(df, target_column)
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_classification(df, target_column)
elif task == 'R':
perform_regression(df, target_column) # Call regression_setup() before regression_compare_models()
final_model, model_file_path = perform_regression(df, target_column)

return final_model, model_file_path
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_regression(df, target_column)

return final_model, model_file_path, scoring_grid_filename, plot_filename
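A hedged sketch (not part of this commit) of the updated `generate_model()` contract: it now returns four values instead of two. The dataset path and target column name are hypothetical; the `tempData` directory is created up front because the classification path writes its scoring grid and plot there.

```python
import os
from io import BytesIO

from compute.autoML import generate_model  # as defined in this diff (assumed package layout)

# perform_classification() saves its scoring grid and AUC plot under tempData/
os.makedirs("tempData", exist_ok=True)

with open("data/example.csv", "rb") as f:  # hypothetical dataset
    byte_stream = BytesIO(f.read())

# task is 'C' for classification or 'R' for regression, per the diff
model, model_path, grid_csv, plot_png = generate_model(
    byte_stream, "label", "C"  # "label" is a hypothetical target column
)
print(model_path, grid_csv, plot_png)
```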
79 changes: 61 additions & 18 deletions backend/main.py
@@ -10,6 +10,7 @@
from fastapi import Request
from fastapi import HTTPException
from fastapi.responses import StreamingResponse, HTMLResponse
import shutil


# custom functions for EDA and AutoML
Expand All @@ -25,6 +26,7 @@
DATA_BUCKET = "automate-ml-datasets"
GRAPH_BUCKET = "automate_ml_graphs"
MODEL_BUCKET = "automl_gdsc_models"
ML_PLOT_BUCKET = "automl_gdsc_mlplot"
origins = ["*"]

app.add_middleware(
@@ -279,62 +281,103 @@ async def getProfile(fileName):
# return {}


# start the automl process
#start the automl process
@app.get("/api/generateModel")
async def getModel(fileName, column, task):
async def getModel(fileName, column,
task):
plot_filename = ""
scoreGridLines = ""
try:

temp_dir = 'tempData'
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)

storage_client = storage.Client.from_service_account_json("./credentials.json")

#retrieve data
data_bucket = storage_client.get_bucket(DATA_BUCKET)
blob = data_bucket.blob(f"{fileName}.csv")

byte_stream = BytesIO()
blob.download_to_file(byte_stream)
byte_stream.seek(0)

# producing model
model, model_file_path = generate_model(byte_stream, column, task)
#producing model
model, model_file_path, scoring_grid_filename, plot_filename = generate_model(byte_stream, column, task)

# upload model to model bucket
#upload model to model bucket
model_bucket = storage_client.get_bucket(MODEL_BUCKET)
model_blob = model_bucket.blob(f"{fileName}.pkl")
with open(model_file_path, "rb") as model_file:
model_blob.upload_from_file(
model_file, content_type="application/octet-stream"
)
model_blob.upload_from_file(model_file, content_type="application/octet-stream")

#put model into model bucket
bucket = storage_client.get_bucket(MODEL_BUCKET)
blob = bucket.blob(fileName)

#put score grid into plot bucket
scoring_grid_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
scoring_grid_blob = scoring_grid_bucket.blob(scoring_grid_filename)
with open(scoring_grid_filename, "rb") as scoring_grid_file:
scoring_grid_blob.upload_from_file(scoring_grid_file, content_type="text/csv")

#store scoring/accuracy grid
bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
blob = bucket.blob(scoring_grid_filename)

#convert it to csv and json
with blob.open("r") as f :
scoreGridLines = f.read()
scoreGridLines = str(scoreGridLines) if scoreGridLines else None
csv_reader = csv.DictReader(StringIO(scoreGridLines))
json_data = [row for row in csv_reader]

return fileName, column, task
#put plot into plot bucket
plot_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
plot_blob = plot_bucket.blob(plot_filename)
blob.content_type = 'image/png'
plot_blob.upload_from_filename("tempData/AUC.png")

#get the url of the plot
public_url = plot_blob.public_url

return {"scoring_grid": scoreGridLines, "json": json_data, "plot_model_url": public_url}

except Exception as e:
return {"error": f"An error occurred: {str(e)}"}

finally:
#Delete the temporary file
#if os.path.exists("tempImages/AUC.png"):
# os.remove("tempImages/AUC.png")
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)



# retreive the model and download it
#retreive the model and download it
@app.get("/api/downloadModel")
async def downloadModel():
try:
# action
#action
storage_client = storage.Client.from_service_account_json("./credentials.json")

# retreiving the data from bucket
#retreiving the data from bucket
bucket = storage_client.get_bucket(MODEL_BUCKET)
blobs = list(bucket.list_blobs())
blob = blobs[0]

byte_stream = BytesIO()
blob.download_to_file(byte_stream)
byte_stream.seek(0)

# remove it from the bucket
#remove it from the bucket
blob.delete()

return StreamingResponse(byte_stream, media_type="application/octet-stream")


except Exception as e:
return {"error": f"An error occurred: {str(e)}"}


# big query operations
@app.get("/api/bq")
async def bq(fileName, query=None):
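A hedged client-side sketch (not part of this commit) of the reworked `/api/generateModel` endpoint. The host, port, file name, and column name are assumptions; the port matches the uvicorn command in the backend Dockerfile.

```python
import requests

resp = requests.get(
    "http://localhost:8000/api/generateModel",  # assumed local uvicorn host/port
    params={"fileName": "electric_cars1", "column": "price", "task": "R"},  # hypothetical dataset/column
)
payload = resp.json()

# Per the diff, a successful response carries the raw scoring-grid CSV text,
# the same grid parsed into JSON rows, and a public URL for the saved plot.
print(payload.get("scoring_grid"))
print(payload.get("json"))
print(payload.get("plot_model_url"))
```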