Merging with main

rawanmahdi committed Apr 5, 2024
2 parents 8098c38 + 2ef50d6 commit 0b1fa25
Showing 15 changed files with 6,125 additions and 769 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.html linguist-vendored
9 changes: 6 additions & 3 deletions .github/workflows/docker.yml
@@ -16,16 +16,19 @@ jobs:

- id: 'auth'
name: Authenticate to Google Cloud
uses: google-github-actions/auth@v0
uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.OWNER_SA_KEY }}

- name: Decode credentials.json
run: echo "${{ secrets.B64_ENCODED_KEY }}" | base64 --decode > credentials.json
run: echo "${{ secrets.B64_ENCODED_KEY }}" | base64 --decode > ./backend/credentials.json

- name: Install GCloud CLI
uses: google-github-actions/setup-gcloud@v0
uses: google-github-actions/setup-gcloud@v1

- name: List files in the backend directory
run: ls -l ./backend

- name: Build and Push Backend Docker Image
env:
GOOGLE_PROJECT: ${{ secrets.PROJECT_ID }}
8 changes: 4 additions & 4 deletions DEPLOYMENT_GUIDE.md
@@ -26,12 +26,12 @@ To deploy the application on the cloud using GKE, follow these steps:
2. `docker.yml` which builds the frontend & backend Docker image for the application and pushes it to the artifact registry on gcp.
3. `kubernetes.yml` which creates a kubernetes deployment and service for the application on GKE using the files in `/cloud-infra/k8s` dir.

3. The application will be deployed on GKE and accessible through the external IP provided by the Load Balancer service.
3. The application will be deployed **(might take some time for the status to turn green)** on GKE and accessible through the external IP provided by the Load Balancer service.

4. Go to [link](https://console.cloud.google.com/kubernetes/deployment/us-east1-b/automl-cluster/default/backend-deployment/overview?project=automateml&supportedpurview=project) and scroll down to view the external IP of the Load Balancer service.
4. Go to [link](https://console.cloud.google.com/kubernetes/deployment/us-east1-b/automl-cluster/default/backend-deployment/overview?project=automateml&supportedpurview=project) and scroll down to view the external IP of the Load Balancer service. (see image below 👇)

![alt text](image.png)

>Please note that it will take some time for the application to be fully functional.
>If you see any errors, try refreshing the page after some time.
>[!NOTE]
> You might have to set up some gcloud configurations to get this to work.
45 changes: 31 additions & 14 deletions backend/Dockerfile
@@ -1,14 +1,11 @@
# argument to be passed while running the docker build command
ARG CREDENTIALS_JSON

# Use the official Python 3.11 image from Docker Hub
FROM python:3.11.6-slim
# Stage 1: Install dependencies and build the app
FROM python:3.11.8-slim AS builder

# Set the working directory in the container
WORKDIR /app

# Install build dependencies
RUN apt-get update && apt-get install -y gcc musl-dev python3-dev
RUN apt-get update && apt-get install -y gcc g++ musl-dev python3-dev

# Install and cleanup
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -18,19 +15,39 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Copy the pyproject.toml and poetry.lock files into the container
COPY pyproject.toml poetry.lock ./
# Copy the pyproject.toml & poetry.lock files into the container (app dir)
COPY ./pyproject.toml ./poetry.lock* ./

# Create the requirements.txt file from the pyproject.toml file
RUN poetry export -f requirements.txt --output requirements.txt --without-hashes

# Install the dependencies
RUN poetry config virtualenvs.create false && poetry install --no-dev
# Install the Python packages
RUN pip install --no-cache-dir --upgrade -r ./requirements.txt

# Stage 2: Run the app
FROM python:3.11.8-slim AS runner

# argument to be passed while running the docker build command
ARG CREDENTIALS_JSON

# Copy the creds file and compute dir into the container
WORKDIR /app

# Copy the installed Python packages from the builder stage
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages

RUN pip uninstall -y uvicorn
RUN pip install uvicorn[standard]==0.23.2

# Copy the required files & dir into the container
COPY compute ./compute
COPY ${CREDENTIALS_JSON} ./
COPY big_query.py ./
COPY ${CREDENTIALS_JSON} ./

# create dir for the HTML files created during profiling
RUN mkdir tempHTML

# Copy the FastAPI application into the container
COPY big_query.py ./
COPY main.py ./

# Specify the command to run the FastAPI application using uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
Empty file removed backend/backend/__init__.py
Empty file.
32 changes: 11 additions & 21 deletions backend/compute/autoEDA.py
@@ -4,23 +4,28 @@
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

# main function to generate the EDA report
def profile(file_path):
df = pd.read_csv(file_path)
profile = ProfileReport(df, title="Profiling Report")
unique_filename = f"your_report_{uuid.uuid4()}.html"
profile.to_file(f"tempHTML/{unique_filename}")
return unique_filename

# legacy - not called by backend
def generate_eda_plots(df):
# Create a correlation matrix
corr_matrix = df.corr()

# Create variable distributions
variable_distributions = sns.pairplot(df)

return corr_matrix, variable_distributions
return corr_matrix


def analyze_csv_and_plot(file_path):
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Generate EDA plots
corr_matrix, variable_distributions = generate_eda_plots(df)
corr_matrix = generate_eda_plots(df)

# Display the plots (or save them to files if needed)
plt.show()
@@ -42,23 +47,8 @@ def generate_eda(file_path):

# Create variable distributions (pairplot) and save it as a PNG
plt.figure(figsize=(10, 10))
variable_distributions = sns.pairplot(df)

unique_filename = f"variable_distributions_{uuid.uuid4()}.png"
variable_distributions.savefig(f"tempImages/{unique_filename}")
plt.close()

return corr_matrix, unique_filename


# # Example: Provide the path to your CSV file
csv_file_path = (
r"C:/Users/Rawan Alamily/Downloads/GDSC/Auto-ML/backend/data/electric_cars1.csv"
)

# analyze_csv_and_plot(csv_file_path)

# generate_eda(r"/Users/abedm/Documents/repos/Auto-ML/backend/compute/data.csv")
return corr_matrix


def get_nulls(file_path):
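For reference, a minimal usage sketch (not part of this commit) of the new `profile()` helper added above in `backend/compute/autoEDA.py`. The import path, the example CSV path, and the local `tempHTML` directory handling are assumptions for illustration only.

```python
import os

from compute.autoEDA import profile  # as defined in this diff (assumed package layout)

# profile() writes its HTML report into tempHTML/, so the directory
# must exist before the call (the backend Dockerfile creates it with mkdir)
os.makedirs("tempHTML", exist_ok=True)

report_name = profile("data/example.csv")  # hypothetical input file
print(f"Profiling report written to tempHTML/{report_name}")
```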
43 changes: 35 additions & 8 deletions backend/compute/autoML.py
@@ -1,31 +1,59 @@
# from pycaret.classification import setup as classification_setup, compare_models as classification_compare_models, finalize_model
# from pycaret.regression import setup as regression_setup, compare_models as regression_compare_models, finalize_model

import pycaret.classification as pycaret_cl
from pycaret.classification import *
import pycaret.regression as pycaret_rg

import pandas as pd
import joblib
import uuid
import matplotlib.pyplot as plt
import os

def perform_classification(data, target_column):

pycaret_cl.setup(data = data, target = target_column)
clf1 = pycaret_cl.setup(data = data, target = target_column)

best_model = pycaret_cl.compare_models()

model_file_path = 'classification_model.pkl'
joblib.dump(best_model, model_file_path)

return best_model, model_file_path
#generating scoring/accuracy grid
dt = pycaret_cl.create_model('dt')
dt_results = pycaret_cl.pull()
scoring_grid_filename = os.path.join('tempData', f"scoring_grid_{uuid.uuid4()}.csv")

dt_results.to_csv(scoring_grid_filename, index=False)

#plotting model
lr = pycaret_cl.create_model('lr')
plot_filename = f"plot_{uuid.uuid4()}.png"
plot_model = pycaret_cl.plot_model(lr, plot='auc', save='tempData')

return best_model, model_file_path, scoring_grid_filename, plot_filename

def perform_regression(data, target_column):
#IMPLEMENT SAME FOR REGRESSION LATER

pycaret_rg.setup(data = data, target = target_column)
best_model = pycaret_rg.compare_models()

model_file_path = 'regression_model.pkl'
joblib.dump(best_model, model_file_path)

return best_model, model_file_path
#generating scoring/accuracy chart
dt = pycaret_rg.create_model('dt')
dt_results = pycaret_rg.pull()
scoring_grid_filename = f"scoring_grid_{uuid.uuid4()}.csv"
dt_results.to_csv(scoring_grid_filename, index=False)

#plotting model
lr = pycaret_rg.create_model('lr')
plot_filename = f"plot_{uuid.uuid4()}.png"
pycaret_rg.plot_model(lr, plot='auc', save=True, plot_name=plot_filename)

return best_model, model_file_path, scoring_grid_filename, plot_filename

def generate_model(data, target_column, task):

@@ -34,10 +62,9 @@ def generate_model(data, target_column, task):

if task == 'C':
perform_classification(df, target_column) # Call classification_setup() before classification_compare_models()
final_model, model_file_path = perform_classification(df, target_column)
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_classification(df, target_column)
elif task == 'R':
perform_regression(df, target_column) # Call regression_setup() before regression_compare_models()
final_model, model_file_path = perform_regression(df, target_column)

return final_model, model_file_path
final_model, model_file_path, scoring_grid_filename, plot_filename = perform_regression(df, target_column)

return final_model, model_file_path, scoring_grid_filename, plot_filename
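A hedged sketch (not part of this commit) of the updated `generate_model()` contract: it now returns four values instead of two. The dataset path and target column name are hypothetical; the `tempData` directory is created up front because the classification path writes its scoring grid and plot there.

```python
import os
from io import BytesIO

from compute.autoML import generate_model  # as defined in this diff (assumed package layout)

# perform_classification() saves its scoring grid and AUC plot under tempData/
os.makedirs("tempData", exist_ok=True)

with open("data/example.csv", "rb") as f:  # hypothetical dataset
    byte_stream = BytesIO(f.read())

# task is 'C' for classification or 'R' for regression, per the diff
model, model_path, grid_csv, plot_png = generate_model(
    byte_stream, "label", "C"  # "label" is a hypothetical target column
)
print(model_path, grid_csv, plot_png)
```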
79 changes: 61 additions & 18 deletions backend/main.py
@@ -10,6 +10,7 @@
from fastapi import Request
from fastapi import HTTPException
from fastapi.responses import StreamingResponse, HTMLResponse
import shutil


# custom functions for EDA and AutoML
Expand All @@ -25,6 +26,7 @@
DATA_BUCKET = "automate-ml-datasets"
GRAPH_BUCKET = "automate_ml_graphs"
MODEL_BUCKET = "automl_gdsc_models"
ML_PLOT_BUCKET = "automl_gdsc_mlplot"
origins = ["*"]

app.add_middleware(
@@ -279,62 +281,103 @@ async def getProfile(fileName):
# return {}


# start the automl process
#start the automl process
@app.get("/api/generateModel")
async def getModel(fileName, column, task):
async def getModel(fileName, column,
task):
plot_filename = ""
scoreGridLines = ""
try:

temp_dir = 'tempData'
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)

storage_client = storage.Client.from_service_account_json("./credentials.json")

#retrieve data
data_bucket = storage_client.get_bucket(DATA_BUCKET)
blob = data_bucket.blob(f"{fileName}.csv")

byte_stream = BytesIO()
blob.download_to_file(byte_stream)
byte_stream.seek(0)

# producing model
model, model_file_path = generate_model(byte_stream, column, task)
#producing model
model, model_file_path, scoring_grid_filename, plot_filename = generate_model(byte_stream, column, task)

# upload model to model bucket
#upload model to model bucket
model_bucket = storage_client.get_bucket(MODEL_BUCKET)
model_blob = model_bucket.blob(f"{fileName}.pkl")
with open(model_file_path, "rb") as model_file:
model_blob.upload_from_file(
model_file, content_type="application/octet-stream"
)
model_blob.upload_from_file(model_file, content_type="application/octet-stream")

#put model into model bucket
bucket = storage_client.get_bucket(MODEL_BUCKET)
blob = bucket.blob(fileName)

#put score grid into plot bucket
scoring_grid_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
scoring_grid_blob = scoring_grid_bucket.blob(scoring_grid_filename)
with open(scoring_grid_filename, "rb") as scoring_grid_file:
scoring_grid_blob.upload_from_file(scoring_grid_file, content_type="text/csv")

#store scoring/accuracy grid
bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
blob = bucket.blob(scoring_grid_filename)

#convert it to csv and json
with blob.open("r") as f :
scoreGridLines = f.read()
scoreGridLines = str(scoreGridLines) if scoreGridLines else None
csv_reader = csv.DictReader(StringIO(scoreGridLines))
json_data = [row for row in csv_reader]

return fileName, column, task
#put plot into plot bucket
plot_bucket = storage_client.get_bucket(ML_PLOT_BUCKET)
plot_blob = plot_bucket.blob(plot_filename)
blob.content_type = 'image/png'
plot_blob.upload_from_filename("tempData/AUC.png")

#get the url of the plot
public_url = plot_blob.public_url

return {"scoring_grid": scoreGridLines, "json": json_data, "plot_model_url": public_url}

except Exception as e:
return {"error": f"An error occurred: {str(e)}"}

finally:
#Delete the temporary file
#if os.path.exists("tempImages/AUC.png"):
# os.remove("tempImages/AUC.png")
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)



# retreive the model and download it
#retreive the model and download it
@app.get("/api/downloadModel")
async def downloadModel():
try:
# action
#action
storage_client = storage.Client.from_service_account_json("./credentials.json")

# retreiving the data from bucket
#retreiving the data from bucket
bucket = storage_client.get_bucket(MODEL_BUCKET)
blobs = list(bucket.list_blobs())
blob = blobs[0]

byte_stream = BytesIO()
blob.download_to_file(byte_stream)
byte_stream.seek(0)

# remove it from the bucket
#remove it from the bucket
blob.delete()

return StreamingResponse(byte_stream, media_type="application/octet-stream")


except Exception as e:
return {"error": f"An error occurred: {str(e)}"}


# big query operations
@app.get("/api/bq")
async def bq(fileName, query=None):
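A hedged client-side sketch (not part of this commit) of the reworked `/api/generateModel` endpoint. The host, port, file name, and column name are assumptions; the port matches the uvicorn command in the backend Dockerfile.

```python
import requests

resp = requests.get(
    "http://localhost:8000/api/generateModel",  # assumed local uvicorn host/port
    params={"fileName": "electric_cars1", "column": "price", "task": "R"},  # hypothetical dataset/column
)
payload = resp.json()

# Per the diff, a successful response carries the raw scoring-grid CSV text,
# the same grid parsed into JSON rows, and a public URL for the saved plot.
print(payload.get("scoring_grid"))
print(payload.get("json"))
print(payload.get("plot_model_url"))
```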