This repository has been archived by the owner on May 29, 2024. It is now read-only.

[Feature] - Dockerizing applications and kubeflow training operator #6

Merged: 32 commits merged on Apr 28, 2024
Commits (32)
7848de2
consider ACR resource as part of Azure rg and resource deployment
MGTheTrain Apr 27, 2024
46ddb1f
add bullet point to the feature list, document change and update work…
MGTheTrain Apr 27, 2024
1ecd4f1
Formatted hcl project with terraform fmt
github-actions[bot] Apr 27, 2024
bd5f7fc
dockerize apps
MGTheTrain Apr 27, 2024
45f3ce9
Merge branch 'feature/helm-charts' of github.com:MGTheTrain/ml-ops-ft…
MGTheTrain Apr 27, 2024
d43c3d2
generate docs from hcl files
MGTheTrain Apr 27, 2024
e5bdc9f
copy requirements.txt
MGTheTrain Apr 27, 2024
d874700
check bullet point and document changes
MGTheTrain Apr 27, 2024
78342a4
add bullet point
MGTheTrain Apr 27, 2024
da6f574
rephrase
MGTheTrain Apr 27, 2024
d409cc5
add whitespace
MGTheTrain Apr 27, 2024
dad7246
update default location
MGTheTrain Apr 28, 2024
f0c9075
Formatted hcl project with terraform fmt
github-actions[bot] Apr 28, 2024
90386f3
update default location
MGTheTrain Apr 28, 2024
b995cf4
solve merge conflicts
MGTheTrain Apr 28, 2024
43edb91
update default product affix for acr module
MGTheTrain Apr 28, 2024
edcd4b2
update default location
MGTheTrain Apr 28, 2024
2724fc1
move modules
MGTheTrain Apr 28, 2024
6080ee6
rename option
MGTheTrain Apr 28, 2024
895bf37
update default input value
MGTheTrain Apr 28, 2024
51eca14
rename GH action
MGTheTrain Apr 28, 2024
8aace85
update terraform storage backend values
MGTheTrain Apr 28, 2024
8265278
replace image
MGTheTrain Apr 28, 2024
3653cd4
update Getting started subsections
MGTheTrain Apr 28, 2024
8f154cf
replace image
MGTheTrain Apr 28, 2024
f41d1d4
rename image
MGTheTrain Apr 28, 2024
bddda03
update/add code blocks to jupyter notebooks
MGTheTrain Apr 28, 2024
864ce43
add image and update Getting Started subsection
MGTheTrain Apr 28, 2024
028a73f
add remark
MGTheTrain Apr 28, 2024
9c65ed4
finalize markdowns
MGTheTrain Apr 28, 2024
9a91e35
install functioning kubeflow training operator installation
MGTheTrain Apr 28, 2024
62ddead
final update
MGTheTrain Apr 28, 2024
39 changes: 21 additions & 18 deletions .github/workflows/terraform.yml
@@ -1,13 +1,13 @@
name: Workflow for deploying and destroying an ACR or AKS, as well as installing or uninstalling Helm charts on those Kubernetes clusters
name: Workflow for deploying and destroying an ACR or AKS with ACR, as well as installing or uninstalling Helm charts on those Kubernetes clusters

on:
workflow_dispatch:
inputs:
INFRASTRUCTURE_OPERATIONS:
type: choice
options: [ 'storage-account-backend-deploy', 'k8s-service-deploy', 'k8s-service-destroy', 'ml-ops-tools-install', 'ml-ops-tools-uninstall' ]
options: [ 'storage-backends-deploy', 'k8s-service-deploy', 'k8s-service-destroy', 'ml-ops-tools-install', 'ml-ops-tools-uninstall' ]
default: k8s-service-deploy
description: 'Infrastructure operations: [ storage-account-backend-deploy, k8s-service-deploy, k8s-service-destroy, ml-ops-tools-install, ml-ops-tools-uninstall ]'
description: 'Infrastructure operations: [ storage-backends-deploy, k8s-service-deploy, k8s-service-destroy, ml-ops-tools-install, ml-ops-tools-uninstall ]'
ML_OPS_TOOL:
type: choice
options: [ 'kubeflow', 'mlflow' ]
@@ -26,9 +26,9 @@ env:
ARM_CLIENT_SECRET: '${{ secrets.ARM_CLIENT_SECRET }}'

jobs:
deploy-tf-backend:
deploy-storage-backends:
runs-on: ubuntu-latest
if: ${{ github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'storage-account-backend-deploy' }}
if: ${{ github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'storage-backends-deploy' }}
steps:
- name: Checkout code
uses: actions/checkout@master
@@ -37,7 +37,7 @@ jobs:
- name: Terraform Init
run: terraform init # should only be deployed once. Make sure to destroy the ACR manually in the Azure Portal Web UI
working-directory: ./devops/terraform
- name: Deploy Storage Account backend
- name: Deploy Storage backends
run: terraform apply --auto-approve
continue-on-error: false
working-directory: ./devops/terraform
@@ -50,13 +50,13 @@
- name: Set up Terraform
uses: hashicorp/setup-terraform@v3
if: ${{ github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'k8s-service-deploy' || github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'k8s-service-destroy' }}
- name: Terraform Init # requires a Storage Account backend deployed through the storage-account-backend-deploy workflow step
- name: Terraform Init # requires a Storage Account backend deployed through the storage-backends-deploy workflow step
run: |
terraform init \
-backend-config="subscription_id=${{ env.ARM_SUBSCRIPTION_ID }}" \
-backend-config="storage_account_name=gftfbesbxsa001" \
-backend-config="resource_group_name=gftfbe-sbx-rg001" \
-backend-config="container_name=gftfbesbxsac001" \
-backend-config="storage_account_name=mlopsftwsasbxsa001" \
-backend-config="resource_group_name=mlopsftwsa-sbx-rg001" \
-backend-config="container_name=mlopsftwsasbxsac001" \
-backend-config="key=sbx-k8s-service-deployment/terraform.tfstate"
working-directory: ./terraform/envs/${{ github.event.inputs.ENVIRONMENT }}-k8s-deployment
if: ${{ github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'k8s-service-deploy' || github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'k8s-service-destroy' }}
@@ -84,17 +84,17 @@ jobs:
run: |
terraform init \
-backend-config="subscription_id=${{ env.ARM_SUBSCRIPTION_ID }}" \
-backend-config="storage_account_name=gftfbesbxsa001" \
-backend-config="resource_group_name=gftfbe-sbx-rg001" \
-backend-config="container_name=gftfbesbxsac001" \
-backend-config="storage_account_name=mlopsftwsasbxsa001" \
-backend-config="resource_group_name=mlopsftwsa-sbx-rg001" \
-backend-config="container_name=mlopsftwsasbxsac001" \
-backend-config="key=sbx-k8s-service-deployment/terraform.tfstate"

terraform output aks_kube_config_list | awk '/^ apiVersion:/,/^ EOT,$/' | sed 's/^ //' > ./config
mkdir -vp ~/.kube
head -n -3 ./config > ~/.kube/config
cat ~/.kube/config
working-directory: ./terraform/envs/${{ github.event.inputs.ENVIRONMENT }}-k8s-deployment
- name: Install Kustomize and Kubeflow
- name: Install Kustomize, Kubeflow and Kubeflow Training Operator
run: |
sudo apt-get install -y git

@@ -118,14 +118,17 @@
kubectl get pods -n knative-serving
kubectl get pods -n kubeflow
kubectl get pods -n kubeflow-user-example-com

# Install Kubeflow Training Operator
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0"
if: ${{ ((github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'ml-ops-tools-install' || github.event.inputs.INFRASTRUCTURE_OPERATIONS == 'k8s-service-deploy') && github.event.inputs.ML_OPS_TOOL == 'kubeflow') }}
- name: Terraform Init # requires a Storage Account backend deployed through the storage-account-backend-deploy workflow step
- name: Terraform Init # requires a Storage Account backend deployed through the storage-backends-deploy workflow step
run: |
terraform init \
-backend-config="subscription_id=${{ env.ARM_SUBSCRIPTION_ID }}" \
-backend-config="storage_account_name=gftfbesbxsa001" \
-backend-config="resource_group_name=gftfbe-sbx-rg001" \
-backend-config="container_name=gftfbesbxsac001" \
-backend-config="storage_account_name=mlopsftwsasbxsa001" \
-backend-config="resource_group_name=mlopsftwsa-sbx-rg001" \
-backend-config="container_name=mlopsftwsasbxsac001" \
-backend-config="key=sbx-k8s-service-configuration/terraform.tfstate"
working-directory: ./terraform/envs/${{ github.event.inputs.ENVIRONMENT }}-k8s-configuration
- name: Install helm charts
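As a quick sanity check after this workflow has run, the Training Operator deployment and its CRDs can be verified from a shell whose kubeconfig points at the freshly deployed AKS cluster. A minimal sketch, assuming the standalone overlay's default `kubeflow` namespace:

```sh
# Verify the Kubeflow Training Operator rollout and the CRDs it registers
kubectl get deployment training-operator -n kubeflow
kubectl get crds | grep kubeflow.org   # tfjobs, pytorchjobs, etc.
```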
7 changes: 5 additions & 2 deletions CHANGELOG.md
@@ -13,8 +13,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [Feature] Deployment of Azure Kubernetes Service (AKS) clusters
- [Feature] kubeflow operator or mlflow helm chart installations in deployed AKS clusters
- [Feature] CD workflow for on-demand AKS deployments and kubeflow operator or mlflow helm chart installations
- [Feature] CD workflow for on-demand AKS cluster deployments and kubeflow operator or mlflow helm chart installations.
- [Feature] CD workflow for on-demand deployments of an Azure Storage Account Container **(For storing terraform state files)**
- [Feature] Added `devcontainer.json` with necessary tooling for local development
- [Feature] Simple feedforward neural network with MNIST dataset to map input images to their corresponding digit classes
- [Feature] CNN architecture training considering COCO dataset for image classification AI applications (**NOTE:** Compute and storage intensive. Read `Download the COCO dataset images` comments on preferred hardware specs)
- [Feature] CNN architecture training considering COCO dataset for image classification AI applications (**NOTE:** Compute and storage intensive. Read `Download the COCO dataset images` comments on preferred hardware specs)
- [Feature] CD workflow for on-demand Azure Container Registry deployments in order to store internal Docker images.
- [Feature] Dockerizing Python (pytorch or tensorflow) applications for ML training and inference
- [Feature] Installation of the [Training Operator for CRDs](https://github.com/kubeflow/training-operator) and applying sample [TFJob and PyTorchJob](https://www.kubeflow.org/docs/components/training/overview/) k8s manifests
36 changes: 28 additions & 8 deletions README.md
@@ -23,15 +23,19 @@ Repository showcasing ML Ops practices with kubeflow and mlflow
- [x] kubeflow operator or mlflow helm chart installations in deployed AKS clusters
- [x] CD workflow for on-demand AKS deployments and kubeflow operator or mlflow helm chart installations
- [x] CD workflow for on-demand deployments of an Azure Storage Account Container **(For storing terraform state files)**
- [x] CD workflow for on-demand Azure Container Registry deployments in order to store internal Docker images.
- [ ] ~~CI workflow for building internal docker images and uploading those to an Azure Container Registry~~
- [ ] ~~CD workflows for internal helm chart installations in deployed AKS clusters~~
- [x] Added `devcontainer.json` with necessary tooling for local development
- [x] Python (pytorch or tensorflow) application for ML training and inference purposes and Jupyter notebooks
- [x] Simple feedforward neural network with MNIST dataset to map input images to their corresponding digit classes
- [x] CNN architecture training and inference considering COCO dataset for image classification AI applications (**NOTE:** Compute and storage intensive. Read `Download the COCO dataset images` comments on preferred hardware specs)
- [ ] ~~(**OPTIONAL**) Transformer architecture training considering pre-trained models for chatbot AI applications~~
- [ ] Dockerizing Python (pytorch or tensorflow) applications for ML training and inference
- [ ] Helm charts with K8s manifests for ML jobs considering the [Training Operator for CRDs](https://github.com/kubeflow/training-operator)
- [ ] Demonstration of model training and model deployment through automation workflows
- [ ] (**OPTIONAL**) mlflow experiments for the machine learning lifecycle
- [x] Dockerizing Python (pytorch or tensorflow) applications for ML training and inference
- [ ] ~~Helm charts with K8s manifests for ML jobs considering the [Training Operator for CRDs](https://github.com/kubeflow/training-operator)~~
- [x] Installation of the [Training Operator for CRDs](https://github.com/kubeflow/training-operator) and applying sample [TFJob and PyTorchJob](https://www.kubeflow.org/docs/components/training/overview/) k8s manifests
- [x] Demonstration of model training and model deployment through automation workflows
- [ ] ~~(**OPTIONAL**) mlflow experiments for the machine learning lifecycle~~

## Getting started

@@ -45,7 +49,7 @@ Repository showcasing ML Ops practices with kubeflow and mlflow

### Deploy an AKS cluster and install the kubeflow or mlflow components

0. Deploy an AKS through the [terraform.yml workflow](https://github.com/MGTheTrain/ml-ops-ftw/actions/workflows/terraform.yml) by selecting the `INFRASTRUCTURE_OPERATIONS` option `k8s-service-deploy`.
0. Deploy an AKS through the [terraform.yml workflow](https://github.com/MGTheTrain/ml-ops-ftw/actions/workflows/terraform.yml) by selecting the `INFRASTRUCTURE_OPERATIONS` option `k8s-service-deploy`. An Azure Container Registry will be part of the deployment in order to store internal Docker images.
1. **Optional:** Install ML Ops tools on an existing Kubernetes cluster through the [terraform.yml workflow](https://github.com/MGTheTrain/ml-ops-ftw/actions/workflows/terraform.yml) by selecting the `INFRASTRUCTURE_OPERATIONS` option `ml-ops-tools-install`
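As an alternative to the Actions web UI, the same workflow_dispatch can be triggered from a terminal. A minimal sketch, assuming the GitHub CLI is authenticated against this repository and that `sbx` is the intended `ENVIRONMENT` value:

```sh
# Trigger the terraform.yml workflow with explicit inputs
gh workflow run terraform.yml -R MGTheTrain/ml-ops-ftw \
  -f INFRASTRUCTURE_OPERATIONS=k8s-service-deploy \
  -f ML_OPS_TOOL=kubeflow \
  -f ENVIRONMENT=sbx
```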

**NOTE:**
@@ -69,9 +73,9 @@ and visit in a browser of choice `localhost:8080`.

![kubeflow-dashboard](./images/kubeflow-dashboard.PNG)
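For reference, with the upstream Kubeflow manifests the dashboard is usually exposed via a port-forward of the Istio ingress gateway before visiting `localhost:8080`. A sketch, assuming the default gateway service and namespace from the manifests install:

```sh
# Expose the Kubeflow central dashboard on localhost:8080
kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80
```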

#### CNN architecture training considering pre-trained models for image classification AI applications
#### Jupyter notebooks

**NOTE:** When creating the Jupyter notebook instance consider the following data volume:
When creating the Jupyter notebook instance consider the following data volume:

![Jupyter instance data volume](./images/jupyter-instance-data-volume.PNG)

@@ -83,6 +87,10 @@ The Jupyter instance that was created appears as follows:

![Created Jupyter instance](./images/created-jupyter-instance.PNG)

**NOTE:** You can check the status of the Jupyter instance pods:

![Check jupyter instance pods](./images/check-jupyter-instance-pod.PNG)
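A command-line equivalent of that check might look as follows, assuming the notebook server lands in the default example user namespace created by the Kubeflow install above:

```sh
# List the notebook server pods in the per-user namespace
kubectl get pods -n kubeflow-user-example-com
```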

Once `CONNECTED` to a Jupyter instance, clone this Git repository (HTTPS URL: `https://github.com/MGTheTrain/ml-ops-ftw.git`):

![Clone git repository](./images/clone-git-repository.PNG)
@@ -91,8 +99,20 @@ You should then have the repository cloned in your workspace:

![Cloned git repository in jupyter instance](./images/cloned-git-repository-in-jupyter-instance.PNG)

TBD
Execute a [Jupyter notebook](./notebooks/) to either train the model or perform inference (it's preferable to begin with [mnist-trainnig.ipynb](./notebooks/mnist-trainnig.ipynb); the others are either resource-intensive or not yet implemented):

![Run jupyter notebook example](./images/run-jupyter-notebook-example.PNG)

#### Applying TFJob or PyTorchJob k8s manifests

After successful installation of the Kubeflow Training Operator, apply some sample k8s ML training jobs, e.g. [for PyTorch](https://www.kubeflow.org/docs/components/training/user-guides/pytorch/) and [for Tensorflow](https://www.kubeflow.org/docs/components/training/user-guides/tensorflow/).

```sh
# PyTorch
kubectl create -f https://raw.githubusercontent.com/kubeflow/training-operator/master/examples/pytorch/simple.yaml
# TensorFlow
kubectl create -f https://raw.githubusercontent.com/kubeflow/training-operator/master/examples/tensorflow/simple.yaml
```
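Afterwards the created job resources and their pods can be inspected. A rough sketch (the pod name and namespace below are illustrative and depend on the sample manifests):

```sh
# Watch the custom resources managed by the Training Operator
kubectl get pytorchjobs,tfjobs --all-namespaces
# Follow the logs of a job pod once it is running (pod name and namespace are illustrative)
kubectl logs -f pytorch-simple-master-0 -n kubeflow
```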

### mlflow

14 changes: 13 additions & 1 deletion devops/terraform/main.tf
@@ -1,6 +1,6 @@
module "main" {
source = "git::https://github.com/MGTheTrain/gitops-ftw.git//devops/terraform?ref=main"
digital_product_affix = var.digital_product_affix
digital_product_affix = var.digital_product_affix_sa
environment = var.environment
resource_instance_number = var.resource_instance_number
location = var.location
@@ -9,4 +9,16 @@ module "main" {
sa_account_tier = var.sa_account_tier
sa_account_replication_type = var.sa_account_replication_type
sc_container_access_type = var.sc_container_access_type
}

module "acr" {
source = "../../terraform/modules/acr"
digital_product_affix = var.digital_product_affix_acr_module
environment = var.environment
resource_instance_number = var.resource_instance_number
location = var.location
team = var.team
number_of_container_registries = var.number_of_container_registries
acr_sku = var.acr_sku
acr_admin_enabled = var.acr_admin_enabled
}
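For local experimentation this root module can be initialized and applied the same way the workflow does, provided the usual azurerm authentication variables are exported. A minimal sketch (all service principal values are placeholders):

```sh
# Authenticate the azurerm provider via environment variables (placeholder values)
export ARM_SUBSCRIPTION_ID=<subscription-id>
export ARM_TENANT_ID=<tenant-id>
export ARM_CLIENT_ID=<client-id>
export ARM_CLIENT_SECRET=<client-secret>

terraform -chdir=devops/terraform init
terraform -chdir=devops/terraform apply --auto-approve
```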
13 changes: 13 additions & 0 deletions devops/terraform/output.tf
@@ -0,0 +1,13 @@
# ACR
output "container_registry_admin_username_list" {
value = module.acr.container_registry_admin_username_list
}

output "container_registry_admin_password_list" {
value = module.acr.container_registry_admin_password_list
sensitive = true
}

output "container_registry_login_server_list" {
value = module.acr.container_registry_login_server_list
}
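With `acr_admin_enabled` set, these outputs can feed a registry login for pushing internal Docker images. A sketch, assuming a single registry and `jq` being available (the list index is illustrative):

```sh
# Read the first registry's credentials from the terraform outputs
ACR_SERVER=$(terraform -chdir=devops/terraform output -json container_registry_login_server_list | jq -r '.[0]')
ACR_USER=$(terraform -chdir=devops/terraform output -json container_registry_admin_username_list | jq -r '.[0]')
ACR_PASS=$(terraform -chdir=devops/terraform output -json container_registry_admin_password_list | jq -r '.[0]')
echo "$ACR_PASS" | docker login "$ACR_SERVER" -u "$ACR_USER" --password-stdin
```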
33 changes: 29 additions & 4 deletions devops/terraform/variables.tf
@@ -1,7 +1,13 @@
# Azure Rg
variable "digital_product_affix" {
default = "gftfbe"
description = "The digital product affix."
variable "digital_product_affix_sa" {
default = "mlopsftwsa"
description = "The digital product affix of the Storage Account module."
type = string
}

variable "digital_product_affix_acr_module" {
default = "mlopsftwcr"
description = "The digital product affix of the ACR module."
type = string
}

@@ -18,7 +24,7 @@ variable "resource_instance_number" {
}

variable "location" {
default = "West Europe"
default = "Switzerland North"
description = "The geographic location in which to deploy."
type = string
}
@@ -52,4 +58,23 @@ variable "sc_container_access_type" {
default = "private"
description = "Container access type of the Storage Account Container"
type = string
}

# Azure Container Registry
variable "number_of_container_registries" {
default = 1
description = "The total number of Azure Container registries to deploy."
type = number
}

variable "acr_sku" {
description = "SKU for the Azure Container Registry"
type = string
default = "Basic"
}

variable "acr_admin_enabled" {
description = "Flag to enable admin user for the Azure Container Registry"
type = bool
default = true
}
Binary file added images/check-jupyter-instance-pod.PNG
Binary file modified images/resource-groups.PNG
Binary file added images/run-jupyter-notebook-example.PNG
11 changes: 10 additions & 1 deletion notebooks/coco-yolov4-training.ipynb
@@ -33,7 +33,16 @@
"metadata": {},
"outputs": [],
"source": [
"!python main.py"
"!python main.py --mode train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!python main.py --mode inference --image_path <>"
]
}
],
11 changes: 10 additions & 1 deletion notebooks/mnist-trainnig.ipynb
@@ -33,7 +33,16 @@
"metadata": {},
"outputs": [],
"source": [
"!python main.py"
"!python main.py --mode train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!python main.py --mode inference"
]
}
],
11 changes: 10 additions & 1 deletion notebooks/transformer-chat-bot.ipynb
@@ -33,7 +33,16 @@
"metadata": {},
"outputs": [],
"source": [
"!python main.py"
"!python main.py --mode train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!python main.py --mode inference"
]
}
],
9 changes: 9 additions & 0 deletions python/keras-mnist-training/Dockerfile
@@ -0,0 +1,9 @@
FROM python:3.9-slim

WORKDIR /app

COPY main.py /app/
COPY src /app/src
COPY requirements.txt /app/

RUN pip install -r requirements.txt
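Building and pushing the resulting training image to the provisioned ACR could then look roughly like this (the tag is illustrative and `$ACR_SERVER` refers to the login sketch above):

```sh
# Build and push the MNIST training image from the repository root
docker build -t "$ACR_SERVER/keras-mnist-training:latest" ./python/keras-mnist-training
docker push "$ACR_SERVER/keras-mnist-training:latest"
```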
9 changes: 9 additions & 0 deletions python/pt-yolo-coco-training/Dockerfile
@@ -0,0 +1,9 @@
FROM python:3.9-slim

WORKDIR /app

COPY main.py /app/
COPY src /app/src
COPY requirements.txt /app/

RUN pip install -r requirements.txt
2 changes: 1 addition & 1 deletion terraform/envs/sbx-k8s-configuration/README.md
@@ -23,7 +23,7 @@ No resources.
| <a name="input_acr_password"></a> [acr\_password](#input\_acr\_password) | Sets an ACR password. | `string` | `"TBD"` | no |
| <a name="input_acr_username"></a> [acr\_username](#input\_acr\_username) | Sets an ACR user name. | `string` | `"TBD"` | no |
| <a name="input_environment"></a> [environment](#input\_environment) | The environment. | `string` | `"sbx"` | no |
| <a name="input_ml_ops_tool"></a> [ml\_ops\_tool](#input\_ml\_ops\_tool) | String determining whether to install Argo CD or FluxCD. Viable options: [ kubeflow, fluxcd ] | `string` | `"kubeflow"` | no |
| <a name="input_ml_ops_tool"></a> [ml\_ops\_tool](#input\_ml\_ops\_tool) | String determining whether to install mlflow or none. Viable options: [ mlflow, kubeflow, none ]. The installation of Kubeflow will be managed externally through the continuous deployment (CD) workflow, as the Terraform modules and Kubernetes provider are either outdated or difficult to set up | `string` | `"none"` | no |

## Outputs
