From d16abb294a304cb6866eb2b83eeab1318933ba0e Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:05:15 -0500 Subject: [PATCH 01/39] Inital commit to add GH action to generate report --- .github/workflows/report.yaml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/report.yaml diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml new file mode 100644 index 0000000..96758af --- /dev/null +++ b/.github/workflows/report.yaml @@ -0,0 +1,31 @@ +name: Generate Data Usage Report + +on: + pull_request: + branches: + - main + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # TODO param region + aws-region: us-east-2 + + - name: Configure kubectl with AWS EKS + # TODO param name, region + run: | + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 + + - name: Sanity check + run: | + kubectl get pods -n jupyterhub From 713d64cbd12e5aa08b6e4c448f1e04ee15adc6b7 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:13:59 -0500 Subject: [PATCH 02/39] Assume Jupyterhub Provisioning Role --- .github/workflows/report.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 96758af..829dc77 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -21,6 +21,16 @@ jobs: # TODO param region aws-region: us-east-2 + - name: Assume JupyterhubProvisioningRole + # TODO param ProvisioningRoleARN and name ^ + run: | + ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" + CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") + export 
AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + + - name: Configure kubectl with AWS EKS # TODO param name, region run: | From 519360c7b004d9807fde9e524cca853b115facce Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:16:02 -0500 Subject: [PATCH 03/39] Fixup: indent --- .github/workflows/report.yaml | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 829dc77..162b09f 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -10,32 +10,32 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v3 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # TODO param region - aws-region: us-east-2 - - - name: Assume JupyterhubProvisioningRole - # TODO param ProvisioningRoleARN and name ^ - run: | - ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" - CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") - export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') - export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - - - name: Configure kubectl with AWS EKS - # TODO param name, region - run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 - - - name: Sanity check - run: | - kubectl get pods -n jupyterhub + - name: Checkout code + uses: actions/checkout@v3 + + - name: Configure AWS credentials + uses: 
aws-actions/configure-aws-credentials@v3 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # TODO param region + aws-region: us-east-2 + + - name: Assume JupyterhubProvisioningRole + # TODO param ProvisioningRoleARN and name ^ + run: | + ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" + CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") + export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + + + - name: Configure kubectl with AWS EKS + # TODO param name, region + run: | + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 + + - name: Sanity check + run: | + kubectl get pods -n jupyterhub From e6f481441b6512f51f17c22e83a608e66e9551ba Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:17:03 -0500 Subject: [PATCH 04/39] Rename job --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 162b09f..5b1f053 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -6,7 +6,7 @@ on: - main jobs: - deploy: + generate_data_usage_report: runs-on: ubuntu-latest steps: From 72496f4a7846ef70856408c154f89651ee8a5d55 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:23:29 -0500 Subject: [PATCH 05/39] Add assumed role to update-kubeconfig --- .github/workflows/report.yaml | 4 ++-- README.md | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 5b1f053..8229875 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -32,9 +32,9 @@ jobs: - name: Configure kubectl 
with AWS EKS - # TODO param name, region + # TODO param name, region role-arn run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn arn:aws:iam::278212569472:role/JupyterhubProvisioningRole - name: Sanity check run: | diff --git a/README.md b/README.md index 5af6edf..4337f54 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # Dandihub +TODO + - add provisioning role to cluser :q + - + This Terraform blueprint creates a Kubernetes environment (EKS) and installs JupyterHub. Based on [AWS Data on EKS JupyterHub](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/jupyterhub). ## Table of Contents From 8428d3a2025421775320f39542f4cb9057559499 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:26:10 -0500 Subject: [PATCH 06/39] No need to add ProvisioningRole to masters --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index 4337f54..5af6edf 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,5 @@ # Dandihub -TODO - - add provisioning role to cluser :q - - - This Terraform blueprint creates a Kubernetes environment (EKS) and installs JupyterHub. Based on [AWS Data on EKS JupyterHub](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/jupyterhub). 
## Table of Contents From e170b593494033ec743255a6e823dea156d85e60 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:47:01 -0500 Subject: [PATCH 07/39] Deploy a pod to the cluster, and schedule with Karpenter --- .github/manifests/hello-world.yaml | 20 ++++++++++++++++++++ .github/workflows/report.yaml | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 .github/manifests/hello-world.yaml diff --git a/.github/manifests/hello-world.yaml b/.github/manifests/hello-world.yaml new file mode 100644 index 0000000..1977f33 --- /dev/null +++ b/.github/manifests/hello-world.yaml @@ -0,0 +1,20 @@ +# manifests/hello-world-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: hello-world-pod +spec: + containers: + - name: hello + image: busybox + command: ['sh', '-c', 'echo Hello, World! && sleep 30'] + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 8229875..d8dff65 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -39,3 +39,23 @@ jobs: - name: Sanity check run: | kubectl get pods -n jupyterhub + + # Step 4: Deploy Hello World Pod from manifest + - name: Deploy Hello World Pod + run: | + kubectl apply -f manifests/hello-world-pod.yaml + + # Step 5: Wait for Pod to Complete + - name: Wait for Hello World Pod to complete + run: | + kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=60s + + # Step 6: Get Pod Logs to verify it ran successfully + - name: Get Hello World Pod logs + run: | + kubectl logs hello-world-pod + + # Step 7: Cleanup - Delete the Pod + - name: Delete Hello World Pod + run: | + kubectl delete pod hello-world-pod From bfce04694ef3b4be7fd00a54ae1738f2a02f169e Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 
15:48:20 -0500 Subject: [PATCH 08/39] Fixup: correct path to pod manifest --- .github/workflows/report.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index d8dff65..a498d99 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -39,11 +39,11 @@ jobs: - name: Sanity check run: | kubectl get pods -n jupyterhub - + # Step 4: Deploy Hello World Pod from manifest - name: Deploy Hello World Pod run: | - kubectl apply -f manifests/hello-world-pod.yaml + kubectl apply -f .github/manifests/hello-world-pod.yaml # Step 5: Wait for Pod to Complete - name: Wait for Hello World Pod to complete From 0993129e3452e35980279e0fa14a99d1b76203fa Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:49:42 -0500 Subject: [PATCH 09/39] Fixup again ugh, rename file --- .github/manifests/{hello-world.yaml => hello-world-pod.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/manifests/{hello-world.yaml => hello-world-pod.yaml} (100%) diff --git a/.github/manifests/hello-world.yaml b/.github/manifests/hello-world-pod.yaml similarity index 100% rename from .github/manifests/hello-world.yaml rename to .github/manifests/hello-world-pod.yaml From 87027d220f91ce52f33fc59617ab3d1a96595935 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:59:09 -0500 Subject: [PATCH 10/39] Delete Pod even if previous step times out (Also increase timeout) --- .github/workflows/report.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index a498d99..bc1d088 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -48,14 +48,17 @@ jobs: # Step 5: Wait for Pod to Complete - name: Wait for Hello World Pod to complete run: | - kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=60s + kubectl wait 
--for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes + continue-on-error: true # Allow the workflow to continue even if this step fails - # Step 6: Get Pod Logs to verify it ran successfully + # Step 6: Get Pod Logs to verify it ran successfully, only if Step 5 succeeds - name: Get Hello World Pod logs run: | kubectl logs hello-world-pod + if: ${{ success() }} # Only run this step if the previous step was successful - # Step 7: Cleanup - Delete the Pod + # Step 7: Cleanup - Always run this step, even if previous steps fail - name: Delete Hello World Pod run: | kubectl delete pod hello-world-pod + if: ${{ always() }} # Always run this step, even if other steps fail From 686f6868eff5b99e866e83a1c1cf4fc32f5c2f82 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 11 Oct 2024 12:52:23 -0500 Subject: [PATCH 11/39] Hack out initial du --- .github/manifests/disk-usage-report-job.yaml | 23 ++++++ .github/scripts/du.py | 55 ++++++++++++++ .github/workflows/report.yaml | 77 ++++++++++++++------ NEXTSTEPS | 32 ++++++++ images/Dockerfile.dandihub_report_generator | 15 ++++ 5 files changed, 179 insertions(+), 23 deletions(-) create mode 100644 .github/manifests/disk-usage-report-job.yaml create mode 100755 .github/scripts/du.py create mode 100644 NEXTSTEPS create mode 100644 images/Dockerfile.dandihub_report_generator diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml new file mode 100644 index 0000000..2c94536 --- /dev/null +++ b/.github/manifests/disk-usage-report-job.yaml @@ -0,0 +1,23 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: disk-usage-report-job +spec: + template: + metadata: + labels: + app: disk-usage-report + spec: + containers: + - name: disk-usage-report + image: IMAGE_PLACEHOLDER + restartPolicy: Never + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: 
"user" + effect: "NoSchedule" diff --git a/.github/scripts/du.py b/.github/scripts/du.py new file mode 100755 index 0000000..29bccad --- /dev/null +++ b/.github/scripts/du.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +import os +import subprocess +import sys +import json + +OUTPUT_FILE = "du_report.json" +SIZE_THRESHOLD_GB = 1 +SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * 1024 * 1024 * 1024 + +# Function to calculate disk usage of a directory in bytes +def get_disk_usage_bytes(path): + result = subprocess.run(['du', '-sb', path], capture_output=True, text=True) + size_str = result.stdout.split()[0] # Get the size in bytes (du -sb gives size in bytes) + return int(size_str) + +# Function to convert bytes to a human-readable format (e.g., KB, MB, GB) +def bytes_to_human_readable(size_in_bytes): + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if size_in_bytes < 1024: + return f"{size_in_bytes:.2f} {unit}" + size_in_bytes /= 1024 + +def prepare_report(directory): + report = {} + # List user home dirs in the directory and calculate disk usage + for user_dir in os.listdir(directory): + user_path = os.path.join(directory, user_dir) + if os.path.isdir(user_path): + disk_usage_bytes = get_disk_usage_bytes(user_path) + report[user_dir] = { + "disk_usage_bytes": disk_usage_bytes + } + if disk_usage_bytes > SIZE_THRESHOLD_BYTES: + # TODO: Placeholder for other actions + report[user_dir]["action"] = f"Directory size exceeds {SIZE_THRESHOLD_BYTES / (1024**3):.2f}GB, further action taken." + else: + report[user_dir]["action"] = "No action required." 
+ + for user, data in report.items(): + data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) + + with open(OUTPUT_FILE, 'w') as f: + json.dump(report, f, indent=4) + + print(f"Disk usage report generated at {OUTPUT_FILE}") + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: du.py ") + else: + directory = sys.argv[1] + prepare_report(directory) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index bc1d088..20ea815 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -13,52 +13,83 @@ jobs: - name: Checkout code uses: actions/checkout@v3 + - name: Log in to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: + context: . + file: images/Dockerfile.dandihub_report_generator + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub_report_generator:latest + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v3 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # TODO param region aws-region: us-east-2 - - name: Assume JupyterhubProvisioningRole - # TODO param ProvisioningRoleARN and name ^ + - name: Assume ProvisioningRole run: | - ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" - CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") + CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - name: Configure kubectl with AWS EKS - # TODO 
param name, region role-arn run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn arn:aws:iam::278212569472:role/JupyterhubProvisioningRole + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} + # TODO remove - name: Sanity check run: | kubectl get pods -n jupyterhub - # Step 4: Deploy Hello World Pod from manifest - - name: Deploy Hello World Pod + - name: Replace image placeholder in manifest + run: | + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + + - name: Deploy Disk Usage Report Job + run: | + kubectl apply -f .github/manifests/disk-usage-report-job.yaml + + # TODO should timeout be longer? + - name: Wait for Disk Usage Report Job to complete + run: | + kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + continue-on-error: true + + - name: Save Pod logs to file + run: | + POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + kubectl logs $POD_NAME > disk_usage_report.log + continue-on-error: true + + # continue-on-error for previous steps so we delete the job + - name: Delete Disk Usage Report Job run: | - kubectl apply -f .github/manifests/hello-world-pod.yaml + kubectl delete job disk-usage-report-job - # Step 5: Wait for Pod to Complete - - name: Wait for Hello World Pod to complete + - name: Clone dandi-hub-usage-reports repository run: | - kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes - continue-on-error: true # Allow the workflow to continue even if this step fails + git clone https://github.com/dandi/dandi-hub-usage-reports.git + cd dandi-hub-usage-reports - # Step 6: Get Pod Logs to verify it ran successfully, only if Step 5 succeeds - - name: Get Hello World Pod logs + - name: Copy log file to repository run: | - kubectl logs hello-world-pod 
- if: ${{ success() }} # Only run this step if the previous step was successful + DATE=$(date +'%Y-%m-%d') + mv ../disk_usage_report.log $DATE_disk_usage_report.log - # Step 7: Cleanup - Always run this step, even if previous steps fail - - name: Delete Hello World Pod + # Step 13: Commit and push logs to the repository + - name: Commit and push logs run: | - kubectl delete pod hello-world-pod - if: ${{ always() }} # Always run this step, even if other steps fail + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + git add disk_usage_report.log + git commit -m "Add disk usage report log" + git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git diff --git a/NEXTSTEPS b/NEXTSTEPS new file mode 100644 index 0000000..44d0177 --- /dev/null +++ b/NEXTSTEPS @@ -0,0 +1,32 @@ +DONE + - Set AWS_ROLE ARN secret + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + +TODO: + - Create Dockerhub Service account + - set username & token as secrets + - Create Github CI account + - Docker Image Tagging: + - The Docker image is tagged with latest. For better version control, consider using commit SHA or version numbers. + - Log Retrieval: + - The logs from the pod are retrieved to help you verify the script's output. + - Cleanup: + - Deleting the Job ensures that no resources are left running after the workflow completes. + +By making these updates, your workflow will now: + + Include your du.py script in a Docker image. + Build and push this image to DockerHub. + Deploy a Kubernetes Job to your EKS cluster that runs the script. + Wait for the Job to complete and retrieve logs. + Clean up resources after execution. + +Feel free to ask if you need further assistance or clarification on any of these steps! 
+ + +- Get image pushing +- create private gh repository under dandi org for reports + + + diff --git a/images/Dockerfile.dandihub_report_generator b/images/Dockerfile.dandihub_report_generator new file mode 100644 index 0000000..5f46008 --- /dev/null +++ b/images/Dockerfile.dandihub_report_generator @@ -0,0 +1,15 @@ +FROM python:3.9-slim + +# Set the working directory +WORKDIR /app + +# Copy the du.py script into the container +COPY .github/scripts/du.py /app/du.py + +# Install required packages +RUN apt-get update \ + && apt-get install -y coreutils \ + && rm -rf /var/lib/apt/lists/* + +# Set the entrypoint to the script +ENTRYPOINT ["python3", "/app/du.py"] From ff52971049feb003996ca3ad328e4733b8c342b3 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:38:21 -0600 Subject: [PATCH 12/39] tmp comment out job deployment, test dockerhub build --- .github/workflows/report.yaml | 132 +++++++++++++++++----------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 20ea815..c58da48 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -27,69 +27,69 @@ jobs: push: true tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub_report_generator:latest - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v3 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-2 - - - name: Assume ProvisioningRole - run: | - CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") - export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') - export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - - name: Configure kubectl with AWS EKS - run: | - aws eks 
update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} - - # TODO remove - - name: Sanity check - run: | - kubectl get pods -n jupyterhub - - - name: Replace image placeholder in manifest - run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml - - - name: Deploy Disk Usage Report Job - run: | - kubectl apply -f .github/manifests/disk-usage-report-job.yaml - - # TODO should timeout be longer? - - name: Wait for Disk Usage Report Job to complete - run: | - kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - continue-on-error: true - - - name: Save Pod logs to file - run: | - POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - kubectl logs $POD_NAME > disk_usage_report.log - continue-on-error: true - - # continue-on-error for previous steps so we delete the job - - name: Delete Disk Usage Report Job - run: | - kubectl delete job disk-usage-report-job - - - name: Clone dandi-hub-usage-reports repository - run: | - git clone https://github.com/dandi/dandi-hub-usage-reports.git - cd dandi-hub-usage-reports - - - name: Copy log file to repository - run: | - DATE=$(date +'%Y-%m-%d') - mv ../disk_usage_report.log $DATE_disk_usage_report.log - - # Step 13: Commit and push logs to the repository - - name: Commit and push logs - run: | - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - git add disk_usage_report.log - git commit -m "Add disk usage report log" - git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git + # - name: Configure AWS credentials + # uses: aws-actions/configure-aws-credentials@v3 + # with: + # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # aws-region: us-east-2 + 
# + # - name: Assume ProvisioningRole + # run: | + # CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") + # export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + # export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + # export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + # + # - name: Configure kubectl with AWS EKS + # run: | + # aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} + # + # # TODO remove + # - name: Sanity check + # run: | + # kubectl get pods -n jupyterhub + # + # - name: Replace image placeholder in manifest + # run: | + # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + # + # - name: Deploy Disk Usage Report Job + # run: | + # kubectl apply -f .github/manifests/disk-usage-report-job.yaml + # + # # TODO should timeout be longer? 
+ # - name: Wait for Disk Usage Report Job to complete + # run: | + # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + # continue-on-error: true + # + # - name: Save Pod logs to file + # run: | + # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + # kubectl logs $POD_NAME > disk_usage_report.log + # continue-on-error: true + # + # # continue-on-error for previous steps so we delete the job + # - name: Delete Disk Usage Report Job + # run: | + # kubectl delete job disk-usage-report-job + # + # - name: Clone dandi-hub-usage-reports repository + # run: | + # git clone https://github.com/dandi/dandi-hub-usage-reports.git + # cd dandi-hub-usage-reports + # + # - name: Copy log file to repository + # run: | + # DATE=$(date +'%Y-%m-%d') + # mv ../disk_usage_report.log $DATE_disk_usage_report.log + # + # # Step 13: Commit and push logs to the repository + # - name: Commit and push logs + # run: | + # git config --global user.name "GitHub Actions" + # git config --global user.email "actions@github.com" + # git add disk_usage_report.log + # git commit -m "Add disk usage report log" + # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From ca6db8943f7da07f8d158a9af5b1983cebe0a8b9 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:42:29 -0600 Subject: [PATCH 13/39] Fixup hyphens for image name --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index c58da48..9bf5526 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -25,7 +25,7 @@ jobs: context: . 
file: images/Dockerfile.dandihub_report_generator push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub_report_generator:latest + tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest # - name: Configure AWS credentials # uses: aws-actions/configure-aws-credentials@v3 From d228f9daf4a1d4af8a13b1b27b0bb3ceeb4cc02c Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:57:24 -0600 Subject: [PATCH 14/39] Write file to output location --- .github/scripts/du.py | 4 +- .github/workflows/report.yaml | 132 +++++++++++++++++----------------- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 29bccad..260bd07 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -5,7 +5,7 @@ import sys import json -OUTPUT_FILE = "du_report.json" +OUTPUT_FILE = "/output/du_report.json" SIZE_THRESHOLD_GB = 1 SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * 1024 * 1024 * 1024 @@ -44,7 +44,7 @@ def prepare_report(directory): with open(OUTPUT_FILE, 'w') as f: json.dump(report, f, indent=4) - print(f"Disk usage report generated at {OUTPUT_FILE}") + print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") if __name__ == "__main__": diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 9bf5526..cd7f8f0 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -27,69 +27,69 @@ jobs: push: true tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest - # - name: Configure AWS credentials - # uses: aws-actions/configure-aws-credentials@v3 - # with: - # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # aws-region: us-east-2 - # - # - name: Assume ProvisioningRole - # run: | - # CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") - # export AWS_ACCESS_KEY_ID=$(echo 
$CREDS | jq -r '.Credentials.AccessKeyId') - # export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') - # export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - # - # - name: Configure kubectl with AWS EKS - # run: | - # aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} - # - # # TODO remove - # - name: Sanity check - # run: | - # kubectl get pods -n jupyterhub - # - # - name: Replace image placeholder in manifest - # run: | - # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml - # - # - name: Deploy Disk Usage Report Job - # run: | - # kubectl apply -f .github/manifests/disk-usage-report-job.yaml - # - # # TODO should timeout be longer? - # - name: Wait for Disk Usage Report Job to complete - # run: | - # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - # continue-on-error: true - # - # - name: Save Pod logs to file - # run: | - # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - # kubectl logs $POD_NAME > disk_usage_report.log - # continue-on-error: true - # - # # continue-on-error for previous steps so we delete the job - # - name: Delete Disk Usage Report Job - # run: | - # kubectl delete job disk-usage-report-job - # - # - name: Clone dandi-hub-usage-reports repository - # run: | - # git clone https://github.com/dandi/dandi-hub-usage-reports.git - # cd dandi-hub-usage-reports - # - # - name: Copy log file to repository - # run: | - # DATE=$(date +'%Y-%m-%d') - # mv ../disk_usage_report.log $DATE_disk_usage_report.log - # - # # Step 13: Commit and push logs to the repository - # - name: Commit and push logs - # run: | - # git config --global user.name "GitHub Actions" - # git config --global user.email "actions@github.com" - # git add disk_usage_report.log - # git 
commit -m "Add disk usage report log" - # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Assume ProvisioningRole + run: | + CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") + export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + + - name: Configure kubectl with AWS EKS + run: | + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} + + # TODO remove + - name: Sanity check + run: | + kubectl get pods -n jupyterhub + + - name: Replace image placeholder in manifest + run: | + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + + - name: Deploy Disk Usage Report Job + run: | + kubectl apply -f .github/manifests/disk-usage-report-job.yaml + + # TODO should timeout be longer? 
+ - name: Wait for Disk Usage Report Job to complete + run: | + kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + continue-on-error: true + + - name: Save Pod logs to file + run: | + POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + kubectl logs $POD_NAME > disk_usage_report.log + continue-on-error: true + + # continue-on-error for previous steps so we delete the job + - name: Delete Disk Usage Report Job + run: | + kubectl delete job disk-usage-report-job + + - name: Clone dandi-hub-usage-reports repository + run: | + git clone https://github.com/dandi/dandi-hub-usage-reports.git + cd dandi-hub-usage-reports + + - name: Copy log file to repository + run: | + DATE=$(date +'%Y-%m-%d') + mv ../disk_usage_report.log $DATE_disk_usage_report.log + + # Step 13: Commit and push logs to the repository + - name: Commit and push logs + run: | + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + git add disk_usage_report.log + git commit -m "Add disk usage report log" + git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From 68f707ff011f651eb3cf7c4f69fa4ef48a99c150 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:59:49 -0600 Subject: [PATCH 15/39] use kubectl cp to retrieve report --- .github/workflows/report.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index cd7f8f0..968bd5a 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -64,10 +64,10 @@ jobs: kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s continue-on-error: true - - name: Save Pod logs to file + - name: Retrieve generated report file run: | POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') 
- kubectl logs $POD_NAME > disk_usage_report.log + kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub continue-on-error: true # continue-on-error for previous steps so we delete the job @@ -80,16 +80,16 @@ jobs: git clone https://github.com/dandi/dandi-hub-usage-reports.git cd dandi-hub-usage-reports - - name: Copy log file to repository + - name: Copy report file to repository run: | DATE=$(date +'%Y-%m-%d') - mv ../disk_usage_report.log $DATE_disk_usage_report.log + mv ../du_report.json $DATE_du_report.json - # Step 13: Commit and push logs to the repository - - name: Commit and push logs + # Step 13: Commit and push report to the repository + - name: Commit and push report run: | git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" - git add disk_usage_report.log - git commit -m "Add disk usage report log" + git add $DATE_du_report.json + git commit -m "Add disk usage report for $DATE" git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From ad6b589bfd64720204f9d5f05ea87f833031dd74 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:08:57 -0600 Subject: [PATCH 16/39] Combine run blocks to use vars --- .github/workflows/report.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 968bd5a..d6837c3 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -78,16 +78,12 @@ jobs: - name: Clone dandi-hub-usage-reports repository run: | git clone https://github.com/dandi/dandi-hub-usage-reports.git - cd dandi-hub-usage-reports - - name: Copy report file to repository + - name: Copy report file to repository, commit and push report run: | + cd dandi-hub-usage-reports DATE=$(date +'%Y-%m-%d') mv ../du_report.json $DATE_du_report.json - - # Step 13: Commit and push report to the repository - - name: Commit and push report - run: | 
git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" git add $DATE_du_report.json From f18e8b7583ddcbada46d32461d6cfa85218fe427 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:09:29 -0600 Subject: [PATCH 17/39] Mount efs and pass arg to du script --- .github/manifests/disk-usage-report-job.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 2c94536..696d8f4 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -11,6 +11,11 @@ spec: containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER + args: + - "/home/" + volumeMounts: + - name: persistent-storage + mountPath: "/home/" restartPolicy: Never nodeSelector: NodeGroupType: default @@ -21,3 +26,7 @@ spec: operator: "Equal" value: "user" effect: "NoSchedule" + volumes: + - name: persistent-storage + persistentVolumeClaim: + claimName: efs-persist From 387cfc1240bc67be1fb3564377d9f13de823f857 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:15:08 -0600 Subject: [PATCH 18/39] Comment out repo pushing, lets see if the report runs --- .github/workflows/report.yaml | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index d6837c3..a12adbd 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -68,6 +68,7 @@ jobs: run: | POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub + cat du_report.json continue-on-error: true # continue-on-error for previous steps so we delete the job @@ -75,17 +76,17 @@ jobs: run: | kubectl delete job disk-usage-report-job - - name: Clone dandi-hub-usage-reports repository 
- run: | - git clone https://github.com/dandi/dandi-hub-usage-reports.git - - - name: Copy report file to repository, commit and push report - run: | - cd dandi-hub-usage-reports - DATE=$(date +'%Y-%m-%d') - mv ../du_report.json $DATE_du_report.json - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - git add $DATE_du_report.json - git commit -m "Add disk usage report for $DATE" - git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git + # - name: Clone dandi-hub-usage-reports repository + # run: | + # git clone https://github.com/dandi/dandi-hub-usage-reports.git + # + # - name: Copy report file to repository, commit and push report + # run: | + # cd dandi-hub-usage-reports + # DATE=$(date +'%Y-%m-%d') + # mv ../du_report.json $DATE_du_report.json + # git config --global user.name "GitHub Actions" + # git config --global user.email "actions@github.com" + # git add $DATE_du_report.json + # git commit -m "Add disk usage report for $DATE" + # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From 04b4193db3f205038d1d081c53eddcffc545aff8 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:22:11 -0600 Subject: [PATCH 19/39] Restrict job to asmacdo for testing --- .github/manifests/disk-usage-report-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 696d8f4..c7f60a4 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -12,7 +12,7 @@ spec: - name: disk-usage-report image: IMAGE_PLACEHOLDER args: - - "/home/" + - "/home/asmacdo" volumeMounts: - name: persistent-storage mountPath: "/home/" From a443081ce5a1ae8bbc60532af88c4ccdeb474858 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:30:46 -0600 Subject: [PATCH 20/39] Sanity 
check. Just list the directories --- .github/scripts/du.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 260bd07..12e0c0f 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -41,6 +41,7 @@ def prepare_report(directory): for user, data in report.items(): data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) + with open(OUTPUT_FILE, 'w') as f: json.dump(report, f, indent=4) @@ -51,5 +52,9 @@ def prepare_report(directory): if len(sys.argv) != 2: print("Usage: du.py ") else: - directory = sys.argv[1] - prepare_report(directory) + path = sys.argv[1] + directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] + + with open(OUTPUT_FILE, 'w') as f: + f.write("\n".join(directories)) + # prepare_report(directory) From 99ac2646b0779dc2ee6178d7f20091ccc8dc745f Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:43:17 -0600 Subject: [PATCH 21/39] Job was deployed, but never assigned to node, back to sanity check --- .github/workflows/report.yaml | 57 +++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index a12adbd..5889c90 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -50,32 +50,51 @@ jobs: run: | kubectl get pods -n jupyterhub - - name: Replace image placeholder in manifest + - name: Deploy Hello World Pod run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + kubectl apply -f .github/manifests/hello-world-pod.yaml - - name: Deploy Disk Usage Report Job + - name: Wait for Hello World Pod to complete run: | - kubectl apply -f .github/manifests/disk-usage-report-job.yaml + kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes + 
continue-on-error: true # Allow the workflow to continue even if this step fails - # TODO should timeout be longer? - - name: Wait for Disk Usage Report Job to complete + - name: Get Hello World Pod logs run: | - kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - continue-on-error: true + kubectl logs hello-world-pod + if: ${{ success() }} # Only run this step if the previous step was successful - - name: Retrieve generated report file + - name: Delete Hello World Pod run: | - POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub - cat du_report.json - continue-on-error: true - - # continue-on-error for previous steps so we delete the job - - name: Delete Disk Usage Report Job - run: | - kubectl delete job disk-usage-report-job - + kubectl delete pod hello-world-pod + if: ${{ always() }} # Always run this step, even if other steps fail + # + # - name: Replace image placeholder in manifest + # run: | + # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + # + # - name: Deploy Disk Usage Report Job + # run: | + # kubectl apply -f .github/manifests/disk-usage-report-job.yaml + # + # # TODO should timeout be longer? 
+ # - name: Wait for Disk Usage Report Job to complete + # run: | + # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + # continue-on-error: true + # + # - name: Retrieve generated report file + # run: | + # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + # kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub + # cat du_report.json + # continue-on-error: true + # + # # continue-on-error for previous steps so we delete the job + # - name: Delete Disk Usage Report Job + # run: | + # kubectl delete job disk-usage-report-job + # # - name: Clone dandi-hub-usage-reports repository # run: | # git clone https://github.com/dandi/dandi-hub-usage-reports.git From 6ee89b2eeebb6732115abc2a0c47d5856f7930b2 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:55:31 -0600 Subject: [PATCH 22/39] change from job to pod --- .github/manifests/disk-usage-report-job.yaml | 51 ++++++-------- .github/workflows/report.yaml | 73 ++++++++++---------- 2 files changed, 59 insertions(+), 65 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index c7f60a4..36ded0f 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -1,32 +1,27 @@ -apiVersion: batch/v1 -kind: Job +apiVersion: v1 +kind: Pod metadata: name: disk-usage-report-job spec: - template: - metadata: - labels: - app: disk-usage-report - spec: - containers: - - name: disk-usage-report - image: IMAGE_PLACEHOLDER - args: - - "/home/asmacdo" - volumeMounts: - - name: persistent-storage - mountPath: "/home/" - restartPolicy: Never - nodeSelector: - NodeGroupType: default - NodePool: default - hub.jupyter.org/node-purpose: user - tolerations: - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - volumes: + containers: + - name: disk-usage-report 
+ image: IMAGE_PLACEHOLDER + args: + - "/home/asmacdo" + volumeMounts: - name: persistent-storage - persistentVolumeClaim: - claimName: efs-persist + mountPath: "/home/" + restartPolicy: Never + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + volumes: + - name: persistent-storage + persistentVolumeClaim: + claimName: efs-persist diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 5889c90..50168ff 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -50,51 +50,50 @@ jobs: run: | kubectl get pods -n jupyterhub - - name: Deploy Hello World Pod - run: | - kubectl apply -f .github/manifests/hello-world-pod.yaml - - - name: Wait for Hello World Pod to complete - run: | - kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes - continue-on-error: true # Allow the workflow to continue even if this step fails - - - name: Get Hello World Pod logs - run: | - kubectl logs hello-world-pod - if: ${{ success() }} # Only run this step if the previous step was successful - - - name: Delete Hello World Pod - run: | - kubectl delete pod hello-world-pod - if: ${{ always() }} # Always run this step, even if other steps fail - # - # - name: Replace image placeholder in manifest - # run: | - # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml - # - # - name: Deploy Disk Usage Report Job + # - name: Deploy Hello World Pod # run: | - # kubectl apply -f .github/manifests/disk-usage-report-job.yaml + # kubectl apply -f .github/manifests/hello-world-pod.yaml # - # # TODO should timeout be longer? 
- # - name: Wait for Disk Usage Report Job to complete + # - name: Wait for Hello World Pod to complete # run: | - # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - # continue-on-error: true + # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes + # continue-on-error: true # Allow the workflow to continue even if this step fails # - # - name: Retrieve generated report file + # - name: Get Hello World Pod logs # run: | - # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - # kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub - # cat du_report.json - # continue-on-error: true + # kubectl logs hello-world-pod + # if: ${{ success() }} # Only run this step if the previous step was successful # - # # continue-on-error for previous steps so we delete the job - # - name: Delete Disk Usage Report Job + # - name: Delete Hello World Pod # run: | - # kubectl delete job disk-usage-report-job + # kubectl delete pod hello-world-pod + # if: ${{ always() }} # Always run this step, even if other steps fail # + - name: Replace image placeholder in manifest + run: | + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + + - name: Deploy Disk Usage Report Job Pod + run: | + kubectl apply -f .github/manifests/disk-usage-report-job.yaml + + # TODO should timeout be longer? 
+ - name: Wait for Disk Usage Report Job to complete + run: | + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=300s + continue-on-error: true + + - name: Retrieve generated report file + run: | + kubectl cp disk-usage-report-job:/output/du_report.json du_report.json + cat du_report.json + continue-on-error: true + + # continue-on-error for previous steps so we delete the job + - name: Delete Disk Usage Report Job + run: | + kubectl delete pod disk-usage-report-job + # - name: Clone dandi-hub-usage-reports repository # run: | # git clone https://github.com/dandi/dandi-hub-usage-reports.git From a8f6ed30157424af1b90192a32c04ccb7ed4bf92 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:00:07 -0600 Subject: [PATCH 23/39] deploy pod to same namespace as pvc --- .github/manifests/disk-usage-report-job.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 36ded0f..7a5e424 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Pod metadata: name: disk-usage-report-job + namespace: jupyterhub spec: containers: - name: disk-usage-report @@ -25,3 +26,4 @@ spec: - name: persistent-storage persistentVolumeClaim: claimName: efs-persist + From 664853b13abef65e910667d3306fc6554cb0df13 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:04:29 -0600 Subject: [PATCH 24/39] Use ns in action --- .github/workflows/report.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 50168ff..de0ad87 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,19 +80,19 @@ jobs: # TODO should timeout be longer? 
- name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=300s + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=30s -n jupyterhub continue-on-error: true - name: Retrieve generated report file run: | - kubectl cp disk-usage-report-job:/output/du_report.json du_report.json + kubectl cp disk-usage-report-job:/output/du_report.json du_report.json -n jupyterhub cat du_report.json continue-on-error: true # continue-on-error for previous steps so we delete the job - name: Delete Disk Usage Report Job run: | - kubectl delete pod disk-usage-report-job + kubectl delete pod disk-usage-report-job -n jupyterhub # - name: Clone dandi-hub-usage-reports repository # run: | From e35c97439adc430f9fb1a113ed1db5ab2f8fc2ec Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:09:33 -0600 Subject: [PATCH 25/39] increase timeout to 60s job shouldnt take that long, but this is wall time, includes docker pull, etc --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index de0ad87..4bcf1b4 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,7 +80,7 @@ jobs: # TODO should timeout be longer? 
- name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=30s -n jupyterhub + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=60s -n jupyterhub continue-on-error: true - name: Retrieve generated report file From a8af5f278231575cac52a5575a4ada4fc89ab1a4 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:15:26 -0600 Subject: [PATCH 26/39] fixup: image name in manifest --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 4bcf1b4..170841f 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -71,7 +71,7 @@ jobs: # - name: Replace image placeholder in manifest run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml - name: Deploy Disk Usage Report Job Pod run: | From 024cf6e08a55cdffde89435e181dd5b939d6dabd Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:18:53 -0600 Subject: [PATCH 27/39] increase timeout to 150 took almost 60 sec to start up --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 170841f..04b264c 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,7 +80,7 @@ jobs: # TODO should timeout be longer? 
- name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=60s -n jupyterhub + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=150s -n jupyterhub continue-on-error: true - name: Retrieve generated report file From 49c346ea840b1606612482ae4e88c5f004aa8236 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:29:59 -0600 Subject: [PATCH 28/39] override entrypoint so i can debug with exec --- .github/manifests/disk-usage-report-job.yaml | 5 +++-- .github/workflows/report.yaml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 7a5e424..e2e81a3 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -7,8 +7,9 @@ spec: containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER - args: - - "/home/asmacdo" + command: ["/bin/sh", "-c", "sleep 300"] + # args: + # - "/home/asmacdo" volumeMounts: - name: persistent-storage mountPath: "/home/" diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 04b264c..34241ee 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,7 +80,7 @@ jobs: # TODO should timeout be longer? 
- name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=150s -n jupyterhub + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=360s -n jupyterhub continue-on-error: true - name: Retrieve generated report file From 0191c858bd006e4608f08dea5ad0d90a5c1ba180 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:31:45 -0600 Subject: [PATCH 29/39] bound /home actually meant path was /home/home/asmacdo --- .github/manifests/disk-usage-report-job.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index e2e81a3..81ff08d 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -7,12 +7,12 @@ spec: containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER - command: ["/bin/sh", "-c", "sleep 300"] - # args: - # - "/home/asmacdo" + args: + - "/home/asmacdo" volumeMounts: - name: persistent-storage - mountPath: "/home/" + mountPath: "/home" + subPath: "home" restartPolicy: Never nodeSelector: NodeGroupType: default From 3eb9157f6f8d15e4cb3615fdb9cdef112c88084b Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:43:35 -0600 Subject: [PATCH 30/39] Create output dir prior to writing report --- .github/scripts/du.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 12e0c0f..0b2ceb7 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -41,7 +41,7 @@ def prepare_report(directory): for user, data in report.items(): data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) - + os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) with open(OUTPUT_FILE, 'w') as f: json.dump(report, f, indent=4) @@ -55,6 +55,7 @@ def prepare_report(directory): path = 
sys.argv[1] directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] + os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) with open(OUTPUT_FILE, 'w') as f: f.write("\n".join(directories)) # prepare_report(directory) From 676a00e2e1d18cc0548443a3124af482669c14b7 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 08:54:57 -0600 Subject: [PATCH 31/39] pod back to job --- .github/manifests/disk-usage-report-job.yaml | 54 +++++++++++--------- .github/workflows/report.yaml | 9 ++-- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 81ff08d..387aead 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -1,30 +1,34 @@ -apiVersion: v1 -kind: Pod +apiVersion: v1/batch +kind: Job metadata: name: disk-usage-report-job namespace: jupyterhub spec: - containers: - - name: disk-usage-report - image: IMAGE_PLACEHOLDER - args: - - "/home/asmacdo" - volumeMounts: + template: + metadata: + labels: + app: disk-usage-report + spec: + containers: + - name: disk-usage-report + image: IMAGE_PLACEHOLDER + args: + - "/home/asmacdo" + volumeMounts: + - name: persistent-storage + mountPath: "/home" + subPath: "home" + restartPolicy: Never + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + volumes: - name: persistent-storage - mountPath: "/home" - subPath: "home" - restartPolicy: Never - nodeSelector: - NodeGroupType: default - NodePool: default - hub.jupyter.org/node-purpose: user - tolerations: - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - volumes: - - name: persistent-storage - persistentVolumeClaim: - claimName: efs-persist - + persistentVolumeClaim: + claimName: 
efs-persist diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 34241ee..1c2692a 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -73,26 +73,27 @@ jobs: run: | sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml - - name: Deploy Disk Usage Report Job Pod + - name: Deploy Disk Usage Report Job run: | kubectl apply -f .github/manifests/disk-usage-report-job.yaml # TODO should timeout be longer? - name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=360s -n jupyterhub + kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub continue-on-error: true - name: Retrieve generated report file run: | - kubectl cp disk-usage-report-job:/output/du_report.json du_report.json -n jupyterhub + POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub cat du_report.json continue-on-error: true # continue-on-error for previous steps so we delete the job - name: Delete Disk Usage Report Job run: | - kubectl delete pod disk-usage-report-job -n jupyterhub + kubectl delete job disk-usage-report-job -n jupyterhub # - name: Clone dandi-hub-usage-reports repository # run: | From c085751ef1440b4af9ce6c61af48c266a1db27e5 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 09:18:16 -0600 Subject: [PATCH 32/39] Fixup use the correct job api --- .github/manifests/disk-usage-report-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 387aead..ed2bcde 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -1,4 
+1,4 @@ -apiVersion: v1/batch +apiVersion: batch/v1 kind: Job metadata: name: disk-usage-report-job From 3e18a379b1f13835913c7726e48dee35af92a429 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 09:27:49 -0600 Subject: [PATCH 33/39] Add namespace to pod retrieval --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 1c2692a..0cbe18d 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -85,7 +85,7 @@ jobs: - name: Retrieve generated report file run: | - POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + POD_NAME=$(kubectl get pods -n jupyterhub --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub cat du_report.json continue-on-error: true From 0fa5ece95edf5a7e3b82fb5cf0b5ce2fe90835e0 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:38:43 -0600 Subject: [PATCH 34/39] write directly to pv to test job --- .github/scripts/du.py | 19 +++++++++++-------- .github/workflows/report.yaml | 7 ------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 0b2ceb7..1c3a403 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -4,8 +4,9 @@ import subprocess import sys import json +from datetime import date -OUTPUT_FILE = "/output/du_report.json" +OUTPUT_DIR = "/home/asmacdo/du_reports/" SIZE_THRESHOLD_GB = 1 SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * 1024 * 1024 * 1024 @@ -41,11 +42,12 @@ def prepare_report(directory): for user, data in report.items(): data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) - os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) - with open(OUTPUT_FILE, 'w') as f: - json.dump(report, f, 
indent=4) - - print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") + # os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) + # output_file = + # with open(OUTPUT_FILE, 'w') as f: + # json.dump(report, f, indent=4) + # + # print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") if __name__ == "__main__": @@ -55,7 +57,8 @@ def prepare_report(directory): path = sys.argv[1] directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] - os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) - with open(OUTPUT_FILE, 'w') as f: + os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True) + current_date = date.today().strftime('%Y-%m-%d') + with open(f"OUTPUT_DIR/{current_date}.json", "w") as f: f.write("\n".join(directories)) # prepare_report(directory) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 0cbe18d..acb9dc1 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -83,13 +83,6 @@ jobs: kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub continue-on-error: true - - name: Retrieve generated report file - run: | - POD_NAME=$(kubectl get pods -n jupyterhub --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub - cat du_report.json - continue-on-error: true - # continue-on-error for previous steps so we delete the job - name: Delete Disk Usage Report Job run: | From e1ecbc33b8b820e68d3fc3d3242cebd4137a4b35 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:47:13 -0600 Subject: [PATCH 35/39] fixup script fstring --- .github/scripts/du.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 1c3a403..05043ef 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -59,6 +59,6 @@ def prepare_report(directory): 
os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True) current_date = date.today().strftime('%Y-%m-%d') - with open(f"OUTPUT_DIR/{current_date}.json", "w") as f: + with open(f"{OUTPUT_DIR}/{current_date}.json", "w") as f: f.write("\n".join(directories)) # prepare_report(directory) From 082d3ccfe348885d7adfaeec4bbfb2682c7af4c3 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:47:37 -0600 Subject: [PATCH 36/39] no retry on failure, we were spinning up 5 pods, lets just fail 1 time --- .github/manifests/disk-usage-report-job.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index ed2bcde..b487280 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -9,6 +9,7 @@ spec: labels: app: disk-usage-report spec: + backoffLimit: 0 # No retry on failure containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER From d46ea446adaffc63c6cb14b4ea1f33ba660467dc Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:49:22 -0600 Subject: [PATCH 37/39] Fixup backup limit job not template --- .github/manifests/disk-usage-report-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index b487280..adf966e 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -4,12 +4,12 @@ metadata: name: disk-usage-report-job namespace: jupyterhub spec: + backoffLimit: 0 # No retry on failure template: metadata: labels: app: disk-usage-report spec: - backoffLimit: 0 # No retry on failure containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER From 965a81eb6374077f8657066761f8808c7e7d08f3 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:52:54 -0600 Subject: [PATCH 38/39] Initial report --- 
.github/manifests/disk-usage-report-job.yaml | 2 +- .github/scripts/du.py | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index adf966e..488735e 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -14,7 +14,7 @@ spec: - name: disk-usage-report image: IMAGE_PLACEHOLDER args: - - "/home/asmacdo" + - "/home/" volumeMounts: - name: persistent-storage mountPath: "/home" diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 05043ef..e626769 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -42,12 +42,11 @@ def prepare_report(directory): for user, data in report.items(): data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) - # os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) - # output_file = - # with open(OUTPUT_FILE, 'w') as f: - # json.dump(report, f, indent=4) - # - # print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") + os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True) + current_date = date.today().strftime('%Y-%m-%d') + with open(f"{OUTPUT_DIR}/{current_date}.json", "w") as f: + json.dump(report, f, indent=4) + print(f"Disk usage report generated at {os.path.abspath(os.path.join(OUTPUT_DIR, current_date + '.json'))}") if __name__ == "__main__": @@ -55,10 +54,4 @@ def prepare_report(directory): print("Usage: du.py ") else: path = sys.argv[1] - directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] - - os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True) - current_date = date.today().strftime('%Y-%m-%d') - with open(f"{OUTPUT_DIR}/{current_date}.json", "w") as f: - f.write("\n".join(directories)) - # prepare_report(directory) + prepare_report(path) From 7366d2d318a2b38496d552fcb050be8c86914155 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 12:35:01 -0600 Subject:
[PATCH 39/39] disable report see PR for comment --- .github/manifests/disk-usage-report-job.yaml | 2 +- .github/workflows/report.yaml | 208 +++++++++---------- 2 files changed, 105 insertions(+), 105 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 488735e..161f177 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: disk-usage-report - image: IMAGE_PLACEHOLDER + image: dandiarchive/dandihub-report-generator:latest args: - "/home/" volumeMounts: diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index acb9dc1..8424fd6 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -1,104 +1,104 @@ -name: Generate Data Usage Report - -on: - pull_request: - branches: - - main - -jobs: - generate_data_usage_report: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Log in to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and push Docker image - uses: docker/build-push-action@v3 - with: - context: . 
- file: images/Dockerfile.dandihub_report_generator - push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v3 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-2 - - - name: Assume ProvisioningRole - run: | - CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") - export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') - export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - - name: Configure kubectl with AWS EKS - run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} - - # TODO remove - - name: Sanity check - run: | - kubectl get pods -n jupyterhub - - # - name: Deploy Hello World Pod - # run: | - # kubectl apply -f .github/manifests/hello-world-pod.yaml - # - # - name: Wait for Hello World Pod to complete - # run: | - # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes - # continue-on-error: true # Allow the workflow to continue even if this step fails - # - # - name: Get Hello World Pod logs - # run: | - # kubectl logs hello-world-pod - # if: ${{ success() }} # Only run this step if the previous step was successful - # - # - name: Delete Hello World Pod - # run: | - # kubectl delete pod hello-world-pod - # if: ${{ always() }} # Always run this step, even if other steps fail - # - - name: Replace image placeholder in manifest - run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml - - - name: Deploy Disk Usage Report Job - run: | - kubectl apply -f 
.github/manifests/disk-usage-report-job.yaml - - # TODO should timeout be longer? - - name: Wait for Disk Usage Report Job to complete - run: | - kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub - continue-on-error: true - - # continue-on-error for previous steps so we delete the job - - name: Delete Disk Usage Report Job - run: | - kubectl delete job disk-usage-report-job -n jupyterhub - - # - name: Clone dandi-hub-usage-reports repository - # run: | - # git clone https://github.com/dandi/dandi-hub-usage-reports.git - # - # - name: Copy report file to repository, commit and push report - # run: | - # cd dandi-hub-usage-reports - # DATE=$(date +'%Y-%m-%d') - # mv ../du_report.json $DATE_du_report.json - # git config --global user.name "GitHub Actions" - # git config --global user.email "actions@github.com" - # git add $DATE_du_report.json - # git commit -m "Add disk usage report for $DATE" - # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git +# name: Generate Data Usage Report +# +# on: +# pull_request: +# branches: +# - main +# +# jobs: +# generate_data_usage_report: +# runs-on: ubuntu-latest +# +# steps: +# - name: Checkout code +# uses: actions/checkout@v3 +# +# - name: Log in to DockerHub +# uses: docker/login-action@v2 +# with: +# username: ${{ secrets.DOCKERHUB_USERNAME }} +# password: ${{ secrets.DOCKERHUB_TOKEN }} +# +# - name: Build and push Docker image +# uses: docker/build-push-action@v3 +# with: +# context: . 
+# file: images/Dockerfile.dandihub_report_generator +# push: true +# tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest +# +# - name: Configure AWS credentials +# uses: aws-actions/configure-aws-credentials@v3 +# with: +# aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} +# aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# aws-region: us-east-2 +# +# - name: Assume ProvisioningRole +# run: | +# CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") +# export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') +# export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') +# export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') +# +# - name: Configure kubectl with AWS EKS +# run: | +# aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} +# +# # TODO remove +# - name: Sanity check +# run: | +# kubectl get pods -n jupyterhub +# +# # - name: Deploy Hello World Pod +# # run: | +# # kubectl apply -f .github/manifests/hello-world-pod.yaml +# # +# # - name: Wait for Hello World Pod to complete +# # run: | +# # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes +# # continue-on-error: true # Allow the workflow to continue even if this step fails +# # +# # - name: Get Hello World Pod logs +# # run: | +# # kubectl logs hello-world-pod +# # if: ${{ success() }} # Only run this step if the previous step was successful +# # +# # - name: Delete Hello World Pod +# # run: | +# # kubectl delete pod hello-world-pod +# # if: ${{ always() }} # Always run this step, even if other steps fail +# # +# - name: Replace image placeholder in manifest +# run: | +# sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml +# +# - name: Deploy Disk 
Usage Report Job +# run: | +# kubectl apply -f .github/manifests/disk-usage-report-job.yaml +# +# # TODO should timeout be longer? +# - name: Wait for Disk Usage Report Job to complete +# run: | +# kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub +# continue-on-error: true +# +# # continue-on-error for previous steps so we delete the job +# - name: Delete Disk Usage Report Job +# run: | +# kubectl delete job disk-usage-report-job -n jupyterhub +# +# # - name: Clone dandi-hub-usage-reports repository +# # run: | +# # git clone https://github.com/dandi/dandi-hub-usage-reports.git +# # +# # - name: Copy report file to repository, commit and push report +# # run: | +# # cd dandi-hub-usage-reports +# # DATE=$(date +'%Y-%m-%d') +# # mv ../du_report.json ${DATE}_du_report.json +# # git config --global user.name "GitHub Actions" +# # git config --global user.email "actions@github.com" +# # git add ${DATE}_du_report.json +# # git commit -m "Add disk usage report for $DATE" +# # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git