From eb82fb24150d3f8b29651a60eed1cdd706fc019a Mon Sep 17 00:00:00 2001
From: greg pereira
Date: Fri, 17 May 2024 08:20:10 -0700
Subject: [PATCH] adding tmate session and documentation around it

Signed-off-by: greg pereira
---
 .github/workflows/README.md  | 52 ++++++++++++++++++++++++++++++++++++
 .github/workflows/build.yml  |  7 +++++
 .github/workflows/images.yml | 40 +++++++++++++++++++++++++++
 .github/workflows/qa-ec2.yml | 16 +++++++++++
 4 files changed, 115 insertions(+)
 create mode 100644 .github/workflows/README.md

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
new file mode 100644
index 00000000..523649dd
--- /dev/null
+++ b/.github/workflows/README.md
@@ -0,0 +1,52 @@
+# Workflow Docs
+
+## Tmate action
+
+The following is a rundown of the tmate action used in most of the workflows. Its structure looks something like this:
+
+```yaml
+  - name: Setup tmate session
+    if: ${{ failure() }}
+    uses: mxschmitt/action-tmate@v3.18
+    timeout-minutes: 15
+    with:
+      detached: false
+      limit-access-to-actor: true
+```
+
+While it may seem obvious to some, it is important to note that the workflow will not complete until the tmate action step completes.
+Since we have concurrency set on most of our workflows, this means that if you push another run of the workflow, you must first close your SSH session.
+More information on this is available in the following section [When / Why does the action step close](./README.md#when--why-does-the-action-step-close).
+It is for this reason that this may not be useful in every situation.
+
+### When / Why does the action step close?
+
+This action will wait for one of two cases, the first of which is connection close. The SSH session only supports a single connection:
+if you SSH in and close the connection, the action step will close and the workflow will proceed, even if the `timeout-minutes` window has not elapsed.
+Note also that, as it only supports a single connection, only one person can SSH into the tmate session; others will be rejected.
+The second condition is that the `timeout-minutes` elapse, in which case the action will boot you out of SSH, the session will close and the workflow will continue.
+
+### Configurations
+
+The key values are `timeout-minutes`, `detached` and `limit-access-to-actor`.
+
+#### Detached mode
+
+If the action step is run with `detached: true`, the workflow will proceed to the next action steps unhindered.
+If the workflow finishes before the `timeout-minutes` has elapsed, it will pop open a new action step at the end of the workflow to wait for and clean up the tmate action.
+If the step is instead run with `detached: false`, the workflow will not proceed until the step closes.
+
+#### Limit access to actor
+
+With `limit-access-to-actor` set to `true`, the action looks up who created the PR and grabs the public SSH keys stored in their GitHub account.
+It will reject connections from any SSH private key that does not match the public key listed in the GitHub account.
+This is recommended, as it prevents others from abusing your runners, but may be disabled to allow a teammate to SSH in instead.
+
+### How does this action step work with Terraform / EC2 instances?
+
+This is a great question! It's important to know that there are 2 parallel tracks of CI in this example, the first being GitHub Actions + the Runner,
+and the second being Ansible playbooks, run on the runner but connecting over SSH to an EC2 instance. Imagine that our workflow starts with GitHub Actions,
+which then calls the Ansible playbook and does some stuff on our EC2 over SSH. Imagine then we get to something we want to debug,
+and we open a `detached` SSH session. Since it is detached, the workflow will proceed and hit the step to tear down the EC2, making it no longer reachable via SSH.
+For this reason you will probably have to run the Tmate session with `detached: false` and/or add a timeout step to the ansible playbook,
+to make sure you still have something that the runner can SSH into.
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 779b53a8..c680797d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -28,6 +28,13 @@ jobs:
         run: |
           go build -o "worker_$(go env GOOS)_${GOARCH}" main.go
           echo bin="worker_$(go env GOOS)_${GOARCH}" >> "$GITHUB_OUTPUT"
         working-directory: ./worker
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
       - uses: actions/upload-artifact@v4
         with:
diff --git a/.github/workflows/images.yml b/.github/workflows/images.yml
index ccb7f35c..9cf3c31f 100644
--- a/.github/workflows/images.yml
+++ b/.github/workflows/images.yml
@@ -48,6 +48,14 @@ jobs:
           cache-to: type=gha,mode=max
           file: gobot/Containerfile
 
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
+
   push_to_registries_ui:
     name: Push UI container image to GHCR
     runs-on: ubuntu-latest
@@ -90,6 +98,14 @@ jobs:
           cache-to: type=gha,mode=max
           file: ui/Containerfile
 
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
+
   push_to_registries_apiserver:
     name: Push apiserver container image to GHCR
     runs-on: ubuntu-latest
@@ -132,6 +148,14 @@ jobs:
           cache-to: type=gha,mode=max
           file: ui/apiserver/Containerfile
 
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
+
   push_to_registries_serve:
     name: Push serve container image to GHCR
     runs-on: ubuntu-latest
@@ -188,6 +212,14 @@ jobs:
           cache-to: type=gha,mode=max
           file: worker/Containerfile
 
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
+
   push_to_registries_serve_base:
     name: Push serve base container image to GHCR
     runs-on: ubuntu-latest
@@ -243,3 +275,11 @@ jobs:
           cache-from: type=gha
           cache-to: type=gha,mode=max
           file: worker/Containerfile.servebase
+
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
diff --git a/.github/workflows/qa-ec2.yml b/.github/workflows/qa-ec2.yml
index 17a9678e..0a91ec80 100644
--- a/.github/workflows/qa-ec2.yml
+++ b/.github/workflows/qa-ec2.yml
@@ -62,6 +62,14 @@ jobs:
             --vault-password-file ansible_vault_password_file \
             deploy/ansible/qa/prod/deploy-worker-script.yml
 
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
+
       - name: Terminate EC2 Instances
         if: always()
         run: |
@@ -133,6 +141,14 @@ jobs:
       #     -e "github_token=${BOT_GITHUB_TOKEN}" deploy/ansible/deploy-bot.yml
       #     rm -f ansible_vault_password_file
 
+      - name: Setup tmate session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        timeout-minutes: 15
+        with:
+          detached: false
+          limit-access-to-actor: true
+
       - name: Terminate EC2 Instances
         if: always()
         run: |