Multicluster #11280
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Multicluster | |
# Any change in triggers needs to be reflected in the concurrency group. | |
on: | |
### FOR TESTING PURPOSES | |
# This workflow runs in the context of `main`, and ignores changes to | |
# workflow files in PRs. For testing changes to this workflow from a PR: | |
# - Make sure the PR uses a branch from the base repository (requires write | |
# privileges). It will not work with a branch from a fork (missing secrets). | |
# - Uncomment the `pull_request` event below, commit separately with a `DO | |
# NOT MERGE` message, and push to the PR. As long as the commit is present, | |
# any push to the PR will trigger this workflow. | |
# - Don't forget to remove the `DO NOT MERGE` commit once satisfied. The run | |
# will disappear from the PR checks: please provide a direct link to the | |
# successful workflow run (can be found from Actions tab) in a comment. | |
# | |
# pull_request: {} | |
### | |
pull_request_target: {} | |
# Run every 6 hours | |
schedule: | |
- cron: '0 3/6 * * *' | |
# By specifying the access of one of the scopes, all of those that are not | |
# specified are set to 'none'. | |
permissions: | |
# To be able to access the repository with actions/checkout | |
contents: read | |
# To be able to request the JWT from GitHub's OIDC provider | |
id-token: write | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.event.pull_request.number || 'scheduled' }} | |
cancel-in-progress: true | |
env: | |
zone: us-west2-a | |
# renovate: datasource=github-releases depName=cilium/cilium | |
cilium_version: v1.16.5 | |
kubectl_version: v1.23.6 | |
USE_GKE_GCLOUD_AUTH_PLUGIN: True | |
jobs: | |
installation-and-connectivity: | |
name: Multicluster Installation and Connectivity Test | |
if: ${{ github.repository == 'cilium/cilium-cli' }} | |
runs-on: ubuntu-24.04 | |
timeout-minutes: 45 | |
strategy: | |
fail-fast: false | |
steps: | |
- name: Checkout | |
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
# Note: These names currently approach the limit of 40 characters | |
- name: Set mode-specific names | |
run: | | |
echo "clusterNameBase=${{ github.event.repository.name }}-${{ github.run_id }}-${{ github.run_attempt }}" >> $GITHUB_ENV | |
echo "clusterName1=${{ github.event.repository.name }}-${{ github.run_id }}-${{ github.run_attempt }}-1" >> $GITHUB_ENV | |
echo "clusterName2=${{ github.event.repository.name }}-${{ github.run_id }}-${{ github.run_attempt }}-2" >> $GITHUB_ENV | |
echo "firewallRuleName=${{ github.event.repository.name }}-${{ github.run_id }}-${{ github.run_attempt }}" >> $GITHUB_ENV | |
- name: Install kubectl | |
run: | | |
curl -sLO "https://dl.k8s.io/release/${{ env.kubectl_version }}/bin/linux/amd64/kubectl" | |
curl -sLO "https://dl.k8s.io/${{ env.kubectl_version }}/bin/linux/amd64/kubectl.sha256" | |
echo "$(cat kubectl.sha256) kubectl" | sha256sum --check | |
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl | |
kubectl version --client | |
- name: Set up gcloud credentials | |
id: 'auth' | |
uses: google-github-actions/auth@6fc4af4b145ae7821d527454aa9bd537d1f2dc5f # v2.1.7 | |
with: | |
workload_identity_provider: ${{ secrets.GCP_PR_WORKLOAD_IDENTITY_PROVIDER }} | |
service_account: ${{ secrets.GCP_PR_SA_CLI }} | |
create_credentials_file: true | |
export_environment_variables: true | |
- name: Set up gcloud CLI | |
uses: google-github-actions/setup-gcloud@6189d56e4096ee891640bb02ac264be376592d6a # v2.1.2 | |
with: | |
project_id: ${{ secrets.GCP_PR_PROJECT_ID }} | |
version: "405.0.0" | |
- name: Install gke-gcloud-auth-plugin | |
run: | | |
gcloud components install gke-gcloud-auth-plugin | |
- name: Display gcloud CLI info | |
run: | | |
gcloud info | |
- name: Set up Go | |
uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 | |
with: | |
# renovate: datasource=golang-version depName=go | |
go-version: 1.23.4 | |
- name: Set up job variables | |
id: vars | |
run: | | |
if [ ${{ github.event.issue.pull_request || github.event.pull_request }} ]; then | |
PR_API_JSON=$(curl \ | |
-H "Accept: application/vnd.github.v3+json" \ | |
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ | |
${{ github.event.issue.pull_request.url || github.event.pull_request.url }}) | |
SHA=$(echo "$PR_API_JSON" | jq -r ".head.sha") | |
OWNER=$(echo "$PR_API_JSON" | jq -r ".number") | |
else | |
SHA=${{ github.sha }} | |
OWNER=${{ github.sha }} | |
fi | |
echo "sha=${SHA}" >> $GITHUB_OUTPUT | |
echo "owner=${OWNER}" >> $GITHUB_OUTPUT | |
- name: Create GKE cluster 2 | |
run: | | |
gcloud container clusters create ${{ env.clusterName2 }} \ | |
--labels "usage=${{ github.repository_owner }}-${{ github.event.repository.name }},owner=${{ steps.vars.outputs.owner }}" \ | |
--zone ${{ env.zone }} \ | |
--enable-ip-alias \ | |
--create-subnetwork="range=/26" \ | |
--cluster-ipv4-cidr="/21" \ | |
--services-ipv4-cidr="/24" \ | |
--image-type COS_CONTAINERD \ | |
--num-nodes 2 \ | |
--machine-type e2-custom-2-4096 \ | |
--disk-type pd-standard \ | |
--disk-size 20GB \ | |
--node-taints node.cilium.io/agent-not-ready=true:NoExecute \ | |
--preemptible \ | |
--async | |
- name: Create GKE cluster 1 | |
run: | | |
gcloud container clusters create ${{ env.clusterName1 }} \ | |
--labels "usage=${{ github.repository_owner }}-${{ github.event.repository.name }},owner=${{ steps.vars.outputs.owner }}" \ | |
--zone ${{ env.zone }} \ | |
--enable-ip-alias \ | |
--create-subnetwork="range=/26" \ | |
--cluster-ipv4-cidr="/21" \ | |
--services-ipv4-cidr="/24" \ | |
--image-type COS_CONTAINERD \ | |
--num-nodes 2 \ | |
--machine-type e2-custom-2-4096 \ | |
--disk-type pd-standard \ | |
--disk-size 20GB \ | |
--node-taints node.cilium.io/agent-not-ready=true:NoExecute \ | |
--preemptible \ | |
--async | |
- name: Wait for clusters to be provisioned | |
run: | | |
while [ "$(gcloud container operations list --filter="status=RUNNING AND targetLink~${{ env.clusterNameBase }}" --format="value(name)")" ];do | |
echo "cluster has an ongoing operation, waiting for all operations to finish"; sleep 10 | |
done | |
- name: Get cluster credentials and save context names | |
id: contexts | |
run: | | |
gcloud container clusters get-credentials ${{ env.clusterName1 }} --zone ${{ env.zone }} | |
gcloud container clusters get-credentials ${{ env.clusterName2 }} --zone ${{ env.zone }} | |
CLUSTER1=$(kubectl config view | grep "${{ env.clusterName1 }}" | head -1 | awk '{print $2}') | |
CLUSTER2=$(kubectl config view | grep "${{ env.clusterName2 }}" | head -1 | awk '{print $2}') | |
echo "cluster1=${CLUSTER1}" >> $GITHUB_OUTPUT | |
echo "cluster2=${CLUSTER2}" >> $GITHUB_OUTPUT | |
- name: Allow cross-cluster traffic | |
run: | | |
TAG1=$(gcloud compute firewall-rules list --filter="name~^gke-${{ env.clusterName1 }}-[0-9a-z]*-all$" --format="value(name)") | |
TAG2=$(gcloud compute firewall-rules list --filter="name~^gke-${{ env.clusterName2 }}-[0-9a-z]*-all$" --format="value(name)") | |
gcloud compute firewall-rules describe $TAG1 | |
gcloud compute firewall-rules describe $TAG2 | |
gcloud compute firewall-rules create ${{ env.firewallRuleName }} --allow tcp,udp,icmp,sctp,esp,ah --priority=999 --source-ranges=10.0.0.0/9 --target-tags=${TAG1/-all/-node},${TAG2/-all/-node} | |
gcloud compute firewall-rules describe ${{ env.firewallRuleName }} | |
- name: Install Cilium CLI | |
uses: ./ | |
with: | |
skip-build: 'true' | |
image-tag: ${{ steps.vars.outputs.sha }} | |
- name: Install Cilium and run tests | |
timeout-minutes: 60 | |
run: | | |
# Install Cilium in cluster1 | |
cilium install \ | |
--version "${{ env.cilium_version }}" \ | |
--context "${{ steps.contexts.outputs.cluster1 }}" \ | |
--set loadBalancer.l7.backend=envoy \ | |
--set tls.secretsBackend=k8s \ | |
--set cluster.name="${{ env.clusterName1 }}" \ | |
--set cluster.id=1 \ | |
--set bpf.monitorAggregation=none \ | |
--set ipv4NativeRoutingCIDR=10.0.0.0/9 \ | |
--set hubble.eventQueueSize=65536 | |
# Copy the CA cert from cluster1 to cluster2 | |
kubectl --context ${{ steps.contexts.outputs.cluster1 }} get secrets -n kube-system cilium-ca -oyaml \ | |
| kubectl --context ${{ steps.contexts.outputs.cluster2 }} apply -f - | |
# This seeds all CAs in cluster2 due to logic in the helm chart found here, e.g. for Hubble | |
# https://github.com/cilium/cilium/blob/8b6aa6eda91927275ae722ac020deeb5a9ce479d/install/kubernetes/cilium/templates/hubble/tls-helm/_helpers.tpl#L24-L33 | |
# Install Cilium in cluster2 | |
cilium install \ | |
--version "${{ env.cilium_version }}" \ | |
--context "${{ steps.contexts.outputs.cluster2 }}" \ | |
--set loadBalancer.l7.backend=envoy \ | |
--set tls.secretsBackend=k8s \ | |
--set cluster.name="${{ env.clusterName2 }}" \ | |
--set cluster.id=2 \ | |
--set bpf.monitorAggregation=none \ | |
--set ipv4NativeRoutingCIDR=10.0.0.0/9 \ | |
--set hubble.eventQueueSize=65536 | |
# Enable Relay | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" hubble enable | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" hubble enable --relay=false | |
# Wait for cilium and hubble relay to be ready | |
# NB: necessary to work against occassional flakes due to https://github.com/cilium/cilium-cli/issues/918 | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" status --wait | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" status --wait | |
# Enable cluster mesh | |
# Test autodetection of service parameters for GKE | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" clustermesh enable | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" clustermesh enable | |
# Wait for cluster mesh status to be ready | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" clustermesh status --wait | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" clustermesh status --wait | |
# Print clustermesh Service annotations | |
printf "Service annotations for Cluster 1 %s\n" \ | |
$(kubectl --context "${{ steps.contexts.outputs.cluster1 }}" get svc -n kube-system clustermesh-apiserver -o jsonpath='{.metadata.annotations}') | |
printf "Service annotations for Cluster 2 %s\n" \ | |
$(kubectl --context "${{ steps.contexts.outputs.cluster2 }}" get svc -n kube-system clustermesh-apiserver -o jsonpath='{.metadata.annotations}') | |
# Connect clusters | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" clustermesh connect --destination-context "${{ steps.contexts.outputs.cluster2 }}" | |
# Wait for cluster mesh status to be ready | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" clustermesh status --wait | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" clustermesh status --wait | |
# Port forward Relay | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" hubble port-forward& | |
sleep 10s | |
nc -nvz 127.0.0.1 4245 | |
# Run connectivity test | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" connectivity test --test-concurrency=5 \ | |
--multi-cluster "${{ steps.contexts.outputs.cluster2 }}" --test '!/*-deny,!/pod-to-.*-nodeport' \ | |
--all-flows --collect-sysdump-on-failure --external-target google.com. \ | |
--log-check-levels error # don't check for warnings in logs, TODO: address warnings and include check again | |
- name: Post-test information gathering | |
if: ${{ !success() }} | |
run: | | |
echo "=== Retrieve cluster1 state ===" | |
kubectl --context "${{ steps.contexts.outputs.cluster1 }}" get pods --all-namespaces -o wide | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" status | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" clustermesh status | |
cilium --context "${{ steps.contexts.outputs.cluster1 }}" sysdump --output-filename cilium-sysdump-cluster1 | |
echo "=== Retrieve cluster2 state ===" | |
kubectl --context "${{ steps.contexts.outputs.cluster2 }}" get pods --all-namespaces -o wide | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" status | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" clustermesh status | |
cilium --context "${{ steps.contexts.outputs.cluster2 }}" sysdump --output-filename cilium-sysdump-cluster2 | |
shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently | |
- name: Clean up GKE | |
if: ${{ always() }} | |
run: | | |
while [ "$(gcloud container operations list --filter="status=RUNNING AND targetLink~${{ env.clusterNameBase }}" --format="value(name)")" ];do | |
echo "cluster has an ongoing operation, waiting for all operations to finish"; sleep 15 | |
done | |
gcloud container clusters delete ${{ env.clusterName1 }} --zone ${{ env.zone }} --quiet --async | |
gcloud container clusters delete ${{ env.clusterName2 }} --zone ${{ env.zone }} --quiet --async | |
gcloud compute firewall-rules delete ${{ env.firewallRuleName }} --quiet | |
shell: bash {0} # Disable default fail-fast behavior so that all commands run independently | |
- name: Upload artifacts | |
if: ${{ !success() }} | |
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 | |
with: | |
name: cilium-sysdump-out.zip | |
path: | | |
cilium-sysdump-cluster1.zip | |
cilium-sysdump-cluster2.zip | |
retention-days: 5 |