From 395bfb9abf83d48c0c83fdb07994cb20f592c8be Mon Sep 17 00:00:00 2001 From: Future Outlier Date: Sun, 15 Oct 2023 09:57:51 +0800 Subject: [PATCH 01/13] first version of sandbox gpu image Signed-off-by: Future Outlier --- docker/sandbox-bundled/Dockerfile.gpu | 81 +++++++++++++++++++ docker/sandbox-bundled/Makefile | 15 ++++ .../bin/k3d-entrypoint-gpu-check.sh | 15 ++++ docker/sandbox-bundled/config.toml.tmpl | 55 +++++++++++++ .../kustomize/gpu-operator.yaml | 9 +++ .../manifests/complete-agent.yaml | 4 +- .../sandbox-bundled/manifests/complete.yaml | 13 ++- docker/sandbox-bundled/manifests/dev.yaml | 4 +- 8 files changed, 190 insertions(+), 6 deletions(-) create mode 100644 docker/sandbox-bundled/Dockerfile.gpu create mode 100644 docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh create mode 100644 docker/sandbox-bundled/config.toml.tmpl create mode 100644 docker/sandbox-bundled/kustomize/gpu-operator.yaml diff --git a/docker/sandbox-bundled/Dockerfile.gpu b/docker/sandbox-bundled/Dockerfile.gpu new file mode 100644 index 0000000000..50fe741cfa --- /dev/null +++ b/docker/sandbox-bundled/Dockerfile.gpu @@ -0,0 +1,81 @@ +# syntax=docker/dockerfile:1.4-labs + +FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder + +ARG TARGETARCH +ENV TARGETARCH "${TARGETARCH}" + +WORKDIR /build + +COPY images/manifest.txt images/preload ./ +RUN --security=insecure ./preload manifest.txt + +FROM --platform=${BUILDPLATFORM} golang:1.19-bullseye AS bootstrap + +ARG TARGETARCH +ENV CGO_ENABLED 0 +ENV GOARCH "${TARGETARCH}" +ENV GOOS linux + +WORKDIR /flyteorg/build +COPY bootstrap/go.mod bootstrap/go.sum ./ +RUN go mod download +COPY bootstrap/ ./ +RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/root/go/pkg/mod \ + go build -o dist/flyte-sandbox-bootstrap cmd/bootstrap/main.go +# syntax=docker/dockerfile:1.4-labs + +#Following +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +RUN apt-get update && \ + apt-get -y install gnupg2 curl lsb-release && \ + apt-get clean + +# Install NVIDIA Container Runtime +RUN curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add - +RUN curl -s -L https://nvidia.github.io/nvidia-container-runtime/ubuntu20.04/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list +RUN apt-get update && \ + apt-get -y install nvidia-docker2 && \ + apt-get clean + +# Install crictl +ENV CRICTL_VERSION="v1.26.0" +RUN curl -L https://github.com/kubernetes-sigs/cri-tools/releases/download/$CRICTL_VERSION/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz --output crictl-${CRICTL_VERSION}-linux-amd64.tar.gz +RUN tar zxvf crictl-$CRICTL_VERSION-linux-amd64.tar.gz -C /usr/local/bin +RUN rm -f crictl-$CRICTL_VERSION-linux-amd64.tar.gz + +# Install k3s +RUN curl -s -L https://github.com/k3s-io/k3s/releases/download/v1.24.9+k3s1/k3s > /usr/bin/k3s +RUN chmod u+x /usr/bin/k3s +RUN echo "alias kubectl='k3s kubectl'" >> /root/.bashrc + +# Setup containerd for nvidia +COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl +ENV CRI_CONFIG_FILE="/var/lib/rancher/k3s/agent/etc/crictl.yaml" + +# ENV that signals this container should have gpu enabled +ENV FLYTE_GPU "ENABLED" + +ARG TARGETARCH + +ARG FLYTE_SANDBOX_VERSION +ENV FLYTE_SANDBOX_VERSION "${FLYTE_SANDBOX_VERSION}" + +COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/ +COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/ +COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/ +COPY bin/ /bin/ + +COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/ + +VOLUME /var/lib/kubelet +VOLUME /var/lib/rancher/k3s +VOLUME /var/lib/cni +VOLUME /var/log + + +ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ] +CMD [ "server", "--disable=traefik", "--disable=servicelb" ] \ No newline at end of file diff --git a/docker/sandbox-bundled/Makefile b/docker/sandbox-bundled/Makefile index 9ae4197673..1f56539adf 100644 --- a/docker/sandbox-bundled/Makefile +++ b/docker/sandbox-bundled/Makefile @@ -33,6 +33,11 @@ manifests: --load-restrictor=LoadRestrictionsNone \ kustomize/complete-agent > manifests/complete-agent.yaml +.PHONY: manifests-gpu +manifests-gpu: manifests + cat kustomize/gpu-operator.yaml >> manifests/complete.yaml + + .PHONY: build build: flyte manifests [ -n "$(shell docker buildx ls | awk '/^flyte-sandbox / {print $$1}')" ] || \ @@ -43,6 +48,16 @@ build: flyte manifests docker buildx build --builder flyte-sandbox --allow security.insecure --load \ --tag flyte-sandbox:latest . +.PHONY: build-gpu +build-gpu: flyte manifests-gpu + [ -n "$(shell docker buildx ls | awk '/^flyte-sandbox / {print $$1}')" ] || \ + docker buildx create --name flyte-sandbox \ + --driver docker-container --driver-opt image=moby/buildkit:master \ + --buildkitd-flags '--allow-insecure-entitlement security.insecure' \ + --platform linux/arm64,linux/amd64 + docker buildx build --builder flyte-sandbox --allow security.insecure --load \ + --tag flyte-sandbox-gpu:latest -f Dockerfile.gpu . + # Port map # 6443 - k8s API server # 30000 - Docker Registry diff --git a/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh b/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh new file mode 100644 index 0000000000..0de460b809 --- /dev/null +++ b/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +if [ -n "${FLYTE_GPU}" ]; then + echo "GPU Enabled - checking if it's available" + nvidia-smi + if [ $? -eq 0 ]; then + echo "nvidia-smi working" + else + >&2 echo "NVIDIA not available, enable it in docker like so: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html" + exit 255 + fi + +else + echo "GPU not enabled" +fi \ No newline at end of file diff --git a/docker/sandbox-bundled/config.toml.tmpl b/docker/sandbox-bundled/config.toml.tmpl new file mode 100644 index 0000000000..4d5c7fa4ca --- /dev/null +++ b/docker/sandbox-bundled/config.toml.tmpl @@ -0,0 +1,55 @@ +[plugins.opt] + path = "{{ .NodeConfig.Containerd.Opt }}" + +[plugins.cri] + stream_server_address = "127.0.0.1" + stream_server_port = "10010" + +{{- if .IsRunningInUserNS }} + disable_cgroup = true + disable_apparmor = true + restrict_oom_score_adj = true +{{end}} + +{{- if .NodeConfig.AgentConfig.PauseImage }} + sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}" +{{end}} + +{{- if not .NodeConfig.NoFlannel }} +[plugins.cri.cni] + bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}" + conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}" +{{end}} + +[plugins.cri.containerd.runtimes.runc] + # ---- changed from 'io.containerd.runc.v2' for GPU support + runtime_type = "io.containerd.runtime.v1.linux" + +# ---- added for GPU support +[plugins.linux] + runtime = "nvidia-container-runtime" + +{{ if .PrivateRegistryConfig }} +{{ if .PrivateRegistryConfig.Mirrors }} +[plugins.cri.registry.mirrors]{{end}} +{{range $k, $v := .PrivateRegistryConfig.Mirrors }} +[plugins.cri.registry.mirrors."{{$k}}"] + endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}] +{{end}} + +{{range $k, $v := .PrivateRegistryConfig.Configs }} +{{ if $v.Auth }} +[plugins.cri.registry.configs."{{$k}}".auth] + {{ if $v.Auth.Username }}username = "{{ $v.Auth.Username }}"{{end}} + {{ if $v.Auth.Password }}password = "{{ $v.Auth.Password }}"{{end}} + {{ if $v.Auth.Auth }}auth = "{{ $v.Auth.Auth }}"{{end}} + {{ if $v.Auth.IdentityToken }}identitytoken = "{{ $v.Auth.IdentityToken }}"{{end}} +{{end}} +{{ if $v.TLS }} +[plugins.cri.registry.configs."{{$k}}".tls] + {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}} + {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}} + {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}} +{{end}} +{{end}} +{{end}} \ No newline at end of file diff --git a/docker/sandbox-bundled/kustomize/gpu-operator.yaml b/docker/sandbox-bundled/kustomize/gpu-operator.yaml new file mode 100644 index 0000000000..8b226ce694 --- /dev/null +++ b/docker/sandbox-bundled/kustomize/gpu-operator.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: nvidia-device-plugin + namespace: kube-system +spec: + chart: nvidia-device-plugin + repo: https://nvidia.github.io/k8s-device-plugin \ No newline at end of file diff --git a/docker/sandbox-bundled/manifests/complete-agent.yaml b/docker/sandbox-bundled/manifests/complete-agent.yaml index b18c5ac84e..34abf885f8 100644 --- a/docker/sandbox-bundled/manifests/complete-agent.yaml +++ b/docker/sandbox-bundled/manifests/complete-agent.yaml @@ -816,7 +816,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: R2JRWFVRYThnRFVLbHpuSA== + haSharedSecret: QWJLV3ZzNTR1NkswcUxqTg== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1409,7 +1409,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 0ee1553aec7c03152a0a44e7b1a82985795774412a779f7b607a57e59f42c8ef + checksum/secret: 1de8aec7d2868df37aed86003741abe140c7f1cc9679e09286a333dba2b4523b labels: app: docker-registry release: flyte-sandbox diff --git a/docker/sandbox-bundled/manifests/complete.yaml b/docker/sandbox-bundled/manifests/complete.yaml index d7f6e8b0cc..74c88ee79e 100644 --- a/docker/sandbox-bundled/manifests/complete.yaml +++ b/docker/sandbox-bundled/manifests/complete.yaml @@ -805,7 +805,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: d1l6eWRCOXBJcFhiNEo5QQ== + haSharedSecret: Z2JmOWxNNnpXN0RNU01oMg== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1366,7 +1366,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 2f5b6d46fd3276b5b25c8a537298beb6943b13b0b21900db8b2da23e166f0593 + checksum/secret: 57a7056ad885316655ca974bb2a8fc28874a43fbfbe92ca71dd609c12e32f122 labels: app: docker-registry release: flyte-sandbox @@ -1814,3 +1814,12 @@ spec: updateStrategy: rollingUpdate: {} type: RollingUpdate +--- +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: nvidia-device-plugin + namespace: kube-system +spec: + chart: nvidia-device-plugin + repo: https://nvidia.github.io/k8s-device-plugin \ No newline at end of file diff --git a/docker/sandbox-bundled/manifests/dev.yaml b/docker/sandbox-bundled/manifests/dev.yaml index 4f3f0592e8..d23088130d 100644 --- a/docker/sandbox-bundled/manifests/dev.yaml +++ b/docker/sandbox-bundled/manifests/dev.yaml @@ -499,7 +499,7 @@ metadata: --- apiVersion: v1 data: - haSharedSecret: UkFsUVRMRndZeTNJUVNFSA== + haSharedSecret: clk0bjRyYnlyeHdGYXJ1ag== proxyPassword: "" proxyUsername: "" kind: Secret @@ -933,7 +933,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 25a046ef1aaf34ffb59f7b92554e1cfd0015b9a11f7f165ce06bba31e3bced1b + checksum/secret: 1d10006b870d1f9783708e14d0e8d33b4f12a51c9b5569d539b7e28c15106235 labels: app: docker-registry release: flyte-sandbox From 92afb8cc8e9e967041438bef2ad666a574999591 Mon Sep 17 00:00:00 2001 From: Future Outlier Date: Tue, 24 Oct 2023 16:36:27 +0800 Subject: [PATCH 02/13] yaml Signed-off-by: Future Outlier --- docker/sandbox-bundled/manifests/complete-agent.yaml | 6 +++--- docker/sandbox-bundled/manifests/complete.yaml | 4 ++-- docker/sandbox-bundled/manifests/dev.yaml | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docker/sandbox-bundled/manifests/complete-agent.yaml b/docker/sandbox-bundled/manifests/complete-agent.yaml index 6d61286e76..dd324b15e6 100644 --- a/docker/sandbox-bundled/manifests/complete-agent.yaml +++ b/docker/sandbox-bundled/manifests/complete-agent.yaml @@ -816,7 +816,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: QWJLV3ZzNTR1NkswcUxqTg== + haSharedSecret: UlZsSHR0c2RFMDdkSFJ0Sw== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1409,7 +1409,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 1de8aec7d2868df37aed86003741abe140c7f1cc9679e09286a333dba2b4523b + checksum/secret: ff9d2702885cbb03ce89a5659e44f49b781a55ea62d7b5099ecea38d1781bf53 labels: app: docker-registry release: flyte-sandbox @@ -1744,7 +1744,7 @@ spec: value: minio - name: FLYTE_AWS_SECRET_ACCESS_KEY value: miniostorage - image: ghcr.io/flyteorg/flyteagent:1.9.1 + image: ghcr.io/flyteorg/flyteagent:1.10.0 imagePullPolicy: IfNotPresent name: flyteagent ports: diff --git a/docker/sandbox-bundled/manifests/complete.yaml b/docker/sandbox-bundled/manifests/complete.yaml index 9ddceed3ce..caa0aa3771 100644 --- a/docker/sandbox-bundled/manifests/complete.yaml +++ b/docker/sandbox-bundled/manifests/complete.yaml @@ -805,7 +805,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: Z2JmOWxNNnpXN0RNU01oMg== + haSharedSecret: OVRlcXBXS2NkUTc0czEwQg== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1366,7 +1366,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 57a7056ad885316655ca974bb2a8fc28874a43fbfbe92ca71dd609c12e32f122 + checksum/secret: 17ec194cf72de1676eef76a26280b2056a6b549bb06b25cf333e4b5f62562ab3 labels: app: docker-registry release: flyte-sandbox diff --git a/docker/sandbox-bundled/manifests/dev.yaml b/docker/sandbox-bundled/manifests/dev.yaml index d23088130d..8088da2901 100644 --- a/docker/sandbox-bundled/manifests/dev.yaml +++ b/docker/sandbox-bundled/manifests/dev.yaml @@ -499,7 +499,7 @@ metadata: --- apiVersion: v1 data: - haSharedSecret: clk0bjRyYnlyeHdGYXJ1ag== + haSharedSecret: Ulh6cjlwamRWMDNOeG9ycw== proxyPassword: "" proxyUsername: "" kind: Secret @@ -933,7 +933,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 1d10006b870d1f9783708e14d0e8d33b4f12a51c9b5569d539b7e28c15106235 + checksum/secret: 1cea0c45cc972dd53d56c1dc3279a3a24106ad8488a3f8559a8237b96ee040d9 labels: app: docker-registry release: flyte-sandbox From 5027f4af01a77f09a45d4ea8f19956eb422a4e7f Mon Sep 17 00:00:00 2001 From: Danny Farrell <16297104+danpf@users.noreply.github.com> Date: Wed, 1 Nov 2023 02:39:02 +0000 Subject: [PATCH 03/13] Make gpu sandbox work on linux Signed-off-by: Danny Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/Dockerfile.gpu | 61 +++++------ .../bin/k3d-entrypoint-cgroupv2.sh | 8 +- .../bin/k3d-entrypoint-gpu-check.sh | 0 docker/sandbox-bundled/config.toml.tmpl | 103 ++++++++++++++---- .../device-plugin-daemonset.yaml | 41 +++++++ 5 files changed, 156 insertions(+), 57 deletions(-) mode change 100644 => 100755 docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh create mode 100644 docker/sandbox-bundled/device-plugin-daemonset.yaml diff --git a/docker/sandbox-bundled/Dockerfile.gpu b/docker/sandbox-bundled/Dockerfile.gpu index 50fe741cfa..4f0839186b 100644 --- a/docker/sandbox-bundled/Dockerfile.gpu +++ b/docker/sandbox-bundled/Dockerfile.gpu @@ -1,5 +1,6 @@ # syntax=docker/dockerfile:1.4-labs +###### BUILD FLYTE FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder ARG TARGETARCH @@ -23,59 +24,49 @@ RUN go mod download COPY bootstrap/ ./ RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/root/go/pkg/mod \ go build -o dist/flyte-sandbox-bootstrap cmd/bootstrap/main.go -# syntax=docker/dockerfile:1.4-labs - -#Following -FROM nvidia/cuda:12.1.1-base-ubuntu20.04 -RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections +###### GET K3S +# ARG K3S_TAG=v1.26.4-k3s1 +FROM rancher/k3s:v1.26.4-k3s1 as k3s -RUN apt-get update && \ - apt-get -y install gnupg2 curl lsb-release && \ - apt-get clean +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 -# Install NVIDIA Container Runtime -RUN curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add - -RUN curl -s -L https://nvidia.github.io/nvidia-container-runtime/ubuntu20.04/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list -RUN apt-get update && \ - apt-get -y install nvidia-docker2 && \ - apt-get clean - -# Install crictl ENV CRICTL_VERSION="v1.26.0" -RUN curl -L https://github.com/kubernetes-sigs/cri-tools/releases/download/$CRICTL_VERSION/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz --output crictl-${CRICTL_VERSION}-linux-amd64.tar.gz -RUN tar zxvf crictl-$CRICTL_VERSION-linux-amd64.tar.gz -C /usr/local/bin -RUN rm -f crictl-$CRICTL_VERSION-linux-amd64.tar.gz +ENV FLYTE_GPU "ENABLED" +ARG TARGETARCH -# Install k3s -RUN curl -s -L https://github.com/k3s-io/k3s/releases/download/v1.24.9+k3s1/k3s > /usr/bin/k3s -RUN chmod u+x /usr/bin/k3s -RUN echo "alias kubectl='k3s kubectl'" >> /root/.bashrc +RUN apt-get update \ + && apt-get -y install gnupg2 curl nvidia-container-toolkit \ + && chmod 1777 /tmp \ + && mkdir -p /var/lib/rancher/k3s/agent/etc/containerd \ + && mkdir -p /var/lib/rancher/k3s/server/manifests \ + && curl -L https://github.com/kubernetes-sigs/cri-tools/releases/download/$CRICTL_VERSION/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz --output crictl-${CRICTL_VERSION}-linux-amd64.tar.gz \ + && tar zxvf crictl-$CRICTL_VERSION-linux-amd64.tar.gz -C /usr/local/bin \ + && rm -f crictl-$CRICTL_VERSION-linux-amd64.tar.gz \ + && echo "alias kubectl='k3s kubectl'" >> /root/.bashrc -# Setup containerd for nvidia -COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl -ENV CRI_CONFIG_FILE="/var/lib/rancher/k3s/agent/etc/crictl.yaml" +COPY --from=k3s /bin /bin +COPY --from=k3s /etc /etc -# ENV that signals this container should have gpu enabled -ENV FLYTE_GPU "ENABLED" - -ARG TARGETARCH +# Provide custom containerd configuration to configure the nvidia-container-runtime +COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl -ARG FLYTE_SANDBOX_VERSION -ENV FLYTE_SANDBOX_VERSION "${FLYTE_SANDBOX_VERSION}" +# Deploy the nvidia driver plugin on startup +COPY device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/ +COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/ COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/ COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/ COPY bin/ /bin/ -COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/ - VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s VOLUME /var/lib/cni VOLUME /var/log +ENV PATH="$PATH:/bin/aux" +ENV CRI_CONFIG_FILE=/var/lib/rancher/k3s/agent/etc/crictl.yaml ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ] -CMD [ "server", "--disable=traefik", "--disable=servicelb" ] \ No newline at end of file +CMD [ "server", "--disable=traefik", "--disable=servicelb" ] diff --git a/docker/sandbox-bundled/bin/k3d-entrypoint-cgroupv2.sh b/docker/sandbox-bundled/bin/k3d-entrypoint-cgroupv2.sh index 88cb669ded..d9892decef 100755 --- a/docker/sandbox-bundled/bin/k3d-entrypoint-cgroupv2.sh +++ b/docker/sandbox-bundled/bin/k3d-entrypoint-cgroupv2.sh @@ -14,8 +14,12 @@ if [ -f /sys/fs/cgroup/cgroup.controllers ]; then # move the processes from the root group to the /init group, # otherwise writing subtree_control fails with EBUSY. mkdir -p /sys/fs/cgroup/init - busybox xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + if command -v busybox >/dev/null 2>&1; then + busybox xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + else + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + fi # enable controllers - sed -e 's/ / +/g' -e 's/^/+/' <"/sys/fs/cgroup/cgroup.controllers" >"/sys/fs/cgroup/cgroup.subtree_control" + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers > /sys/fs/cgroup/cgroup.subtree_control echo "[$(date -Iseconds)] [CgroupV2 Fix] Done" fi diff --git a/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh b/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh old mode 100644 new mode 100755 diff --git a/docker/sandbox-bundled/config.toml.tmpl b/docker/sandbox-bundled/config.toml.tmpl index 4d5c7fa4ca..0208836d6f 100644 --- a/docker/sandbox-bundled/config.toml.tmpl +++ b/docker/sandbox-bundled/config.toml.tmpl @@ -1,12 +1,18 @@ -[plugins.opt] - path = "{{ .NodeConfig.Containerd.Opt }}" +version = 2 -[plugins.cri] +[plugins."io.containerd.internal.v1.opt"] + path = "{{ .NodeConfig.Containerd.Opt }}" +[plugins."io.containerd.grpc.v1.cri"] stream_server_address = "127.0.0.1" stream_server_port = "10010" + enable_selinux = {{ .NodeConfig.SELinux }} + enable_unprivileged_ports = {{ .EnableUnprivileged }} + enable_unprivileged_icmp = {{ .EnableUnprivileged }} -{{- if .IsRunningInUserNS }} +{{- if .DisableCgroup}} disable_cgroup = true +{{end}} +{{- if .IsRunningInUserNS }} disable_apparmor = true restrict_oom_score_adj = true {{end}} @@ -15,41 +21,98 @@ sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}" {{end}} +{{- if .NodeConfig.AgentConfig.Snapshotter }} +[plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + snapshotter = "{{ .NodeConfig.AgentConfig.Snapshotter }}" + disable_snapshot_annotations = {{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}false{{else}}true{{end}} +{{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }} +{{ if .NodeConfig.AgentConfig.ImageServiceSocket }} +[plugins."io.containerd.snapshotter.v1.stargz"] +cri_keychain_image_service_path = "{{ .NodeConfig.AgentConfig.ImageServiceSocket }}" +[plugins."io.containerd.snapshotter.v1.stargz".cri_keychain] +enable_keychain = true +{{end}} +{{ if .PrivateRegistryConfig }} +{{ if .PrivateRegistryConfig.Mirrors }} +[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors]{{end}} +{{range $k, $v := .PrivateRegistryConfig.Mirrors }} +[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}"] + endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}] +{{if $v.Rewrites}} + [plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}".rewrite] +{{range $pattern, $replace := $v.Rewrites}} + "{{$pattern}}" = "{{$replace}}" +{{end}} +{{end}} +{{end}} +{{range $k, $v := .PrivateRegistryConfig.Configs }} +{{ if $v.Auth }} +[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".auth] + {{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}} + {{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}} + {{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}} + {{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}} +{{end}} +{{ if $v.TLS }} +[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".tls] + {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}} + {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}} + {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}} + {{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}} +{{end}} +{{end}} +{{end}} +{{end}} +{{end}} + {{- if not .NodeConfig.NoFlannel }} -[plugins.cri.cni] +[plugins."io.containerd.grpc.v1.cri".cni] bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}" conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}" {{end}} -[plugins.cri.containerd.runtimes.runc] - # ---- changed from 'io.containerd.runc.v2' for GPU support - runtime_type = "io.containerd.runtime.v1.linux" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" -# ---- added for GPU support -[plugins.linux] - runtime = "nvidia-container-runtime" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + SystemdCgroup = {{ .SystemdCgroup }} {{ if .PrivateRegistryConfig }} {{ if .PrivateRegistryConfig.Mirrors }} -[plugins.cri.registry.mirrors]{{end}} +[plugins."io.containerd.grpc.v1.cri".registry.mirrors]{{end}} {{range $k, $v := .PrivateRegistryConfig.Mirrors }} -[plugins.cri.registry.mirrors."{{$k}}"] +[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}"] endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}] +{{if $v.Rewrites}} + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}".rewrite] +{{range $pattern, $replace := $v.Rewrites}} + "{{$pattern}}" = "{{$replace}}" +{{end}} +{{end}} {{end}} {{range $k, $v := .PrivateRegistryConfig.Configs }} {{ if $v.Auth }} -[plugins.cri.registry.configs."{{$k}}".auth] - {{ if $v.Auth.Username }}username = "{{ $v.Auth.Username }}"{{end}} - {{ if $v.Auth.Password }}password = "{{ $v.Auth.Password }}"{{end}} - {{ if $v.Auth.Auth }}auth = "{{ $v.Auth.Auth }}"{{end}} - {{ if $v.Auth.IdentityToken }}identitytoken = "{{ $v.Auth.IdentityToken }}"{{end}} +[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".auth] + {{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}} + {{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}} + {{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}} + {{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}} {{end}} {{ if $v.TLS }} -[plugins.cri.registry.configs."{{$k}}".tls] +[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".tls] {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}} {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}} {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}} + {{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}} {{end}} {{end}} -{{end}} \ No newline at end of file +{{end}} + +{{range $k, $v := .ExtraRuntimes}} +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}"] + runtime_type = "{{$v.RuntimeType}}" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}".options] + BinaryName = "{{$v.BinaryName}}" +{{end}} diff --git a/docker/sandbox-bundled/device-plugin-daemonset.yaml b/docker/sandbox-bundled/device-plugin-daemonset.yaml new file mode 100644 index 0000000000..3f40c93a8c --- /dev/null +++ b/docker/sandbox-bundled/device-plugin-daemonset.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + template: + metadata: + # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler + # reserves resources for critical add-on pods so that they can be rescheduled after + # a failure. This annotation works in tandem with the toleration below. + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. + # This, along with the annotation above marks this pod as a critical add-on. + - key: CriticalAddonsOnly + operator: Exists + containers: + - env: + - name: DP_DISABLE_HEALTHCHECKS + value: xids + image: nvidia/k8s-device-plugin:1.11 + name: nvidia-device-plugin-ctr + securityContext: + allowPrivilegeEscalation: true + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins From c2eed3efcd11b4558caf71ada6e95c98240dea28 Mon Sep 17 00:00:00 2001 From: Daniel Farrell <16297104+danpf@users.noreply.github.com> Date: Fri, 3 Nov 2023 17:26:50 -0400 Subject: [PATCH 04/13] Update docker/sandbox-bundled/manifests/complete-agent.yaml Co-authored-by: Future-Outlier Signed-off-by: Daniel Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/manifests/complete-agent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/sandbox-bundled/manifests/complete-agent.yaml b/docker/sandbox-bundled/manifests/complete-agent.yaml index dd324b15e6..85624e7ebc 100644 --- a/docker/sandbox-bundled/manifests/complete-agent.yaml +++ b/docker/sandbox-bundled/manifests/complete-agent.yaml @@ -1744,7 +1744,7 @@ spec: value: minio - name: FLYTE_AWS_SECRET_ACCESS_KEY value: miniostorage - image: ghcr.io/flyteorg/flyteagent:1.10.0 + image: ghcr.io/flyteorg/flyteagent:1.9.1 imagePullPolicy: IfNotPresent name: flyteagent ports: From a4ec22173d0e2879d67963b262c1ccc3cc43e0e4 Mon Sep 17 00:00:00 2001 From: Daniel Farrell <16297104+danpf@users.noreply.github.com> Date: Fri, 3 Nov 2023 17:41:54 -0400 Subject: [PATCH 05/13] Update device-plugin-daemonset.yaml Signed-off-by: Daniel Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/device-plugin-daemonset.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/sandbox-bundled/device-plugin-daemonset.yaml b/docker/sandbox-bundled/device-plugin-daemonset.yaml index 3f40c93a8c..c53386c36c 100644 --- a/docker/sandbox-bundled/device-plugin-daemonset.yaml +++ b/docker/sandbox-bundled/device-plugin-daemonset.yaml @@ -1,3 +1,6 @@ +# Sourced from: https://k3d.io/v5.6.0/usage/advanced/cuda/?h=gpu#the-nvidia-device-plugin +# Thank you to the k3d team for their work on this. + apiVersion: apps/v1 kind: DaemonSet metadata: From b9510e5453e38e0596ef31ce7cdd32830301de44 Mon Sep 17 00:00:00 2001 From: Daniel Farrell <16297104+danpf@users.noreply.github.com> Date: Fri, 3 Nov 2023 17:44:58 -0400 Subject: [PATCH 06/13] Delete docker/sandbox-bundled/kustomize/gpu-operator.yaml Signed-off-by: Daniel Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/kustomize/gpu-operator.yaml | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 docker/sandbox-bundled/kustomize/gpu-operator.yaml diff --git a/docker/sandbox-bundled/kustomize/gpu-operator.yaml b/docker/sandbox-bundled/kustomize/gpu-operator.yaml deleted file mode 100644 index 8b226ce694..0000000000 --- a/docker/sandbox-bundled/kustomize/gpu-operator.yaml +++ /dev/null @@ -1,9 +0,0 @@ ---- -apiVersion: helm.cattle.io/v1 -kind: HelmChart -metadata: - name: nvidia-device-plugin - namespace: kube-system -spec: - chart: nvidia-device-plugin - repo: https://nvidia.github.io/k8s-device-plugin \ No newline at end of file From f8c8c85697c3c93806cda2e44ba1b45352697e8d Mon Sep 17 00:00:00 2001 From: Daniel Farrell <16297104+danpf@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:31:08 -0500 Subject: [PATCH 07/13] Update docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh Co-authored-by: Future-Outlier Signed-off-by: Daniel Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh b/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh index 0de460b809..96a5390fc5 100755 --- a/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh +++ b/docker/sandbox-bundled/bin/k3d-entrypoint-gpu-check.sh @@ -12,4 +12,4 @@ if [ -n "${FLYTE_GPU}" ]; then else echo "GPU not enabled" -fi \ No newline at end of file +fi From 6ae93f366ca4322d301dbf64fa8f31d526f29105 Mon Sep 17 00:00:00 2001 From: Danny Farrell <16297104+danpf@users.noreply.github.com> Date: Tue, 7 Nov 2023 14:02:02 -0500 Subject: [PATCH 08/13] Add note about source of config.toml.tmpl Signed-off-by: Danny Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/config.toml.tmpl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/sandbox-bundled/config.toml.tmpl b/docker/sandbox-bundled/config.toml.tmpl index 0208836d6f..c32ad5afaf 100644 --- a/docker/sandbox-bundled/config.toml.tmpl +++ b/docker/sandbox-bundled/config.toml.tmpl @@ -1,3 +1,5 @@ +# Exactly the same as: https://github.com/k3s-io/k3s/blob/master/pkg/agent/templates/templates_linux.go#L10 +# EXCEPT under the heading: [plugins."io.containerd.grpc.v1.cri".containerd] we add: default_runtime_name = "nvidia" version = 2 [plugins."io.containerd.internal.v1.opt"] @@ -115,4 +117,5 @@ enable_keychain = true runtime_type = "{{$v.RuntimeType}}" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}".options] BinaryName = "{{$v.BinaryName}}" + SystemdCgroup = {{ $.SystemdCgroup }} {{end}} From 91347b12a0a40f76da75e9b3666126745756e7d8 Mon Sep 17 00:00:00 2001 From: Danny Farrell <16297104+danpf@users.noreply.github.com> Date: Tue, 7 Nov 2023 14:13:58 -0500 Subject: [PATCH 09/13] More documentation of Dockerfile.gpu Signed-off-by: Danny Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/Dockerfile.gpu | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/docker/sandbox-bundled/Dockerfile.gpu b/docker/sandbox-bundled/Dockerfile.gpu index 4f0839186b..4cee5abd85 100644 --- a/docker/sandbox-bundled/Dockerfile.gpu +++ b/docker/sandbox-bundled/Dockerfile.gpu @@ -1,6 +1,4 @@ # syntax=docker/dockerfile:1.4-labs - -###### BUILD FLYTE FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder ARG TARGETARCH @@ -11,6 +9,7 @@ WORKDIR /build COPY images/manifest.txt images/preload ./ RUN --security=insecure ./preload manifest.txt + FROM --platform=${BUILDPLATFORM} golang:1.19-bullseye AS bootstrap ARG TARGETARCH @@ -25,8 +24,6 @@ COPY bootstrap/ ./ RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/root/go/pkg/mod \ go build -o dist/flyte-sandbox-bootstrap cmd/bootstrap/main.go -###### GET K3S -# ARG K3S_TAG=v1.26.4-k3s1 FROM rancher/k3s:v1.26.4-k3s1 as k3s FROM nvidia/cuda:11.8.0-base-ubuntu22.04 @@ -35,6 +32,8 @@ ENV CRICTL_VERSION="v1.26.0" ENV FLYTE_GPU "ENABLED" ARG TARGETARCH +ARG FLYTE_SANDBOX_VERSION +ENV FLYTE_SANDBOX_VERSION "${FLYTE_SANDBOX_VERSION}" RUN apt-get update \ && apt-get -y install gnupg2 curl nvidia-container-toolkit \ && chmod 1777 /tmp \ @@ -55,11 +54,19 @@ COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl COPY device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/ -COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/ COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/ COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/ COPY bin/ /bin/ +# Install bootstrap +COPY --from=bootstrap /flyteorg/build/dist/flyte-sandbox-bootstrap /bin/ + +VOLUME /var/lib/flyte/storage + +# Set environment variable for picking up additional CA certificates +ENV SSL_CERT_DIR /var/lib/flyte/config/ca-certificates + +## START https://github.com/k3s-io/k3s/blob/master/package/Dockerfile#L15 VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s VOLUME /var/lib/cni @@ -67,6 +74,7 @@ VOLUME /var/log ENV PATH="$PATH:/bin/aux" ENV CRI_CONFIG_FILE=/var/lib/rancher/k3s/agent/etc/crictl.yaml +## END https://github.com/k3s-io/k3s/blob/master/package/Dockerfile#L15 ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ] CMD [ "server", "--disable=traefik", "--disable=servicelb" ] From c9d639aed890f25d19663fb4b8d5aa3af581502f Mon Sep 17 00:00:00 2001 From: Danny Farrell <16297104+danpf@users.noreply.github.com> Date: Tue, 7 Nov 2023 14:17:18 -0500 Subject: [PATCH 10/13] More documentation on dockerfile Signed-off-by: Danny Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/Dockerfile.gpu | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/sandbox-bundled/Dockerfile.gpu b/docker/sandbox-bundled/Dockerfile.gpu index 4cee5abd85..5939d718e7 100644 --- a/docker/sandbox-bundled/Dockerfile.gpu +++ b/docker/sandbox-bundled/Dockerfile.gpu @@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/r FROM rancher/k3s:v1.26.4-k3s1 as k3s +# We may want to have another version with devel in the future (has more features but is huge) FROM nvidia/cuda:11.8.0-base-ubuntu22.04 ENV CRICTL_VERSION="v1.26.0" From ccf0505d44d52544a79d4d673596504ac006d3f6 Mon Sep 17 00:00:00 2001 From: Danny Farrell <16297104+danpf@users.noreply.github.com> Date: Tue, 7 Nov 2023 19:22:33 -0500 Subject: [PATCH 11/13] Remove nvidia-device-plugin.yml Signed-off-by: Danny Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/Dockerfile.gpu | 3 --- 1 file changed, 3 deletions(-) diff --git a/docker/sandbox-bundled/Dockerfile.gpu b/docker/sandbox-bundled/Dockerfile.gpu index 5939d718e7..87e8106ecf 100644 --- a/docker/sandbox-bundled/Dockerfile.gpu +++ b/docker/sandbox-bundled/Dockerfile.gpu @@ -51,9 +51,6 @@ COPY --from=k3s /etc /etc # Provide custom containerd configuration to configure the nvidia-container-runtime COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl -# Deploy the nvidia driver plugin on startup -COPY device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml - COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/ COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/ COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/ From 68fd7a22157e1aa24fe97af26f4377a38f602816 Mon Sep 17 00:00:00 2001 From: Daniel Farrell <16297104+danpf@users.noreply.github.com> Date: Tue, 7 Nov 2023 22:59:34 -0600 Subject: [PATCH 12/13] Update docker/sandbox-bundled/Makefile Co-authored-by: Future-Outlier Signed-off-by: Daniel Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker/sandbox-bundled/Makefile b/docker/sandbox-bundled/Makefile index 1f56539adf..aa1040fa82 100644 --- a/docker/sandbox-bundled/Makefile +++ b/docker/sandbox-bundled/Makefile @@ -33,11 +33,6 @@ manifests: --load-restrictor=LoadRestrictionsNone \ kustomize/complete-agent > manifests/complete-agent.yaml -.PHONY: manifests-gpu -manifests-gpu: manifests - cat kustomize/gpu-operator.yaml >> manifests/complete.yaml - - .PHONY: build build: flyte manifests [ -n "$(shell docker buildx ls | awk '/^flyte-sandbox / {print $$1}')" ] || \ From 59fd04d077ad56fa58308b46a05701d43bbb9ac5 Mon Sep 17 00:00:00 2001 From: Daniel Farrell <16297104+danpf@users.noreply.github.com> Date: Tue, 7 Nov 2023 22:59:45 -0600 Subject: [PATCH 13/13] Update docker/sandbox-bundled/Makefile Co-authored-by: Future-Outlier Signed-off-by: Daniel Farrell <16297104+danpf@users.noreply.github.com> --- docker/sandbox-bundled/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/sandbox-bundled/Makefile b/docker/sandbox-bundled/Makefile index aa1040fa82..0791174cbc 100644 --- a/docker/sandbox-bundled/Makefile +++ b/docker/sandbox-bundled/Makefile @@ -44,7 +44,7 @@ build: flyte manifests --tag flyte-sandbox:latest . .PHONY: build-gpu -build-gpu: flyte manifests-gpu +build-gpu: flyte manifests [ -n "$(shell docker buildx ls | awk '/^flyte-sandbox / {print $$1}')" ] || \ docker buildx create --name flyte-sandbox \ --driver docker-container --driver-opt image=moby/buildkit:master \