diff --git a/.gitignore b/.gitignore index 2abd77f839..a11607a182 100644 --- a/.gitignore +++ b/.gitignore @@ -126,4 +126,4 @@ docs/_build/ target/ # Ipython Notebook -.ipynb_checkpoints \ No newline at end of file +.ipynb_checkpoints diff --git a/apps/sage/otel-collector/Dockerfile b/apps/sage/otel-collector/Dockerfile new file mode 100644 index 0000000000..2abe929971 --- /dev/null +++ b/apps/sage/otel-collector/Dockerfile @@ -0,0 +1,5 @@ +FROM public.ecr.aws/aws-observability/aws-otel-collector:v0.41.1 as aws-otel + +FROM otel/opentelemetry-collector-contrib:0.113.0 + +COPY --from=aws-otel /healthcheck /healthcheck diff --git a/apps/sage/otel-collector/README.md b/apps/sage/otel-collector/README.md new file mode 100644 index 0000000000..65fe2b2713 --- /dev/null +++ b/apps/sage/otel-collector/README.md @@ -0,0 +1,34 @@ +# Purpose + +As discussed in this GitHub issue: +The official OpenTelemetry (OTEL) collector image does not contain cURL or related shell +commands required to do container-level health checks. It is reliant on external +services such as the application load balancer in AWS to perform these checks. This is +problematic with our deployment of the OTEL collector as we are using AWS +service connect with AWS ECS to allow other containers within the namespace to connect +to the collector. As such, there is no load balancer in front of the container to handle +its lifecycle. Within ECS, the recommended way from AWS to handle container-level health +checks is to let ECS perform commands in the container. +Source: + +Since the OTEL collector has neither a shell nor cURL available, we need to accomplish +this another way. In the official AWS OTEL collector distro, they accomplish this by +compiling a Go program down into a binary that can be run within the container. +Unfortunately, we cannot use the AWS OTEL collector because it does not support the +`oauth2clientauthextension`: .
+ +For our purposes, we are creating a new image based on the `otel/opentelemetry-collector-contrib` image, +but with the addition of the healthcheck binary from the AWS OTEL distro. This +combination lets us use the oauth2 extension and have container-level health checks. + +## Creating a new image (To automate later on) + +As new base images are updated, we will in turn need to create a new OTEL collector +image that we deploy to ECS. + +1. Update values in the `Dockerfile` +2. Run `docker build -t ghcr.io/sage-bionetworks/sage-otel-collector:vX.X.X .` (Replace the version) +3. Run `docker push ghcr.io/sage-bionetworks/sage-otel-collector:vX.X.X` (Replace the version) + +Once a new image is built and pushed, you'll want to update the values in the CDK +scripts to use the new image version. diff --git a/apps/sage/otel-collector/project.json b/apps/sage/otel-collector/project.json new file mode 100644 index 0000000000..0a4ee8c250 --- /dev/null +++ b/apps/sage/otel-collector/project.json @@ -0,0 +1,32 @@ +{ + "name": "sage-otel-collector", + "$schema": "../../../node_modules/nx/schemas/project-schema.json", + "projectType": "application", + "targets": { + "serve-detach": { + "executor": "nx:run-commands", + "options": { + "command": "docker/sage/serve-detach.sh sage-otel-collector" + } + }, + "publish-image": { + "executor": "@nx-tools/nx-container:build", + "options": { + "context": "apps/sage/otel-collector", + "metadata": { + "images": ["ghcr.io/sage-bionetworks/{projectName}"], + "tags": ["type=edge,branch=main", "type=sha"] + }, + "push": true + }, + "dependsOn": ["build-image"] + }, + "scan-image": { + "executor": "nx:run-commands", + "options": { + "command": "trivy image ghcr.io/sage-bionetworks/{projectName}:local --quiet", + "color": true + } + } + } +} diff --git a/docker/sage/networks.yml b/docker/sage/networks.yml new file mode 100644 index 0000000000..3872357442 --- /dev/null +++ b/docker/sage/networks.yml @@ -0,0 +1,4 @@ +networks: + sage: +
name: sage + driver: bridge diff --git a/docker/sage/serve-detach.sh b/docker/sage/serve-detach.sh new file mode 100755 index 0000000000..28dc29a315 --- /dev/null +++ b/docker/sage/serve-detach.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +args=( + # List of services in alphanumeric order + --file docker/sage/services/otel-collector.yml + + --file docker/sage/networks.yml + # Forward every service name given on the command line, quoted so none is word-split or globbed + up "$@" --detach --remove-orphans +) + +docker compose "${args[@]}" diff --git a/docker/sage/services/otel-collector.yml b/docker/sage/services/otel-collector.yml new file mode 100644 index 0000000000..bc65a7c756 --- /dev/null +++ b/docker/sage/services/otel-collector.yml @@ -0,0 +1,18 @@ +services: + sage-otel-collector: + image: ghcr.io/sage-bionetworks/sage-otel-collector:${SAGE_VERSION:-local} + container_name: sage-otel-collector + restart: always + networks: + - sage + ports: + - '1888:1888' # pprof extension + - '8888:8888' # Prometheus metrics exposed by the collector + - '8889:8889' # Prometheus exporter metrics + - '13133:13133' # health_check extension + - '4317:4317' # OTLP gRPC receiver + - '55679:55679' # zpages extension + deploy: + resources: + limits: + memory: 200M diff --git a/libs/sage-monorepo/nx-plugin/src/plugins/plugin.ts b/libs/sage-monorepo/nx-plugin/src/plugins/plugin.ts index 1f31e43d8f..54cabc564a 100644 --- a/libs/sage-monorepo/nx-plugin/src/plugins/plugin.ts +++ b/libs/sage-monorepo/nx-plugin/src/plugins/plugin.ts @@ -34,7 +34,7 @@ function writeProjectConfigurationsToCache( writeJsonFile(cachePath, results); } -const projectFilePattern = '{apps,libs}/{openchallenges,agora,sandbox}/**/project.json'; +const projectFilePattern = '{apps,libs}/{openchallenges,agora,sage,sandbox}/**/project.json'; export const createNodesV2: CreateNodesV2 = [ projectFilePattern,