diff --git a/examples/deploy/otel-tracing-grafana/README.md b/examples/deploy/otel-tracing-grafana/README.md new file mode 100644 index 0000000000000..b950357d7213d --- /dev/null +++ b/examples/deploy/otel-tracing-grafana/README.md @@ -0,0 +1,66 @@ +Tracing with Tempo +================== + +[Tempo](https://grafana.com/oss/tempo/) is an open-source distributed tracing system for monitoring and debugging microservices. It helps track requests across services, analyze latency, identify bottlenecks, and diagnose failures. + +Key use cases include debugging production issues, monitoring performance, visualizing service dependencies, and optimizing system reliability. As Tempo supports the OpenTelemetry protocol, it can be used to collect traces from Windmill. + +Follow the guide on [setting up Tempo](https://windmill.dev/docs/misc/guides/otel#setting-up-tempo) for more details. + +## Setting up Tempo along with Windmill + +Start all services by running: + +```bash +docker-compose up -d +``` + +## Configuring Windmill to use Tempo via OpenTelemetry Collector + +In the Windmill UI available at `http://localhost`, complete the initial setup and go to "Instances Settings" and "OTEL/Prom" tab and fill in the open telemetry collector endpoint and service name and toggle the Tracing option to send traces to the `otel-collector:4317` port. + +## Open the Tempo UI + +The Tempo UI is a plugin available in Grafana, if hosted with the `docker-compose.yml` file above, Grafana will be available at `http://localhost:3000`. When running a script or workflow with Windmill, you will be able to see the traces in the Tempo UI and investigate them. + +## Searching for specific traces + +To search/filter for a specific trace, for example a workflow, you can use the search function in the Tempo UI by filtering by tags set by Windmill. 
+ +The following tags are useful to filter for specific traces: + +- `job_id`: The ID of the job +- `root_job`: The ID of the root job (flow) +- `parent_job`: The ID of the parent job (flow) +- `flow_step_id`: The ID of the step within the workflow +- `script_path`: The path of the script +- `workspace_id`: The ID of the workspace +- `worker_id`: The ID of the worker +- `language`: The language of the script +- `tag`: The queue tag of the workflow + +## Monitoring metrics with Tempo and Prometheus + +Tempo can be used to generate time series for metrics of the collected traces. These time series can be used to compare the performance of individual steps within a workflow and their overall performance and relative contribution over time, as well as identify and troubleshoot issues and anomalies. + +These metrics are exported to Prometheus and can be viewed in the Grafana UI in the Metrics Explorer. The generated metrics are: + +- traces_spanmetrics_calls_total +- traces_spanmetrics_latency +- traces_spanmetrics_latency_bucket +- traces_spanmetrics_latency_count +- traces_spanmetrics_latency_sum +- traces_spanmetrics_size_total + +## Viewing Logs with Loki + +Logs from Windmill are sent to [Loki](https://grafana.com/oss/loki/), a log aggregation system that integrates seamlessly with Grafana. You can view and analyze these logs in the Grafana UI. + +To access logs in Grafana: + +1. Open the Grafana UI, typically available at `http://localhost:3000`. +2. Navigate to the "Explore" section. +3. Select the Loki data source. +4. Use the query editor to filter and search logs based on various labels and fields. + +This setup allows you to correlate logs with traces, providing a comprehensive view of your system's behavior and aiding in troubleshooting and performance analysis. 
diff --git a/examples/deploy/otel-tracing-grafana/docker-compose.yml b/examples/deploy/otel-tracing-grafana/docker-compose.yml new file mode 100644 index 0000000000000..525010efade5d --- /dev/null +++ b/examples/deploy/otel-tracing-grafana/docker-compose.yml @@ -0,0 +1,245 @@ +version: "3.7" + +services: + db: + deploy: + # To use an external database, set replicas to 0 and set DATABASE_URL to the external database url in the .env file + replicas: 1 + image: postgres:16 + shm_size: 1g + restart: unless-stopped + volumes: + - db_data:/var/lib/postgresql/data + expose: + - 5432 + ports: + - 5432:5432 + environment: + POSTGRES_PASSWORD: changeme + POSTGRES_DB: windmill + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 5 + + windmill_server: + image: ${WM_IMAGE} + pull_policy: always + deploy: + replicas: 1 + restart: unless-stopped + expose: + - 8000 + - 2525 + environment: + - DATABASE_URL=${DATABASE_URL} + - MODE=server + depends_on: + db: + condition: service_healthy + volumes: + - worker_logs:/tmp/windmill/logs + + windmill_worker: + image: ${WM_IMAGE} + pull_policy: always + deploy: + replicas: 3 + resources: + limits: + cpus: "1" + memory: 2048M + # for GB, use syntax '2Gi' + restart: unless-stopped + environment: + - DATABASE_URL=${DATABASE_URL} + - MODE=worker + - WORKER_GROUP=default + depends_on: + db: + condition: service_healthy + # to mount the worker folder to debug, KEEP_JOB_DIR=true and mount /tmp/windmill + volumes: + # mount the docker socket to allow to run docker containers from within the workers + - /var/run/docker.sock:/var/run/docker.sock + - worker_dependency_cache:/tmp/windmill/cache + - worker_logs:/tmp/windmill/logs + + ## This worker is specialized for "native" jobs. 
Native jobs run in-process and thus are much more lightweight than other jobs + windmill_worker_native: + # Use ghcr.io/windmill-labs/windmill-ee:main for the ee + image: ${WM_IMAGE} + pull_policy: always + deploy: + replicas: 1 + resources: + limits: + cpus: "1" + memory: 2048M + # for GB, use syntax '2Gi' + restart: unless-stopped + environment: + - DATABASE_URL=${DATABASE_URL} + - MODE=worker + - WORKER_GROUP=native + - NUM_WORKERS=8 + - SLEEP_QUEUE=200 + depends_on: + db: + condition: service_healthy + volumes: + - worker_logs:/tmp/windmill/logs + # This worker is specialized for reports or scraping jobs. It is assigned the "reports" worker group which has an init script that installs chromium and can be targeted by using the "chromium" worker tag. + # windmill_worker_reports: + # image: ${WM_IMAGE} + # pull_policy: always + # deploy: + # replicas: 1 + # resources: + # limits: + # cpus: "1" + # memory: 2048M + # # for GB, use syntax '2Gi' + # restart: unless-stopped + # environment: + # - DATABASE_URL=${DATABASE_URL} + # - MODE=worker + # - WORKER_GROUP=reports + # depends_on: + # db: + # condition: service_healthy + # # to mount the worker folder to debug, KEEP_JOB_DIR=true and mount /tmp/windmill + # volumes: + # # mount the docker socket to allow to run docker containers from within the workers + # - /var/run/docker.sock:/var/run/docker.sock + # - worker_dependency_cache:/tmp/windmill/cache + # - worker_logs:/tmp/windmill/logs + + # The indexer powers full-text job and log search, an EE feature. 
+ windmill_indexer: + image: ${WM_IMAGE} + pull_policy: always + deploy: + replicas: 0 # set to 1 to enable full-text job and log search + restart: unless-stopped + expose: + - 8001 + environment: + - PORT=8001 + - DATABASE_URL=${DATABASE_URL} + - MODE=indexer + depends_on: + db: + condition: service_healthy + volumes: + - windmill_index:/tmp/windmill/search + - worker_logs:/tmp/windmill/logs + + lsp: + image: ghcr.io/windmill-labs/windmill-lsp:latest + pull_policy: always + restart: unless-stopped + expose: + - 3001 + volumes: + - lsp_cache:/root/.cache + + multiplayer: + image: ghcr.io/windmill-labs/windmill-multiplayer:latest + deploy: + replicas: 0 # Set to 1 to enable multiplayer, only available on Enterprise Edition + restart: unless-stopped + expose: + - 3002 + + caddy: + image: ghcr.io/windmill-labs/caddy-l4:latest + restart: unless-stopped + # Configure the mounted Caddyfile and the exposed ports or use another reverse proxy if needed + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile + # - ./certs:/certs # Provide custom certificate files like cert.pem and key.pem to enable HTTPS - See the corresponding section in the Caddyfile + ports: + # To change the exposed port, simply change 80:80 to :80. 
No other changes needed + - "80:80" + - "25:25" + # - "443:443" # Uncomment to enable HTTPS handling by Caddy + environment: + - BASE_URL=":80" + # - BASE_URL=":443" # uncomment and comment line above to enable HTTPS via custom certificate and key files + # - BASE_URL=mydomain.com # Uncomment and comment line above to enable HTTPS handling by Caddy + + # Grafana OpenTelemetry Example + # https://windmill.dev/docs/misc/guides/otel#setting-up-grafana + init: + image: &tempoImage grafana/tempo:latest + user: root + entrypoint: + - "chown" + - "10001:10001" + - "/var/tempo" + volumes: + - tempo-data:/var/tempo + + otel-collector: + image: otel/opentelemetry-collector:latest + container_name: otel-collector + expose: + - 4317 + volumes: + - ./otel-config.yaml:/etc/otel/config.yaml + command: ["--config=/etc/otel/config.yaml"] + + tempo: + image: *tempoImage + command: [ "-config.file=/etc/tempo.yaml" ] + volumes: + - ./tempo-config.yaml:/etc/tempo.yaml + - tempo-data:/var/tempo + expose: + - 3200 + - 4317 + depends_on: + - init + + loki: + image: grafana/loki:latest + expose: + - 3100 + command: -config.file=/etc/loki/local-config.yaml + volumes: + - ./loki-config.yaml:/etc/loki/local-config.yaml + + prometheus: + image: prom/prometheus:latest + command: + - --config.file=/etc/prometheus.yaml + - --web.enable-remote-write-receiver + - --enable-feature=exemplar-storage + - --enable-feature=native-histograms + volumes: + - ./prometheus-config.yaml:/etc/prometheus.yaml + expose: + - 9090 + + grafana: + image: grafana/grafana:11.0.0 + volumes: + - ./grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor metricsSummary + - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app + ports: + - 
"3000:3000" + +volumes: + db_data: null + worker_dependency_cache: null + worker_logs: null + windmill_index: null + lsp_cache: null + tempo-data: null \ No newline at end of file diff --git a/examples/deploy/otel-tracing-grafana/grafana-datasources.yaml b/examples/deploy/otel-tracing-grafana/grafana-datasources.yaml new file mode 100644 index 0000000000000..e3f2aca1cdd36 --- /dev/null +++ b/examples/deploy/otel-tracing-grafana/grafana-datasources.yaml @@ -0,0 +1,40 @@ +apiVersion: 1 + +datasources: +- name: Prometheus + type: prometheus + uid: prometheus + access: proxy + orgId: 1 + url: http://prometheus:9090 + basicAuth: false + isDefault: false + version: 1 + editable: false + jsonData: + httpMethod: GET +- name: Tempo + type: tempo + access: proxy + orgId: 1 + url: http://tempo:3200 + basicAuth: false + isDefault: true + version: 1 + editable: false + apiVersion: 1 + uid: tempo + jsonData: + httpMethod: GET + serviceMap: + datasourceUid: prometheus + streamingEnabled: + search: true +- name: Loki + type: loki + access: proxy + url: http://loki:3100 + jsonData: + httpHeaderName1: "X-Scope-OrgID" + secureJsonData: + httpHeaderValue1: "tenant1" \ No newline at end of file diff --git a/examples/deploy/otel-tracing-grafana/loki-config.yaml b/examples/deploy/otel-tracing-grafana/loki-config.yaml new file mode 100644 index 0000000000000..4bbc268839f22 --- /dev/null +++ b/examples/deploy/otel-tracing-grafana/loki-config.yaml @@ -0,0 +1,29 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + ring: + instance_addr: 0.0.0.0 + kvstore: + store: inmemory + replication_factor: 1 + path_prefix: /tmp/loki + +schema_config: + configs: + - from: 2020-05-15 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /tmp/loki/chunks + +limits_config: + allow_structured_metadata: true diff --git a/examples/deploy/otel-tracing-grafana/otel-config.yaml 
b/examples/deploy/otel-tracing-grafana/otel-config.yaml new file mode 100644 index 0000000000000..0685743406a8d --- /dev/null +++ b/examples/deploy/otel-tracing-grafana/otel-config.yaml @@ -0,0 +1,30 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +processors: + batch: + timeout: 5s + +exporters: + otlphttp/loki: + endpoint: http://loki:3100/otlp + tls: + insecure: true + otlp/tempo: + endpoint: http://tempo:4317 + tls: + insecure: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/tempo] + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/loki] \ No newline at end of file diff --git a/examples/deploy/otel-tracing-grafana/prometheus-config.yaml b/examples/deploy/otel-tracing-grafana/prometheus-config.yaml new file mode 100644 index 0000000000000..eda5d0261c8f0 --- /dev/null +++ b/examples/deploy/otel-tracing-grafana/prometheus-config.yaml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: [ 'localhost:9090' ] + - job_name: 'tempo' + static_configs: + - targets: [ 'tempo:3200' ] diff --git a/examples/deploy/otel-tracing-grafana/tempo-config.yaml b/examples/deploy/otel-tracing-grafana/tempo-config.yaml new file mode 100644 index 0000000000000..54c2090103f5a --- /dev/null +++ b/examples/deploy/otel-tracing-grafana/tempo-config.yaml @@ -0,0 +1,55 @@ +stream_over_http_enabled: true +server: + http_listen_port: 3200 + log_level: info + +query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + metadata_slo: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: "tempo:4317" + +ingester: + max_block_duration: 5m # cut the headblock when this much time passes. 
this is being set for demo purposes and should probably be left alone normally + +compactor: + compaction: + block_retention: 1h # overall Tempo trace retention. set for demo purposes + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: windmill + storage: + path: /var/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + traces_storage: + path: /var/tempo/generator/traces + +storage: + trace: + backend: local # backend configuration to use + wal: + path: /var/tempo/wal # where to store the wal locally + local: + path: /var/tempo/blocks + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator + generate_native_histograms: both \ No newline at end of file diff --git a/examples/deploy/otel-tracing-jaeger/README.md b/examples/deploy/otel-tracing-jaeger/README.md index 32f4b7441dce6..d5201eca5adb1 100644 --- a/examples/deploy/otel-tracing-jaeger/README.md +++ b/examples/deploy/otel-tracing-jaeger/README.md @@ -17,7 +17,7 @@ docker-compose up -d ## Configuring Windmill to use Jaeger -In the Windmill UI available at `http://localhost`, complete the initial setup and go to "Instances Settings" and "OTEL/Prom" tab and fill in the Jaeger endpoint and service name and toggle the Tracing option to send traces to Jaeger. +In the Windmill UI available at `http://localhost`, complete the initial setup and go to "Instances Settings" and "OTEL/Prom" tab and fill in the Jaeger endpoint `jaeger:4317` and toggle the Tracing option to send traces to Jaeger. ## Open the Jaeger UI