Description
Agent Environment
Version: Agent 7.70.2
Describe what happened:
I am running several Docker app containers in Docker Swarm / stack mode. Here is a minimal docker-compose file for my stack:
services:
  traefik:
    image: traefik:v3.1
    command:
      # Swarm provider
      - --providers.swarm=true
      - --providers.swarm.endpoint=unix:///var/run/docker.sock
      - --providers.swarm.exposedbydefault=false
      # Docker Providers
      # - --providers.docker=true
      # - --providers.docker.watch=true
      # - --providers.docker.exposedbydefault=false
      # Entrypoints
      - --entrypoints.web.address=:80
      - --entrypoints.websecure.address=:443
      # Ping healthcheck endpoint
      - --ping=true
      - --ping.entrypoint=web
      - --api=true
      - --api.dashboard=true
      - --certificatesresolvers.le.acme.email=...
      - --certificatesresolvers.le.acme.storage=/acme.json
      - --certificatesresolvers.le.acme.httpchallenge.entrypoint=web
      - --certificatesresolvers.le.acme.httpchallenge=true
      - --log=true
      - --log.filepath=/var/log/traefik.log
      - --log.level=WARN
      # - --accesslog=true
      # - --accesslog.filepath=/var/log/traefik_access.log
      # - --accesslog.bufferingsize=100
    ports:
      - 80:80
      - 443:443
      - 8080:8080 # optional dashboard
    volumes:
      - '/var/run/docker.sock:/var/run/docker.sock:ro'
      - '/files/traefik/acme.json:/acme.json'
      - '/var/log:/var/log'
      - loadbalancerdata:/data
    deploy:
      mode: global
      placement:
        constraints:
          - node.role == manager # run Traefik only on manager(s)
      restart_policy:
        condition: on-failure
    networks:
      - webswarm

  my-app:
    image: ${MYIMAGE}
    environment:
      - PORT=3000
      - SERVER_ENV=production
      - DD_ENV=production
      - MY_CONTAINER_NAME=my-app-{{.Task.Slot}}
    volumes:
      - /files:/files
      - '/var/run/docker.sock:/var/run/docker.sock:ro'
    depends_on:
      - traefik
    networks:
      - webswarm
    healthcheck:
      test: curl -f http://localhost:3000/healthcheck || exit 1
      interval: 30s
      timeout: 15s
      retries: 3
      start_period: 10s
    deploy:
      labels:
        ...
      mode: replicated
      replicas: 7
      placement:
        preferences:
          - spread: node.id # makes sure to spread the containers across the nodes
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        delay: 15s
        order: start-first
      resources:
        limits:
          cpus: '1.5'
          memory: 4096M
        reservations:
          cpus: '0.5'
          memory: 1536M

  dd-agent:
    image: gcr.io/datadoghq/agent:latest
    volumes:
      - '/var/run/docker.sock:/var/run/docker.sock:ro'
      - '/var/lib/docker/containers:/var/lib/docker/containers:ro'
      - '/proc:/host/proc:ro'
      - '/sys/fs/cgroup/:/host/sys/fs/cgroup:ro'
      - '/opt/datadog-agent/run:/opt/datadog-agent/run:rw'
    environment:
      - DD_API_KEY=...
      - DD_LOGS_ENABLED=true
      - DD_LOGS_INJECTION=true
      - DD_ENV=production
      - DD_SITE=datadoghq.eu
      - DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL=true
      - DD_CONTAINER_EXCLUDE=name:dd-agent name:mongomyadmin name:orderlion-temp name:traefik name:exitelink name:autoheal name:bullboard
      - DD_CONTAINER_ENV_AS_TAGS={"MY_CONTAINER_NAME":"container_name"}
    networks:
      - webswarm
    deploy:
      mode: global
      restart_policy:
        condition: on-failure

volumes:
  loadbalancerdata:

networks:
  webswarm:
    external: true
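
For completeness, this is roughly how the mapping can be double-checked from inside the running agent container (just a sketch; the container ID below is a placeholder):

# find the dd-agent task running on this node
docker ps --filter name=dd-agent

# dump the agent's resolved runtime config and look for the env-as-tags setting
docker exec -it <dd-agent-container-id> agent config | grep -i container_env_as_tags

# list the tags the agent has attached to each container entity
docker exec -it <dd-agent-container-id> agent tagger-list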
Describe what you expected:
I would expect the logs inside DD to show up as coming from the Docker containers with names like my-app-1, my-app-2, ... But they simply don't!
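
To illustrate, given the MY_CONTAINER_NAME=my-app-{{.Task.Slot}} template and the DD_CONTAINER_ENV_AS_TAGS mapping above, this is the tagging I would expect for the first two task slots:

# expected (illustration only):
#   MY_CONTAINER_NAME=my-app-1  ->  logs of that replica tagged container_name:my-app-1
#   MY_CONTAINER_NAME=my-app-2  ->  logs of that replica tagged container_name:my-app-2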
I checked my main app containers in the swarm with docker inspect, and the env var MY_CONTAINER_NAME correctly shows up in the docker inspect output with the right value (see the snippet below)! Somehow, the dd-agent is simply ignoring the DD_CONTAINER_ENV_AS_TAGS env var. I have tried almost everything but can't get it to work.
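
Roughly the check I ran (sketch; the container ID is a placeholder for one of the my-app task containers):

# print the environment of one app task container; MY_CONTAINER_NAME shows up with the expected my-app-<slot> value
docker inspect --format '{{json .Config.Env}}' <my-app-task-container-id> | grep MY_CONTAINER_NAME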
Additional environment details (Operating System, Cloud provider, etc):
Ubuntu 22.04 LTS, DD Agent 7.70.2, all hosted on AWS