Skip to content

Commit dfa8497

Browse files
committed
Add docker resource cleanup task
1 parent 291db10 commit dfa8497

File tree

2 files changed

+216
-0
lines changed

2 files changed

+216
-0
lines changed

gefapi/celery.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ def make_celery(app):
4949
"gefapi.tasks.docker_completed_monitoring.monitor_completed_docker_services": {
5050
"queue": "build"
5151
},
52+
# Docker resource cleanup task - run on build queue with Docker access
53+
"gefapi.tasks.docker_resource_cleanup.cleanup_docker_build_cache": {
54+
"queue": "build"
55+
},
5256
# Stats cache refresh tasks - run on default queue
5357
"gefapi.tasks.stats_cache_refresh.refresh_dashboard_stats_cache": {
5458
"queue": "default"
@@ -105,6 +109,11 @@ def make_celery(app):
105109
"schedule": 180.0, # Every 3 minutes - check for completed services
106110
"options": {"queue": "build"}, # Run on build queue with Docker access
107111
},
112+
"cleanup-docker-build-cache": {
113+
"task": "gefapi.tasks.docker_resource_cleanup.cleanup_docker_build_cache",
114+
"schedule": 604800.0, # Every week (7 days = 604800 seconds)
115+
"options": {"queue": "build"}, # Run on build queue with Docker access
116+
},
108117
# Stats cache refresh tasks for performance optimization
109118
"refresh-dashboard-stats-cache": {
110119
"task": "gefapi.tasks.stats_cache_refresh.refresh_dashboard_stats_cache",
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
"""DOCKER RESOURCE CLEANUP TASKS
2+
3+
Periodic cleanup of Docker resources: dangling images, build cache, unused
volumes and unused networks. Non-dangling unused images are intentionally kept.
4+
"""
5+
6+
import contextlib
7+
import logging
8+
9+
from celery import Task
10+
import rollbar
11+
12+
from gefapi.services.docker_service import get_docker_client
13+
14+
logger = logging.getLogger(__name__)
15+
16+
17+
class DockerResourceCleanupTask(Task):
    """Base Celery task class for Docker resource cleanup tasks.

    Overrides ``on_failure`` so that a failure of any task using this class as
    its ``base`` is logged and reported to Rollbar.
    """

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        """Log the failure and report the exception to Rollbar.

        Args:
            exc: The exception raised by the task.
            task_id: Id of the failed task (unused here).
            args: Positional arguments the task was called with (unused here).
            kwargs: Keyword arguments the task was called with (unused here).
            einfo: Exception info object supplied by Celery (unused here).
        """
        logger.error("Docker resource cleanup task failed: %s", exc)

        # Pass the exception explicitly instead of relying on sys.exc_info()
        # still being populated when this hook runs, and never let a reporting
        # problem raise out of the failure handler.
        with contextlib.suppress(Exception):
            rollbar.report_exc_info(exc_info=(type(exc), exc, exc.__traceback__))
23+
24+
25+
# Import celery after other imports to avoid circular dependency
26+
from gefapi import celery # noqa: E402
27+
28+
29+
def _format_reclaimed_space(num_bytes):
    """Render a byte count as ``"<x.xx> GB"`` (1024**3 bytes or more) or ``"<x.xx> MB"``."""
    gigabytes = num_bytes / (1024 * 1024 * 1024)
    if gigabytes >= 1:
        return f"{gigabytes:.2f} GB"
    return f"{num_bytes / (1024 * 1024):.2f} MB"


def _prune_build_cache(docker_client):
    """Prune unused build cache (``docker builder prune -f``) and return the API result.

    Prefers the public ``APIClient.prune_builds()`` method. If the installed
    Docker SDK does not expose it, falls back to posting to ``/build/prune``
    through the low-level client helpers.
    """
    api_client = docker_client.api

    prune_builds = getattr(api_client, "prune_builds", None)
    if prune_builds is not None:
        # No arguments: only unused build cache is removed, not all of it.
        return prune_builds()

    response = api_client._post(
        api_client._url("/build/prune"),
        params={"all": False},  # Only unused build cache, not all
    )
    return api_client._result(response, json=True)


@celery.task(base=DockerResourceCleanupTask, bind=True)
def cleanup_docker_build_cache(self):
    """Clean up Docker build cache and other unused resources to reclaim disk space.

    This task is scheduled weekly and prunes, in order:
    - Dangling images (untagged images from failed/incomplete builds)
    - Build cache (docker builder prune)
    - Unused volumes (orphaned volumes not attached to containers)
    - Unused networks

    NOTE: This task intentionally does NOT prune non-dangling unused images
    to avoid removing cached registry images needed for running services.

    Each prune step is best-effort: a failing step is logged as a warning,
    recorded in ``failed_steps`` and does not stop the remaining steps.

    Returns:
        dict: Summary with the keys
            - ``success``: ``False`` only when no Docker client is available;
              ``True`` whenever the cleanup ran, even if individual steps failed.
            - ``error``: present only when ``success`` is ``False``.
            - ``build_cache_pruned``: whether the build cache prune succeeded.
            - ``images_pruned``: number of dangling image entries removed.
            - ``space_reclaimed_bytes``: total bytes reclaimed by all steps.
            - ``space_reclaimed_human``: the same total formatted as MB/GB.
            - ``failed_steps``: names of the prune steps that raised.

    Raises:
        Exception: Any unexpected error outside the individual prune steps is
            logged, reported to Rollbar and re-raised for Celery to handle.
    """
    logger.info("[TASK]: Starting Docker build cache and image cleanup")

    try:
        docker_client = get_docker_client()
        if docker_client is None:
            logger.warning(
                "[TASK]: Docker client not available, skipping Docker resource cleanup"
            )
            return {
                "success": False,
                "error": "Docker client not available",
                "build_cache_pruned": False,
                "images_pruned": 0,
                "space_reclaimed_bytes": 0,
                "space_reclaimed_human": _format_reclaimed_space(0),
                "failed_steps": [],
            }

        total_space_reclaimed = 0
        images_removed = 0
        build_cache_pruned = False
        failed_steps = []

        # 1. Prune dangling images (untagged images from failed/incomplete builds)
        try:
            logger.info("[TASK]: Pruning dangling images...")
            prune_result = docker_client.images.prune(filters={"dangling": True})
            # The daemon may report null instead of an empty list / zero.
            images_deleted = prune_result.get("ImagesDeleted") or []
            space_reclaimed = prune_result.get("SpaceReclaimed") or 0

            images_removed += len(images_deleted)
            total_space_reclaimed += space_reclaimed

            logger.info(
                "[TASK]: Removed %d dangling images, reclaimed %s bytes",
                len(images_deleted),
                space_reclaimed,
            )
        except Exception as e:  # best-effort step: keep going
            failed_steps.append("images")
            logger.warning("[TASK]: Failed to prune dangling images: %s", e)

        # 2. Prune build cache
        try:
            logger.info("[TASK]: Pruning Docker build cache...")
            cache_result = _prune_build_cache(docker_client)

            cache_space = cache_result.get("SpaceReclaimed") or 0
            caches_deleted = cache_result.get("CachesDeleted") or []

            total_space_reclaimed += cache_space
            # Only set once the build cache prune itself has succeeded.
            build_cache_pruned = True

            logger.info(
                "[TASK]: Pruned %d build cache entries, reclaimed %s bytes",
                len(caches_deleted),
                cache_space,
            )
        except Exception as e:  # best-effort step: keep going
            failed_steps.append("build_cache")
            logger.warning("[TASK]: Failed to prune build cache: %s", e)

        # NOTE: We intentionally do NOT prune non-dangling unused images here.
        # Such images may be cached copies of registry images needed for running
        # new services. Pruning them would force re-pulls from the registry,
        # adding latency and potentially breaking service execution if the
        # registry is unavailable.

        # 3. Prune unused volumes (orphaned volumes not attached to containers)
        try:
            logger.info("[TASK]: Pruning unused volumes...")
            volume_prune = docker_client.volumes.prune()
            volumes_deleted = volume_prune.get("VolumesDeleted") or []
            volume_space = volume_prune.get("SpaceReclaimed") or 0

            total_space_reclaimed += volume_space

            logger.info(
                "[TASK]: Pruned %d unused volumes, reclaimed %s bytes",
                len(volumes_deleted),
                volume_space,
            )
        except Exception as e:  # best-effort step: keep going
            failed_steps.append("volumes")
            logger.warning("[TASK]: Failed to prune volumes: %s", e)

        # 4. Prune unused networks (no reclaimed space is read from this result)
        try:
            logger.info("[TASK]: Pruning unused networks...")
            network_prune = docker_client.networks.prune()
            networks_deleted = network_prune.get("NetworksDeleted") or []

            logger.info(
                "[TASK]: Pruned %d unused networks",
                len(networks_deleted),
            )
        except Exception as e:  # best-effort step: keep going
            failed_steps.append("networks")
            logger.warning("[TASK]: Failed to prune networks: %s", e)

        space_str = _format_reclaimed_space(total_space_reclaimed)

        result = {
            "success": True,
            "build_cache_pruned": build_cache_pruned,
            "images_pruned": images_removed,
            "space_reclaimed_bytes": total_space_reclaimed,
            "space_reclaimed_human": space_str,
            "failed_steps": failed_steps,
        }

        logger.info(
            "[TASK]: Docker resource cleanup complete. Removed %d images, reclaimed %s",
            images_removed,
            space_str,
        )

        return result

    except Exception as error:
        logger.error("[TASK]: Error during Docker resource cleanup: %s", error)
        logger.exception("Full traceback:")

        # Report to rollbar if available
        with contextlib.suppress(Exception):
            rollbar.report_exc_info()

        # Re-raise unchanged so Celery can handle it
        raise

0 commit comments

Comments
 (0)