Skip to content

Commit 3c3dfe3

Browse files
authored
Merge pull request #212 from bluesliverx/main
XENG-9472 Catch more cleanup errors and fix cache logging
2 parents eecbb38 + 3d651ad commit 3c3dfe3

File tree

5 files changed

+71
-53
lines changed

5 files changed

+71
-53
lines changed

buildrunner/docker/daemon.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,12 @@ def stop(self):
104104
self.log.write(
105105
f"Destroying Docker daemon container {self._daemon_container:.10}\n"
106106
)
107-
if self._daemon_container:
108-
self.docker_client.remove_container(
109-
self._daemon_container,
110-
force=True,
111-
v=True,
112-
)
107+
try:
108+
if self._daemon_container:
109+
self.docker_client.remove_container(
110+
self._daemon_container,
111+
force=True,
112+
v=True,
113+
)
114+
except Exception as _ex:
115+
self.log.write(f"Failed to remove Docker daemon container: {_ex}\n")

buildrunner/docker/runner.py

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -327,11 +327,16 @@ def cleanup(self):
327327
)
328328
if container_ids:
329329
for container_id in container_ids:
330-
self.docker_client.remove_container(
331-
container_id["Id"],
332-
force=True,
333-
v=True,
334-
)
330+
try:
331+
self.docker_client.remove_container(
332+
container_id["Id"],
333+
force=True,
334+
v=True,
335+
)
336+
except Exception as _ex:
337+
print(
338+
f'Unable to delete docker container with id "{container_id["Id"]}"'
339+
)
335340
else:
336341
print(
337342
f'Unable to find docker container with name or label "{container}"'
@@ -340,11 +345,16 @@ def cleanup(self):
340345
print(
341346
f'Unable to find docker container with name or label "{container}"'
342347
)
343-
self.docker_client.remove_container(
344-
self.container["Id"],
345-
force=True,
346-
v=True,
347-
)
348+
try:
349+
self.docker_client.remove_container(
350+
self.container["Id"],
351+
force=True,
352+
v=True,
353+
)
354+
except docker.errors.NotFound:
355+
print(
356+
f'Unable to delete docker container with id "{self.container["Id"]}"'
357+
)
348358

349359
self.container = None
350360

@@ -353,16 +363,16 @@ def _get_cache_file_from_prefix(
353363
logger: ContainerLogger, local_cache_archive_file: str, docker_path: str
354364
) -> Optional[str]:
355365
if os.path.exists(local_cache_archive_file):
356-
logger.write(
357-
f"Using cache {local_cache_archive_file} for destination path {docker_path}\n"
366+
logger.info(
367+
f"Using cache {local_cache_archive_file} for destination path {docker_path}"
358368
)
359369
return local_cache_archive_file
360370
cache_dir = os.path.dirname(local_cache_archive_file)
361371

362372
if not os.path.exists(cache_dir):
363-
logger.write(
373+
logger.info(
364374
f"Cache directory {cache_dir} does not exist, "
365-
f"skipping restore of archive {local_cache_archive_file}\n"
375+
f"skipping restore of archive {local_cache_archive_file}"
366376
)
367377
return None
368378

@@ -381,14 +391,14 @@ def _get_cache_file_from_prefix(
381391
local_cache_archive_match = curr_archive_file
382392

383393
if local_cache_archive_match is None:
384-
logger.write(
394+
logger.info(
385395
f"Not able to restore cache {docker_path} since "
386-
f"there was no matching prefix for `{local_cache_archive_file}`\n"
396+
f"there was no matching prefix for `{local_cache_archive_file}`"
387397
)
388398
return None
389-
logger.write(
399+
logger.info(
390400
f"Found cache {local_cache_archive_match} matching prefix {local_cache_archive_file} "
391-
f"for destination path {docker_path}\n"
401+
f"for destination path {docker_path}"
392402
)
393403

394404
return local_cache_archive_match
@@ -422,9 +432,9 @@ def restore_caches(self, logger: ContainerLogger, caches: OrderedDict) -> None:
422432
restored_cache_src = set()
423433
for local_cache_archive_file, docker_path in caches.items():
424434
if docker_path in restored_cache_src:
425-
logger.write(
435+
logger.info(
426436
f"Cache for destination path {docker_path} has already been matched and restored to the container, "
427-
f"skipping {local_cache_archive_file}\n"
437+
f"skipping {local_cache_archive_file}"
428438
)
429439
continue
430440

@@ -507,13 +517,13 @@ def write_cache_history_log(
507517
file_obj = acquire_flock_open_write_binary(
508518
lock_file=cache_history_log, logger=logger, mode="a"
509519
)
510-
logger.write(
520+
logger.info(
511521
f"File lock acquired. Attempting to write cache history log to {cache_history_log}"
512522
)
513523
file_obj.write(log_str)
514524
finally:
515525
release_flock(file_obj, logger)
516-
logger.write("Writing to cache history log completed. Released file lock.")
526+
logger.info("Writing to cache history log completed, released file lock")
517527

518528
def save_caches(
519529
self, logger: ContainerLogger, caches: OrderedDict, env_vars: dict = dict()
@@ -526,10 +536,10 @@ def save_caches(
526536
for local_cache_archive_file, docker_path in caches.items():
527537
if docker_path not in saved_cache_src:
528538
saved_cache_src.add(docker_path)
529-
logger.write(
530-
f"Saving cache `{docker_path}` "
539+
logger.info(
540+
f"Saving cache {docker_path} "
531541
f"running on container {self.container['Id']} "
532-
f"to local cache `{local_cache_archive_file}`\n"
542+
f"to local cache {local_cache_archive_file}"
533543
)
534544

535545
log_line = (
@@ -551,9 +561,7 @@ def save_caches(
551561
file_obj = acquire_flock_open_write_binary(
552562
lock_file=local_cache_archive_file, logger=logger
553563
)
554-
logger.write(
555-
"File lock acquired. Attempting to write to cache."
556-
)
564+
logger.info("File lock acquired. Attempting to write to cache.")
557565
self._write_cache(docker_path, file_obj)
558566
except Exception as e:
559567
raise BuildRunnerSavingCache(
@@ -564,11 +572,11 @@ def save_caches(
564572
assert tarfile.is_tarfile(
565573
local_cache_archive_file
566574
), f"Failed to create cache {local_cache_archive_file} tar file."
567-
logger.write("Writing to cache completed. Released file lock.")
575+
logger.info("Writing to cache completed. Released file lock.")
568576
else:
569-
logger.write(
577+
logger.info(
570578
f"The following `{docker_path}` in docker has already been saved. "
571-
f"It will not be saved again to `{local_cache_archive_file}`\n"
579+
f"It will not be saved again to `{local_cache_archive_file}`"
572580
)
573581

574582
def run(self, cmd, console=None, stream=True, log=None, workdir=None):

buildrunner/sshagent/__init__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -245,16 +245,13 @@ def stop(self):
245245
# kill ssh connection thread
246246
self.log.write("Closing ssh-agent container connection\n")
247247
if self._ssh_client:
248-
# pylint: disable=W0212
249248
if self._ssh_client._agent:
250249
try:
251250
self._ssh_client._agent.close()
252-
# pylint: disable=W0703
253251
except Exception as _ex:
254252
self.log.write(f"Error stopping ssh-agent: {_ex}\n")
255253
try:
256254
self._ssh_client.close()
257-
# pylint: disable=W0703
258255
except Exception as _ex:
259256
self.log.write(f"Error stopping ssh-agent connection: {_ex}\n")
260257

@@ -263,11 +260,14 @@ def stop(self):
263260
self.log.write(
264261
f"Destroying ssh-agent container {self._ssh_agent_container:.10}\n"
265262
)
266-
self.docker_client.remove_container(
267-
self._ssh_agent_container,
268-
force=True,
269-
v=True,
270-
)
263+
try:
264+
self.docker_client.remove_container(
265+
self._ssh_agent_container,
266+
force=True,
267+
v=True,
268+
)
269+
except Exception as _ex:
270+
self.log.write(f"Error destroying ssh-agent container: {_ex}\n")
271271

272272
def get_ssh_agent_image(self):
273273
"""

buildrunner/steprunner/tasks/run.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,10 @@ def __del__(self):
9292
self.step_runner.log.info(
9393
f"Deleting network {self.step_runner.network_name}"
9494
)
95-
self._docker_client.remove_network(self.step_runner.network_name)
95+
try:
96+
self._docker_client.remove_network(self.step_runner.network_name)
97+
except Exception as _ex:
98+
self.step_runner.log.error(f"Error removing network: {_ex}")
9699

97100
def _get_source_container(self):
98101
"""
@@ -1119,9 +1122,15 @@ def run(self, context: dict): # pylint: disable=too-many-statements,too-many-br
11191122
"Skipping cache save due to failed exit code"
11201123
)
11211124
else:
1122-
self.runner.save_caches(
1123-
container_meta_logger, caches, container_args.get("environment")
1124-
)
1125+
try:
1126+
self.runner.save_caches(
1127+
container_meta_logger, caches, container_args.get("environment")
1128+
)
1129+
except Exception as _ex:
1130+
container_meta_logger.error(
1131+
f"Error saving caches, ignoring: {_ex}",
1132+
)
1133+
# Failing to save caches should not fail the build, just continue as normal
11251134

11261135
finally:
11271136
if self.runner:

buildrunner/utils.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,7 @@ def get_lock(file_obj: io.IOBase):
242242
f"PID:{pid} failed to acquire file lock for {lock_file} after timeout of {timeout_seconds} seconds"
243243
)
244244

245-
logger.info(
246-
f"PID:{pid} file opened and lock acquired for {lock_file} ({lock_file_obj})"
247-
)
245+
logger.info(f"PID:{pid} file opened and lock acquired for {lock_file}")
248246

249247
return lock_file_obj
250248

@@ -308,4 +306,4 @@ def release_flock(
308306
return
309307
portalocker.unlock(lock_file_obj)
310308
lock_file_obj.close()
311-
logger.write(f"PID:{os.getpid()} released and closed file {lock_file_obj}")
309+
logger.info(f"PID:{os.getpid()} released and closed file {lock_file_obj.name}")

0 commit comments

Comments
 (0)