Skip to content

Commit a4cd2c3

Browse files
author
saville
committed
XENG-9472 Catch more cleanup errors and fix cache logging
1 parent eecbb38 commit a4cd2c3

File tree

5 files changed

+61
-48
lines changed

5 files changed

+61
-48
lines changed

buildrunner/docker/daemon.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,12 @@ def stop(self):
104104
self.log.write(
105105
f"Destroying Docker daemon container {self._daemon_container:.10}\n"
106106
)
107-
if self._daemon_container:
108-
self.docker_client.remove_container(
109-
self._daemon_container,
110-
force=True,
111-
v=True,
112-
)
107+
try:
108+
if self._daemon_container:
109+
self.docker_client.remove_container(
110+
self._daemon_container,
111+
force=True,
112+
v=True,
113+
)
114+
except Exception as _ex:
115+
self.log.write(f"Failed to remove Docker daemon container: {_ex}\n")

buildrunner/docker/runner.py

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -340,11 +340,16 @@ def cleanup(self):
340340
print(
341341
f'Unable to find docker container with name or label "{container}"'
342342
)
343-
self.docker_client.remove_container(
344-
self.container["Id"],
345-
force=True,
346-
v=True,
347-
)
343+
try:
344+
self.docker_client.remove_container(
345+
self.container["Id"],
346+
force=True,
347+
v=True,
348+
)
349+
except docker.errors.NotFound:
350+
print(
351+
f'Unable to find docker container with id "{self.container["Id"]}"'
352+
)
348353

349354
self.container = None
350355

@@ -353,16 +358,16 @@ def _get_cache_file_from_prefix(
353358
logger: ContainerLogger, local_cache_archive_file: str, docker_path: str
354359
) -> Optional[str]:
355360
if os.path.exists(local_cache_archive_file):
356-
logger.write(
357-
f"Using cache {local_cache_archive_file} for destination path {docker_path}\n"
361+
logger.info(
362+
f"Using cache {local_cache_archive_file} for destination path {docker_path}"
358363
)
359364
return local_cache_archive_file
360365
cache_dir = os.path.dirname(local_cache_archive_file)
361366

362367
if not os.path.exists(cache_dir):
363-
logger.write(
368+
logger.info(
364369
f"Cache directory {cache_dir} does not exist, "
365-
f"skipping restore of archive {local_cache_archive_file}\n"
370+
f"skipping restore of archive {local_cache_archive_file}"
366371
)
367372
return None
368373

@@ -381,14 +386,14 @@ def _get_cache_file_from_prefix(
381386
local_cache_archive_match = curr_archive_file
382387

383388
if local_cache_archive_match is None:
384-
logger.write(
389+
logger.info(
385390
f"Not able to restore cache {docker_path} since "
386-
f"there was no matching prefix for `{local_cache_archive_file}`\n"
391+
f"there was no matching prefix for `{local_cache_archive_file}`"
387392
)
388393
return None
389-
logger.write(
394+
logger.info(
390395
f"Found cache {local_cache_archive_match} matching prefix {local_cache_archive_file} "
391-
f"for destination path {docker_path}\n"
396+
f"for destination path {docker_path}"
392397
)
393398

394399
return local_cache_archive_match
@@ -422,9 +427,9 @@ def restore_caches(self, logger: ContainerLogger, caches: OrderedDict) -> None:
422427
restored_cache_src = set()
423428
for local_cache_archive_file, docker_path in caches.items():
424429
if docker_path in restored_cache_src:
425-
logger.write(
430+
logger.info(
426431
f"Cache for destination path {docker_path} has already been matched and restored to the container, "
427-
f"skipping {local_cache_archive_file}\n"
432+
f"skipping {local_cache_archive_file}"
428433
)
429434
continue
430435

@@ -507,13 +512,13 @@ def write_cache_history_log(
507512
file_obj = acquire_flock_open_write_binary(
508513
lock_file=cache_history_log, logger=logger, mode="a"
509514
)
510-
logger.write(
515+
logger.info(
511516
f"File lock acquired. Attempting to write cache history log to {cache_history_log}"
512517
)
513518
file_obj.write(log_str)
514519
finally:
515520
release_flock(file_obj, logger)
516-
logger.write("Writing to cache history log completed. Released file lock.")
521+
logger.info("Writing to cache history log completed, released file lock")
517522

518523
def save_caches(
519524
self, logger: ContainerLogger, caches: OrderedDict, env_vars: dict = dict()
@@ -526,10 +531,10 @@ def save_caches(
526531
for local_cache_archive_file, docker_path in caches.items():
527532
if docker_path not in saved_cache_src:
528533
saved_cache_src.add(docker_path)
529-
logger.write(
530-
f"Saving cache `{docker_path}` "
534+
logger.info(
535+
f"Saving cache {docker_path} "
531536
f"running on container {self.container['Id']} "
532-
f"to local cache `{local_cache_archive_file}`\n"
537+
f"to local cache {local_cache_archive_file}"
533538
)
534539

535540
log_line = (
@@ -551,9 +556,7 @@ def save_caches(
551556
file_obj = acquire_flock_open_write_binary(
552557
lock_file=local_cache_archive_file, logger=logger
553558
)
554-
logger.write(
555-
"File lock acquired. Attempting to write to cache."
556-
)
559+
logger.info("File lock acquired. Attempting to write to cache.")
557560
self._write_cache(docker_path, file_obj)
558561
except Exception as e:
559562
raise BuildRunnerSavingCache(
@@ -564,11 +567,11 @@ def save_caches(
564567
assert tarfile.is_tarfile(
565568
local_cache_archive_file
566569
), f"Failed to create cache {local_cache_archive_file} tar file."
567-
logger.write("Writing to cache completed. Released file lock.")
570+
logger.info("Writing to cache completed. Released file lock.")
568571
else:
569-
logger.write(
572+
logger.info(
570573
f"The following `{docker_path}` in docker has already been saved. "
571-
f"It will not be saved again to `{local_cache_archive_file}`\n"
574+
f"It will not be saved again to `{local_cache_archive_file}`"
572575
)
573576

574577
def run(self, cmd, console=None, stream=True, log=None, workdir=None):

buildrunner/sshagent/__init__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -245,16 +245,13 @@ def stop(self):
245245
# kill ssh connection thread
246246
self.log.write("Closing ssh-agent container connection\n")
247247
if self._ssh_client:
248-
# pylint: disable=W0212
249248
if self._ssh_client._agent:
250249
try:
251250
self._ssh_client._agent.close()
252-
# pylint: disable=W0703
253251
except Exception as _ex:
254252
self.log.write(f"Error stopping ssh-agent: {_ex}\n")
255253
try:
256254
self._ssh_client.close()
257-
# pylint: disable=W0703
258255
except Exception as _ex:
259256
self.log.write(f"Error stopping ssh-agent connection: {_ex}\n")
260257

@@ -263,11 +260,14 @@ def stop(self):
263260
self.log.write(
264261
f"Destroying ssh-agent container {self._ssh_agent_container:.10}\n"
265262
)
266-
self.docker_client.remove_container(
267-
self._ssh_agent_container,
268-
force=True,
269-
v=True,
270-
)
263+
try:
264+
self.docker_client.remove_container(
265+
self._ssh_agent_container,
266+
force=True,
267+
v=True,
268+
)
269+
except Exception as _ex:
270+
self.log.write(f"Error destroying ssh-agent container: {_ex}\n")
271271

272272
def get_ssh_agent_image(self):
273273
"""

buildrunner/steprunner/tasks/run.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,10 @@ def __del__(self):
9292
self.step_runner.log.info(
9393
f"Deleting network {self.step_runner.network_name}"
9494
)
95-
self._docker_client.remove_network(self.step_runner.network_name)
95+
try:
96+
self._docker_client.remove_network(self.step_runner.network_name)
97+
except Exception as _ex:
98+
self.step_runner.log.error(f"Error removing network: {_ex}")
9699

97100
def _get_source_container(self):
98101
"""
@@ -1119,9 +1122,15 @@ def run(self, context: dict): # pylint: disable=too-many-statements,too-many-br
11191122
"Skipping cache save due to failed exit code"
11201123
)
11211124
else:
1122-
self.runner.save_caches(
1123-
container_meta_logger, caches, container_args.get("environment")
1124-
)
1125+
try:
1126+
self.runner.save_caches(
1127+
container_meta_logger, caches, container_args.get("environment")
1128+
)
1129+
except Exception as _ex:
1130+
container_meta_logger.error(
1131+
f"Error saving caches, ignoring: {_ex}",
1132+
)
1133+
# Failing to save caches should not fail the build, just continue as normal
11251134

11261135
finally:
11271136
if self.runner:

buildrunner/utils.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,7 @@ def get_lock(file_obj: io.IOBase):
242242
f"PID:{pid} failed to acquire file lock for {lock_file} after timeout of {timeout_seconds} seconds"
243243
)
244244

245-
logger.info(
246-
f"PID:{pid} file opened and lock acquired for {lock_file} ({lock_file_obj})"
247-
)
245+
logger.info(f"PID:{pid} file opened and lock acquired for {lock_file}")
248246

249247
return lock_file_obj
250248

@@ -308,4 +306,4 @@ def release_flock(
308306
return
309307
portalocker.unlock(lock_file_obj)
310308
lock_file_obj.close()
311-
logger.write(f"PID:{os.getpid()} released and closed file {lock_file_obj}")
309+
logger.info(f"PID:{os.getpid()} released and closed file {lock_file_obj.name}")

0 commit comments

Comments
 (0)