Fix training pipeline and cleanup logs (#142)

Phil26AT · web-flow · commit 893fbdabf44c · 2025-07-07T23:08:55.000+02:00
* Fix OmegaConf resolve

* Fix bug with benchmark logging in train.py

* Fix logging errors in train.py

* Fix cleanup of log files

* Add log cleanup interval option (default 2 minutes) to train.py

* Fix formatting

* Black formatting
diff --git a/gluefactory/train.py b/gluefactory/train.py
@@ -213,7 +213,7 @@ def write_image_summaries(writer, name, figures, step):
             for k, fig in figs.items():
                 writer.add_figure(f"{name}/{i}_{k}", fig, step)
     else:
-        for k, fig in figs.items():
+        for k, fig in figures.items():
             writer.add_figure(f"{name}/{k}", fig, step)
 
 
@@ -414,16 +414,19 @@ def trace_handler(p):
         ):
             for bname, eval_conf in conf.get("benchmarks", {}).items():
                 logger.info(f"Running eval on {bname}")
-                results, figures, _ = run_benchmark(
+                summaries, figures, _ = run_benchmark(
                     bname,
                     eval_conf,
                     settings.EVAL_PATH / bname / args.experiment / str(epoch),
                     model.eval(),
                 )
-                logger.info(str(results))
-                write_dict_summaries(writer, f"test/{bname}", results, epoch)
+                str_summaries = [
+                    f"{k} {v:.3E}" for k, v in summaries.items() if isinstance(v, float)
+                ]
+                logger.info(f'[{bname}] {{{", ".join(str_summaries)}}}')
+                write_dict_summaries(writer, f"test/{bname}", summaries, epoch)
                 write_image_summaries(writer, f"figures/{bname}", figures, epoch)
-                del results, figures
+                del summaries, figures
 
         # set the seed
         set_seed(conf.train.seed + epoch)
@@ -572,7 +575,7 @@ def trace_handler(p):
                         loss_fn,
                         conf.train,
                         rank,
-                        pbar=(rank == -1),
+                        pbar=(rank == 0),
                     )
 
                 if rank == 0:
@@ -615,7 +618,7 @@ def trace_handler(p):
                         loss_fn,
                         conf.train,
                         rank,
-                        pbar=(rank == -1),
+                        pbar=(rank == 0),
                     )
                     best_eval = results[conf.train.best_key]
                 best_eval = save_experiment(
@@ -659,7 +662,9 @@ def trace_handler(p):
 
 def main_worker(rank, conf, output_dir, args):
     if rank == 0:
-        with capture_outputs(output_dir / "log.txt"):
+        with capture_outputs(
+            output_dir / "log.txt", cleanup_interval=args.cleanup_interval
+        ):
             training(rank, conf, output_dir, args)
     else:
         training(rank, conf, output_dir, args)
@@ -682,6 +687,11 @@ def main_worker(rank, conf, output_dir, args):
         type=str,
         choices=["default", "reduce-overhead", "max-autotune"],
     )
+    parser.add_argument(
+        "--cleanup_interval",
+        default=120,  # Cleanup log files every 120 seconds.
+        type=int,
+    )
     parser.add_argument("--overfit", action="store_true")
     parser.add_argument("--restore", action="store_true")
     parser.add_argument("--distributed", action="store_true")
@@ -700,7 +710,9 @@ def main_worker(rank, conf, output_dir, args):
 
     conf = OmegaConf.from_cli(args.dotlist)
     if args.conf:
-        conf = OmegaConf.merge(OmegaConf.resolve(OmegaConf.load(args.conf)), conf)
+        yaml_conf = OmegaConf.load(args.conf)
+        OmegaConf.resolve(yaml_conf)
+        conf = OmegaConf.merge(yaml_conf, conf)
     elif args.restore:
         restore_conf = OmegaConf.load(output_dir / "config.yaml")
         conf = OmegaConf.merge(restore_conf, conf)
diff --git a/gluefactory/utils/stdout_capturing.py b/gluefactory/utils/stdout_capturing.py
@@ -11,7 +11,6 @@
 import subprocess
 import sys
 from contextlib import contextmanager
-from threading import Timer
 
 
 def apply_backspaces_and_linefeeds(text):
@@ -61,14 +60,36 @@ def flush():
         pass  # unsupported
 
 
+def cleanup(filename):
+    with open(str(filename), "r", newline="") as target:
+        text = target.read()
+    text = apply_backspaces_and_linefeeds(text)
+    with open(str(filename), "w") as target:
+        target.write(text)
+
+
 # Duplicate stdout and stderr to a file. Inspired by:
 # http://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/
 # http://stackoverflow.com/a/651718/1388435
 # http://stackoverflow.com/a/22434262/1388435
 @contextmanager
-def capture_outputs(filename):
+def capture_outputs(filename, cleanup_interval=None):
     """Duplicate stdout and stderr to a file on the file descriptor level."""
-    with open(str(filename), "a+") as target:
+
+    if cleanup_interval is not None:
+        from threading import Timer
+
+        class RepeatTimer(Timer):
+            def run(self):
+                while not self.finished.wait(self.interval):
+                    self.function(*self.args, **self.kwargs)
+
+        timer = RepeatTimer(cleanup_interval, lambda: cleanup(filename))
+        timer.start()
+    else:
+        timer = None
+
+    with open(str(filename), mode="a+", newline="") as target:
         original_stdout_fd = 1
         original_stderr_fd = 2
         target_fd = target.fileno()
@@ -109,26 +130,12 @@ def capture_outputs(filename):
             os.dup2(saved_stdout_fd, original_stdout_fd)
             os.dup2(saved_stderr_fd, original_stderr_fd)
 
-            # wait for completion of the tee processes with timeout
-            # implemented using a timer because timeout support is py3 only
-            def kill_tees():
-                tee_stdout.kill()
-                tee_stderr.kill()
-
-            tee_timer = Timer(1, kill_tees)
-            try:
-                tee_timer.start()
-                tee_stdout.wait()
-                tee_stderr.wait()
-            finally:
-                tee_timer.cancel()
-
+            tee_stdout.wait(timeout=1)
+            tee_stderr.wait(timeout=1)
             os.close(saved_stdout_fd)
             os.close(saved_stderr_fd)
 
-    # Cleanup log file
-    with open(str(filename), "r") as target:
-        text = target.read()
-    text = apply_backspaces_and_linefeeds(text)
-    with open(str(filename), "w") as target:
-        target.write(text)
+            if timer is not None:
+                timer.cancel()
+
+            cleanup(filename)