
Commit 18ce42e

Add start with eval option (#84)
* Add start with eval option
* Ping for training run
* Drop p3.6 from CI
* Turn off telemetry on CI
* Bump up to v0.0.18
1 parent 0a23eba commit 18ce42e

File tree: 6 files changed, +33 -9 lines changed


.github/workflows/tests.yml

Lines changed: 4 additions & 1 deletion
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -31,6 +31,9 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
        run: cat /etc/os-release
+      - name: Telemetry off
+        run: |
+          export TRAINER_TELEMETRY=0
      - name: Install dependencies
        run: |
          sudo apt-get update
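
The CI change drops Python 3.6 from the test matrix and sets TRAINER_TELEMETRY=0 so test runs do not ping the telemetry endpoint. A minimal sketch of the same opt-out outside CI: the new trainer.analytics module (shown below) reads the variable once at import time, so it has to be set before anything from the trainer package is imported.

import os

# Opt out of telemetry before the trainer package is imported;
# trainer.analytics reads TRAINER_TELEMETRY at module import time.
os.environ["TRAINER_TELEMETRY"] = "0"

from trainer.analytics import ping_training_run  # noqa: E402  (import after env setup)

ping_training_run()  # returns early; no HTTP request is sent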

trainer/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-v0.0.17
+v0.0.18

trainer/analytics.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+import os
+
+import requests
+
+telemetry = os.environ.get("TRAINER_TELEMETRY")
+
+
+def ping_training_run():
+    if telemetry == "0":
+        return
+    URL = "https://coqui.gateway.scarf.sh/trainer/training_run"
+    _ = requests.get(URL)
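
A hypothetical test sketch (not part of this commit) for the new module: it patches requests.get so no real request reaches the Scarf gateway, and patches the module-level telemetry flag to cover both the opt-out and default paths.

from unittest import mock

import trainer.analytics as analytics


def test_ping_skipped_when_opted_out():
    # telemetry is read at import time, so patch the module attribute directly.
    with mock.patch.object(analytics, "telemetry", "0"), mock.patch(
        "trainer.analytics.requests.get"
    ) as fake_get:
        analytics.ping_training_run()
        fake_get.assert_not_called()


def test_ping_sent_by_default():
    with mock.patch.object(analytics, "telemetry", None), mock.patch(
        "trainer.analytics.requests.get"
    ) as fake_get:
        analytics.ping_training_run()
        fake_get.assert_called_once()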

trainer/callbacks.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from typing import Dict, Callable
+from typing import Callable, Dict
 
 
 class TrainerCallback:

trainer/trainer.py

Lines changed: 10 additions & 2 deletions
@@ -19,6 +19,7 @@
 from torch.nn.parallel import DistributedDataParallel as DDP_th
 from torch.utils.data import DataLoader
 
+from trainer.analytics import ping_training_run
 from trainer.callbacks import TrainerCallback
 from trainer.generic_utils import (
     KeepAverage,
@@ -241,6 +242,10 @@ class TrainerArgs(Coqpit):
         default=False,
         metadata={"help": "Skip training and only run evaluation and test."},
     )
+    start_with_eval: bool = field(
+        default=False,
+        metadata={"help": "Start with evaluation and test."},
+    )
     small_run: int = field(
         default=None,
         metadata={
@@ -388,6 +393,7 @@ def __init__(  # pylint: disable=dangerous-default-value
         self.grad_accum_steps = args.grad_accum_steps
         self.overfit_batch = args.overfit_batch
         self.skip_train_epoch = args.skip_train_epoch
+        self.start_with_eval = args.start_with_eval
 
         assert self.grad_accum_steps > 0, " [!] grad_accum_steps must be greater than 0."
 
@@ -519,6 +525,7 @@ def __init__(  # pylint: disable=dangerous-default-value
         self.callbacks.on_init_end(self)
         self.dashboard_logger.add_config(config)
         self.save_training_script()
+        ping_training_run()
 
     def save_training_script(self):
         """Save the training script to tracking dashboard and output path."""
@@ -1519,7 +1526,7 @@ def _fit(self) -> None:
             self.keep_avg_eval = KeepAverage() if self.config.run_eval else None
             self.epochs_done = epoch
             self.c_logger.print_epoch_start(epoch, self.config.epochs, self.output_path)
-            if not self.skip_train_epoch:
+            if not self.skip_train_epoch and not self.start_with_eval:
                 self.train_epoch()
             if self.config.run_eval:
                 self.eval_epoch()
@@ -1532,6 +1539,7 @@ def _fit(self) -> None:
            if self.args.rank in [None, 0]:
                self.save_best_model()
            self.callbacks.on_epoch_end(self)
+            self.start_with_eval = False
 
    def fit_with_largest_batch_size(self, starting_batch_size=2048) -> None:
        cuda_meminfo()
@@ -1552,7 +1560,7 @@ def fit_with_largest_batch_size(self, starting_batch_size=2048) -> None:
                torch.cuda.empty_cache()
            else:
                raise
-        except Exception as exception:  #pylint: disable=broad-except
+        except Exception as exception:  # pylint: disable=broad-except
            # catches the torch.cuda.OutOfMemoryError
            if bs > 1 and should_reduce_batch_size(exception):
                bs //= 2
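
A minimal usage sketch of the new flag. The config, output path, model, and sample lists are assumed to be defined elsewhere in the user's project, and the constructor call follows the Trainer API as commonly used with this package; treat the exact signature as an assumption. With start_with_eval=True, the first pass through the _fit loop skips train_epoch() and goes straight to evaluation and test; the flag is then cleared at the end of that epoch, so later epochs train normally.

from trainer import Trainer, TrainerArgs

# start_with_eval=True: the first epoch skips training and runs eval/test only.
args = TrainerArgs(start_with_eval=True)

trainer = Trainer(
    args,
    config,             # assumed: a Coqpit training config defined elsewhere
    output_path,        # assumed: where checkpoints and logs are written
    model=model,        # assumed: a model implementing the trainer's interface
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()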

trainer/utils/cpu_memory.py

Lines changed: 5 additions & 4 deletions
@@ -8,8 +8,9 @@ def get_available_cpu_memory():
     available_memory = psutil.virtual_memory().available
 
     try:
-        import resource # pylint: disable=import-outside-toplevel
-        _, hard_mem_limit = resource.getrlimit(resource.RLIMIT_AS) #pylint: disable=unused-variable
+        import resource  # pylint: disable=import-outside-toplevel
+
+        _, hard_mem_limit = resource.getrlimit(resource.RLIMIT_AS)  # pylint: disable=unused-variable
         if hard_mem_limit != resource.RLIM_INFINITY:
             used_memory = this_process.memory_info().vms
             available_memory = min(hard_mem_limit - used_memory, available_memory)
@@ -21,9 +22,9 @@ def get_available_cpu_memory():
 
 def set_cpu_memory_limit(num_gigabytes):
     try:
-        import resource # pylint: disable=import-outside-toplevel
+        import resource  # pylint: disable=import-outside-toplevel
 
-        num_bytes = int(num_gigabytes * 2 ** 30)
+        num_bytes = int(num_gigabytes * 2**30)
         _, hard_limit = resource.getrlimit(resource.RLIMIT_AS)
         if hard_limit != resource.RLIM_INFINITY:
             hard_limit = min(num_bytes, hard_limit)
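
The cpu_memory changes are formatting only (Black-style spacing); the helpers behave as before. A small usage sketch, assuming a POSIX system where the resource module is available: one gigabyte is taken as 2**30 bytes, so set_cpu_memory_limit(4) requests a 4 * 2**30 = 4,294,967,296-byte cap on RLIMIT_AS.

from trainer.utils.cpu_memory import get_available_cpu_memory, set_cpu_memory_limit

# Cap this process's address space at 4 GiB (the existing hard limit is kept
# if it is already lower), then report what is still available.
set_cpu_memory_limit(4)
print(f"available CPU memory: {get_available_cpu_memory() / 2**30:.2f} GiB")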
