Skip to content

Commit 441ae4c

Browse files
committed
merge
2 parents d3ae98f + 71f2d29 commit 441ae4c

File tree

7 files changed

+106
-31
lines changed

7 files changed

+106
-31
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ pip install git+https://github.com/Tongjilibo/torch4keras.git
6767
## 4. 版本历史
6868
|更新日期| 版本 | 版本说明 |
6969
|------| ----------------- |----------- |
70+
|20240204|v0.1.9 | 增加Timeit, Timeit2, timeit等时间/速度监控|
7071
|20240116|v0.1.8 | 重新整理snippets, 重写save_pretrained|
7172
|20231219|v0.1.7 | 增加SimpleStreamFileLogger和LoggerHandler, 修改Logger的格式|
7273
|20231208|v0.1.6.post2 |监控fit过程,有报错则发送邮件提醒; 解决torch2.0的compile冲突问题; 修复clip_grad_norm的bug|

docs/History.md

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
## 更新历史
22

3+
- **20240204**: 增加Timeit, Timeit2, timeit等时间/速度监控
34
- **20240116**: 重新整理snippets, 重写save_pretrained
45
- **20231219**: 增加SimpleStreamFileLogger和LoggerHandler, 修改Logger的格式
56
- **20231208**: 监控fit过程,有报错则发送邮件提醒; 解决torch2.0的compile冲突问题

test/test_time.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from torch4keras.snippets import timeit, Timeit
1+
from torch4keras.snippets import timeit, Timeit, Timeit2
22
import time
33

44

@@ -13,19 +13,30 @@ def func(n=10):
1313
with Timeit() as ti:
1414
for i in range(10):
1515
time.sleep(0.1)
16-
ti.lap(prefix=i, restart=False) # 统计累计耗时
16+
ti.lap(name=i, reset=False) # 统计累计耗时
1717

1818
# 上下文管理器 - 统计每段速度
1919
with Timeit() as ti:
2020
for i in range(10):
2121
time.sleep(0.1)
22-
ti.lap(count=10, prefix=i, restart=True)
22+
ti.lap(count=10, name=i, reset=True)
2323
ti(10) # 统计速度
2424

2525

2626
# 上下文管理器 - 统计速度
2727
with Timeit() as ti:
2828
for i in range(10):
2929
time.sleep(0.1)
30-
ti.lap(prefix=i, restart=True)
31-
ti(10) # 统计速度
30+
ti.lap(name=i, reset=True)
31+
ti(10) # 统计速度
32+
33+
ti = Timeit2()
34+
for i in range(10):
35+
time.sleep(0.1)
36+
ti.lap(name=i)
37+
38+
for i in range(10):
39+
time.sleep(0.1)
40+
ti.lap(name=i)
41+
ti.end() # 打印时长
42+

torch4keras/snippets/log.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ def log_info(string:str, verbose:int=1):
6262
return res
6363

6464

65+
@functools.lru_cache(None)
66+
def log_info_once(string:str, verbose=1):
67+
''' 单次warning '''
68+
return log_info(string, verbose)
69+
70+
6571
def log_warn(string:str, verbose:int=1):
6672
'''[WARNING]: message, 黄色前缀'''
6773
res = colorful('[WARNING]', color='yellow') + ' ' + string.strip()
@@ -70,6 +76,12 @@ def log_warn(string:str, verbose:int=1):
7076
return res
7177

7278

79+
@functools.lru_cache(None)
80+
def log_warn_once(string:str, verbose=1):
81+
''' 单次warning '''
82+
return log_warn(string, verbose)
83+
84+
7385
def log_error(string:str, verbose:int=1):
7486
'''[ERROR]: message, 红色前缀'''
7587
res = colorful('[ERROR]', color='red') + ' ' + string.strip()
@@ -79,9 +91,9 @@ def log_error(string:str, verbose:int=1):
7991

8092

8193
@functools.lru_cache(None)
82-
def log_warn_once(string:str, verbose=1):
94+
def log_error_once(string:str, verbose=1):
8395
''' 单次warning '''
84-
return log_warn(string, verbose)
96+
return log_error(string, verbose)
8597

8698

8799
@functools.lru_cache(None)

torch4keras/snippets/misc.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os
66
import random
77
from .log import log_info, log_warn, log_error
8+
import json
89

910

1011
def seed_everything(seed:int=None):
@@ -77,7 +78,6 @@ def allowDotting(self, state=True):
7778
class JsonConfig:
7879
'''读取配置文件并返回可.操作符的字典'''
7980
def __new__(self, json_path, encoding='utf-8'):
80-
import json
8181
return DottableDict(json.load(open(json_path, "r", encoding=encoding)))
8282

8383

torch4keras/snippets/monitor.py

+67-14
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import copy
66
import functools
77
from .log import log_info, log_warn, log_error
8+
from pprint import pprint
89

910

1011
def format_time(eta, hhmmss=True):
@@ -61,56 +62,108 @@ class Timeit:
6162
with Timeit() as ti:
6263
for i in range(10):
6364
time.sleep(0.1)
64-
# ti.lap(prefix=i, restart=False) # 统计累计耗时
65-
# ti.lap(prefix=i, restart=True) # 统计间隔耗时
66-
# ti.lap(count=10, prefix=i, restart=True) # 统计每段速度
65+
# ti.lap(name=i, restart=False) # 统计累计耗时
66+
# ti.lap(name=i, restart=True) # 统计间隔耗时
67+
# ti.lap(count=10, name=i, restart=True) # 统计每段速度
6768
# ti(10) # 统计速度
6869
'''
69-
def __enter__(self):
70+
def __enter__(self, template='Average speed: {:.2f}/s'):
7071
self.count = None
7172
self.start_tm = time.time()
72-
self.template = 'Average speed: {:.2f}/s'
73+
self.template = template
7374
return self
7475

7576
def __call__(self, count):
7677
self.count = count
7778

78-
def restart(self):
79+
def reset(self):
7980
'''自定义开始记录的地方'''
8081
self.start_tm = time.time()
8182

82-
def lap(self, count:int=None, prefix:str=None, restart=False):
83+
def lap(self, name:str=None, count:int=None, reset=False):
8384
'''
85+
:params name: 打印时候自定义的前缀
8486
:params count: 需要计算平均生成速度中统计的次数
85-
:params prefix: 打印时候自定义的前缀
86-
:params restart: 是否重置start_tm, True只记录时间间隔,否则记录的是从一开始的累计时间
87+
:params reset: 是否重置start_tm, True只记录时间间隔,否则记录的是从一开始的累计时间
8788
'''
8889
if count is not None:
8990
self.count = count
90-
prefix = '' if prefix is None else str(prefix).strip() + ' - '
91+
name = '' if name is None else str(name).strip() + ' - '
9192

9293
end_tm = time.time()
9394
consume = end_tm - self.start_tm
9495
if self.count is None:
96+
# 只log时间
9597
consume = format_time(consume, hhmmss=False)
9698
start1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.start_tm))
9799
end1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_tm))
98-
log_info(prefix + f'Cost {consume} [{start1} < {end1}]')
100+
log_info(name + f'Cost {consume} [{start1} < {end1}]')
99101
elif consume > 0:
100102
speed = self.count / consume
101-
log_info(prefix + self.template.format(speed))
103+
log_info(name + self.template.format(speed))
102104
else:
103105
pass
104106
# log_warn('Time duration = 0')
105107

106-
if restart:
107-
self.restart()
108+
if reset:
109+
self.reset()
108110

109111
def __exit__(self, exc_type, exc_val, exc_tb):
110112
self.lap()
111113
print()
112114

113115

116+
class Timeit2:
117+
'''记录耗时
118+
119+
Example
120+
----------------------
121+
ti = Timeit2()
122+
for i in range(10):
123+
time.sleep(0.1)
124+
ti.lap(name=i)
125+
ti.end() # 打印各个步骤时长
126+
'''
127+
def __init__(self):
128+
self.reset()
129+
130+
def __call__(self, *args, **kwargs):
131+
self.lap(*args, **kwargs)
132+
133+
def reset(self):
134+
'''自定义开始记录的地方'''
135+
self.cost = dict()
136+
self.count = dict()
137+
self.start_tm = time.time()
138+
139+
def restart(self):
140+
self.start_tm = time.time()
141+
142+
def lap(self, name:str):
143+
'''
144+
:params name: 打印时候自定义的前缀
145+
'''
146+
end_tm = time.time()
147+
consume = end_tm - self.start_tm
148+
name = str(name)
149+
self.cost[name] = self.cost.get(name, 0) + consume
150+
self.count[name] = self.count.get(name, 0) + 1
151+
self.start_tm = time.time()
152+
153+
def end(self, verbose=1):
154+
for k, v in self.count.items():
155+
if v > 1:
156+
self.cost['avg_' + k] = self.cost[k] / v
157+
158+
if verbose > 0:
159+
log_info('Cost detail')
160+
pprint(self.cost)
161+
print()
162+
163+
self.reset()
164+
return self.cost
165+
166+
114167
def send_email(mail_receivers:Union[str,list], mail_subject:str, mail_msg:str="", mail_host:str=None,
115168
mail_user:str=None, mail_pwd:str=None, mail_sender:str=None):
116169
''' 发送邮件(默认使用笔者自己注册的邮箱,若含敏感信息请使用自己注册的邮箱)

torch4keras/trainer.py

+6-9
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,6 @@ def fit(self, train_dataloader, steps_per_epoch=None, epochs=1, callbacks=None,
363363
# forward和backward
364364
if not self.unwrap_model().training:
365365
self.unwrap_model().train() # 设置为train模式
366-
367366
tr_loss, tr_loss_detail = 0, {}
368367
for _ in range(self.grad_accumulation_steps):
369368
train_X, train_y = self._prepare_nextbatch() # 获取下一个batch的训练数据
@@ -562,23 +561,21 @@ def save_to_checkpoint(self, save_dir:str=None, model_path:str=None, optimizer_p
562561
:param mapping: dict, 模型文件的mapping
563562
:param trainable_only
564563
'''
565-
model_path = model_path or os.path.join(save_dir, 'model.pt')
566-
optimizer_path = optimizer_path or os.path.join(save_dir, 'optimizer.pt')
567-
scheduler_path = scheduler_path or os.path.join(save_dir, 'scheduler.pt')
568-
steps_params_path = steps_params_path or os.path.join(save_dir, 'steps_params.pt')
564+
model_path = model_path or os.path.join(save_dir or './', 'model.pt')
565+
optimizer_path = optimizer_path or os.path.join(save_dir or './', 'optimizer.pt')
566+
scheduler_path = scheduler_path or os.path.join(save_dir or './', 'scheduler.pt')
567+
steps_params_path = steps_params_path or os.path.join(save_dir or './', 'steps_params.pt')
569568

570569
verbose_str = ''
571570
if model_path:
572571
self.save_weights(model_path, mapping=mapping, trainable_only=trainable_only)
573572
verbose_str += f'Model weights successfuly saved to {model_path}\n'
574573
if optimizer_path:
575-
save_dir = os.path.dirname(optimizer_path)
576-
os.makedirs(save_dir, exist_ok=True)
574+
os.makedirs(os.path.dirname(optimizer_path), exist_ok=True)
577575
torch.save(self.optimizer.state_dict(), optimizer_path)
578576
verbose_str += f'Optimizer successfuly saved to {optimizer_path}\n'
579577
if scheduler_path and (self.scheduler is not None):
580-
save_dir = os.path.dirname(scheduler_path)
581-
os.makedirs(save_dir, exist_ok=True)
578+
os.makedirs(os.path.dirname(scheduler_path), exist_ok=True)
582579
torch.save(self.scheduler.state_dict(), scheduler_path)
583580
verbose_str += f'Scheduler successfuly saved to {scheduler_path}\n'
584581
if steps_params_path:

0 commit comments

Comments
 (0)