deepmodeling
diff --git a/unimol_tools/unimol_tools/__init__.py b/unimol_tools/unimol_tools/__init__.py
Lines changed: 2 additions & 2 deletions
diff --git a/unimol_tools/unimol_tools/config/__init__.py b/unimol_tools/unimol_tools/config/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/unimol_tools/unimol_tools/config/default.yaml b/unimol_tools/unimol_tools/config/default.yaml
Lines changed: 4 additions & 2 deletions
diff --git a/unimol_tools/unimol_tools/config/model_config.py b/unimol_tools/unimol_tools/config/model_config.py
Lines changed: 3 additions & 3 deletions
diff --git a/unimol_tools/unimol_tools/data/__init__.py b/unimol_tools/unimol_tools/data/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/unimol_tools/unimol_tools/data/conformer.py b/unimol_tools/unimol_tools/data/conformer.py
Lines changed: 184 additions & 78 deletions
diff --git a/unimol_tools/unimol_tools/data/datahub.py b/unimol_tools/unimol_tools/data/datahub.py
Lines changed: 44 additions & 24 deletions
diff --git a/unimol_tools/unimol_tools/data/datareader.py b/unimol_tools/unimol_tools/data/datareader.py
Lines changed: 52 additions & 28 deletions
@@ -1,3 +1,3 @@
-from .train import MolTrain
 from .predict import MolPredict
-from .predictor import UniMolRepr
+from .predictor import UniMolRepr
+from .train import MolTrain
@@ -1 +1 @@
-from .model_config import MODEL_CONFIG, MODEL_CONFIG_V2
+from .model_config import MODEL_CONFIG, MODEL_CONFIG_V2
@@ -17,5 +17,7 @@ learning_rate: 1e-4
 warmup_ratio: 0.03
 batch_size: 16
 max_norm: 5.0
-cuda: True
-amp: True
+use_cuda: True
+use_amp: True
+use_ddp: True
+use_gpu: 0, 1
@@ -1,12 +1,12 @@
 MODEL_CONFIG = {
-    "weight":{
+    "weight": {
         "protein": "poc_pre_220816.pt",
         "molecule_no_h": "mol_pre_no_h_220816.pt",
         "molecule_all_h": "mol_pre_all_h_220816.pt",
         "crystal": "mp_all_h_230313.pt",
         "oled": "oled_pre_no_h_230101.pt",
     },
-    "dict":{
+    "dict": {
         "protein": "poc.dict.txt",
         "molecule_no_h": "mol.dict.txt",
         "molecule_all_h": "mol.dict.txt",
@@ -23,4 +23,4 @@
         '570m': 'modelzoo/570M/checkpoint.pt',
         '1.1B': 'modelzoo/1.1B/checkpoint.pt',
     },
-}
+}
@@ -1,2 +1,2 @@
 from .datahub import DataHub
-from .dictionary import Dictionary
+from .dictionary import Dictionary
@@ -3,20 +3,23 @@
 # LICENSE file in the root directory of this source tree.
 
 from __future__ import absolute_import, division, print_function
+
 import numpy as np
+
+from ..utils import logger
+from .conformer import ConformerGen, UniMolV2Feature
 from .datareader import MolDataReader
 from .datascaler import TargetScaler
-from .conformer import ConformerGen, UniMolV2Feature
 from .split import Splitter
-from ..utils import logger
 
 
 class DataHub(object):
     """
     The DataHub class is responsible for storing and preprocessing data for machine learning tasks.
-    It initializes with configuration options to handle different types of tasks such as regression, 
+    It initializes with configuration options to handle different types of tasks such as regression,
     classification, and others. It also supports data scaling and handling molecular data.
     """
+
     def __init__(self, data=None, is_train=True, save_path=None, **params):
         """
         Initializes the DataHub instance with data and configuration for the ML task.
@@ -35,44 +38,54 @@ def __init__(self, data=None, is_train=True, save_path=None, **params):
         self.ss_method = params.get('target_normalize', 'none')
         self._init_data(**params)
         self._init_split(**params)
-    
+
     def _init_data(self, **params):
         """
         Initializes and preprocesses the data based on the task and parameters provided.
 
-        This method handles reading raw data, scaling targets, and transforming data for use with 
-        molecular inputs. It tailors the preprocessing steps based on the task type, such as regression 
+        This method handles reading raw data, scaling targets, and transforming data for use with
+        molecular inputs. It tailors the preprocessing steps based on the task type, such as regression
         or classification.
 
         :param params: Additional parameters for data processing.
         :raises ValueError: If the task type is unknown.
         """
         self.data = MolDataReader().read_data(self.data, self.is_train, **params)
-        self.data['target_scaler'] = TargetScaler(self.ss_method, self.task, self.save_path)
-        if self.task == 'regression': 
-            target = np.array(self.data['raw_target']).reshape(-1,1).astype(np.float32)
+        self.data['target_scaler'] = TargetScaler(
+            self.ss_method, self.task, self.save_path
+        )
+        if self.task == 'regression':
+            target = np.array(self.data['raw_target']).reshape(-1, 1).astype(np.float32)
             if self.is_train:
                 self.data['target_scaler'].fit(target, self.save_path)
                 self.data['target'] = self.data['target_scaler'].transform(target)
             else:
                 self.data['target'] = target
         elif self.task == 'classification':
-            target = np.array(self.data['raw_target']).reshape(-1,1).astype(np.int32)
+            target = np.array(self.data['raw_target']).reshape(-1, 1).astype(np.int32)
             self.data['target'] = target
-        elif self.task =='multiclass':
-            target = np.array(self.data['raw_target']).reshape(-1,1).astype(np.int32)
+        elif self.task == 'multiclass':
+            target = np.array(self.data['raw_target']).reshape(-1, 1).astype(np.int32)
             self.data['target'] = target
             if not self.is_train:
-                self.data['multiclass_cnt'] = self.multiclass_cnt 
+                self.data['multiclass_cnt'] = self.multiclass_cnt
         elif self.task == 'multilabel_regression':
-            target = np.array(self.data['raw_target']).reshape(-1,self.data['num_classes']).astype(np.float32)
+            target = (
+                np.array(self.data['raw_target'])
+                .reshape(-1, self.data['num_classes'])
+                .astype(np.float32)
+            )
             if self.is_train:
                 self.data['target_scaler'].fit(target, self.save_path)
-                self.data['target'] = self.data['target_scaler'].transform(target)                
+                self.data['target'] = self.data['target_scaler'].transform(target)
             else:
                 self.data['target'] = target
         elif self.task == 'multilabel_classification':
-            target = np.array(self.data['raw_target']).reshape(-1,self.data['num_classes']).astype(np.int32)
+            target = (
+                np.array(self.data['raw_target'])
+                .reshape(-1, self.data['num_classes'])
+                .astype(np.int32)
+            )
             self.data['target'] = target
         elif self.task == 'repr':
             self.data['target'] = self.data['raw_target']
@@ -81,23 +94,30 @@ def _init_data(self, **params):
 
         if params.get('model_name', None) == 'unimolv1':
             if 'atoms' in self.data and 'coordinates' in self.data:
-                no_h_list = ConformerGen(**params).transform_raw(self.data['atoms'], self.data['coordinates'])
+                no_h_list = ConformerGen(**params).transform_raw(
+                    self.data['atoms'], self.data['coordinates']
+                )
             else:
-                smiles_list = self.data["smiles"]                  
+                smiles_list = self.data["smiles"]
                 no_h_list = ConformerGen(**params).transform(smiles_list)
         elif params.get('model_name', None) == 'unimolv2':
             if 'atoms' in self.data and 'coordinates' in self.data:
-                no_h_list = UniMolV2Feature(**params).transform_raw(self.data['atoms'], self.data['coordinates'])
+                no_h_list = UniMolV2Feature(**params).transform_raw(
+                    self.data['atoms'], self.data['coordinates']
+                )
             else:
-                smiles_list = self.data["smiles"]                  
+                smiles_list = self.data["smiles"]
                 no_h_list = UniMolV2Feature(**params).transform(smiles_list)
 
         self.data['unimol_input'] = no_h_list
 
     def _init_split(self, **params):
 
-        self.split_method = params.get('split_method','5fold_random')
-        kfold, method = int(self.split_method.split('fold')[0]), self.split_method.split('_')[-1]    # Nfold_xxxx
+        self.split_method = params.get('split_method', '5fold_random')
+        kfold, method = (
+            int(self.split_method.split('fold')[0]),
+            self.split_method.split('_')[-1],
+        )  # Nfold_xxxx
         self.kfold = params.get('kfold', kfold)
         self.method = params.get('split', method)
         self.split_seed = params.get('split_seed', 42)
@@ -110,8 +130,8 @@ def _init_split(self, **params):
             logger.info(f"Kfold is 1, all data is used for training.")
         else:
             logger.info(f"Split method: {self.method}, fold: {self.kfold}")
-        nfolds = np.zeros(len(split_nfolds[0][0])+len(split_nfolds[0][1]), dtype=int)
+        nfolds = np.zeros(len(split_nfolds[0][0]) + len(split_nfolds[0][1]), dtype=int)
         for enu, (tr_idx, te_idx) in enumerate(split_nfolds):
             nfolds[te_idx] = enu
         self.data['split_nfolds'] = split_nfolds
-        return split_nfolds
+        return split_nfolds
@@ -5,17 +5,21 @@
 from __future__ import absolute_import, division, print_function
 
 import os
-import pandas as pd
+import pathlib
+
 import numpy as np
+import pandas as pd
 from rdkit import Chem
-from ..utils import logger
-import pathlib
 from rdkit.Chem.Scaffolds import MurckoScaffold
 
+from ..utils import logger
+
+
 class MolDataReader(object):
     '''A class to read Mol Data.'''
+
     def read_data(self, data=None, is_train=True, **params):
-        # TO DO 
+        # TO DO
         # 1. add anomaly detection & outlier removal.
         # 2. add support for other file format.
         # 3. add support for multi tasks.
@@ -26,7 +30,7 @@ def read_data(self, data=None, is_train=True, **params):
         1. if target_cols is not None, use target_cols as target columns.
         2. if target_cols is None, use all columns with prefix 'target_col_prefix' as target columns.
         3. use given target_cols as target columns placeholder with value -1.0 for predict
-        
+
         :param data: The input molecular data. Can be a file path (str), a dictionary, or a list of SMILES strings.
         :param is_train: (bool) A flag indicating if the operation is for training. Determines data processing steps.
         :param params: A dictionary of additional parameters for data processing.
@@ -50,21 +54,21 @@ def read_data(self, data=None, is_train=True, **params):
             # load from dict
             if 'target' in data:
                 label = np.array(data['target'])
-                if len(label.shape)==1 or label.shape[1] == 1:
+                if len(label.shape) == 1 or label.shape[1] == 1:
                     data[target_col_prefix] = label.reshape(-1)
                 else:
                     for i in range(label.shape[1]):
-                        data[target_col_prefix + str(i)] = label[:,i]
+                        data[target_col_prefix + str(i)] = label[:, i]
 
             _ = data.pop('target', None)
             data = pd.DataFrame(data).rename(columns={smiles_col: 'SMILES'})
-        
+
         elif isinstance(data, list) or isinstance(data, np.ndarray):
             # load from smiles list
             data = pd.DataFrame(data, columns=['SMILES'])
         else:
             raise ValueError('Unknown data type: {}'.format(type(data)))
-        
+
         #### parsing target columns
         #### 1. if target_cols is not None, use target_cols as target columns.
         #### 2. if target_cols is None, use all columns with prefix 'target_col_prefix' as target columns.
@@ -77,37 +81,45 @@ def read_data(self, data=None, is_train=True, **params):
             multiclass_cnt = None
         else:
             if target_cols is None:
-                target_cols = [item for item in data.columns if item.startswith(target_col_prefix)]
+                target_cols = [
+                    item for item in data.columns if item.startswith(target_col_prefix)
+                ]
             elif isinstance(target_cols, str):
                 target_cols = target_cols.split(',')
             elif isinstance(target_cols, list):
                 pass
             else:
-                raise ValueError('Unknown target_cols type: {}'.format(type(target_cols)))
-                              
+                raise ValueError(
+                    'Unknown target_cols type: {}'.format(type(target_cols))
+                )
+
             if is_train:
                 if anomaly_clean:
-                    data = self.anomaly_clean(data, task, target_cols)  
+                    data = self.anomaly_clean(data, task, target_cols)
                 if task == 'multiclass':
                     multiclass_cnt = int(data[target_cols].max() + 1)
             else:
                 for col in target_cols:
                     if col not in data.columns or data[col].isnull().any():
                         data[col] = -1.0
-                    
+
             targets = data[target_cols].values.tolist()
             num_classes = len(target_cols)
-        
+
         dd = {
             'raw_data': data,
             'raw_target': targets,
             'num_classes': num_classes,
             'target_cols': target_cols,
-            'multiclass_cnt': multiclass_cnt if task == 'multiclass' and is_train else None
+            'multiclass_cnt': (
+                multiclass_cnt if task == 'multiclass' and is_train else None
+            ),
         }
         if smiles_col in data.columns:
-            mask = data[smiles_col].apply(lambda smi: self.check_smiles(smi, is_train, smi_strict))
-            data = data[mask]  
+            mask = data[smiles_col].apply(
+                lambda smi: self.check_smiles(smi, is_train, smi_strict)
+            )
+            data = data[mask]
             dd['smiles'] = data[smiles_col].tolist()
             dd['scaffolds'] = data[smiles_col].map(self.smi2scaffold).tolist()
         else:
@@ -127,7 +139,7 @@ def read_data(self, data=None, is_train=True, **params):
 
         return dd
 
-    def check_smiles(self,smi, is_train, smi_strict):
+    def check_smiles(self, smi, is_train, smi_strict):
         """
         Validates a SMILES string and decides whether it should be included based on training mode and strictness.
 
@@ -144,9 +156,9 @@ def check_smiles(self,smi, is_train, smi_strict):
                 return False
             else:
                 raise ValueError(f'SMILES rule is illegal: {smi}')
-        return True    
-    
-    def smi2scaffold(self,smi):
+        return True
+
+    def smi2scaffold(self, smi):
         """
         Converts a SMILES string to its corresponding scaffold.
 
@@ -155,10 +167,12 @@ def smi2scaffold(self,smi):
         :return: (str) The scaffold of the SMILES string, or the original SMILES if conversion fails.
         """
         try:
-            return MurckoScaffold.MurckoScaffoldSmiles(smiles=smi, includeChirality=True)
+            return MurckoScaffold.MurckoScaffoldSmiles(
+                smiles=smi, includeChirality=True
+            )
         except:
             return smi
-    
+
     def anomaly_clean(self, data, task, target_cols):
         """
         Performs anomaly cleaning on the data based on the specified task.
@@ -170,13 +184,18 @@ def anomaly_clean(self, data, task, target_cols):
         :return: (DataFrame) The cleaned dataset.
         :raises ValueError: If the provided task is not recognized.
         """
-        if task in ['classification', 'multiclass', 'multilabel_classification', 'multilabel_regression']:
+        if task in [
+            'classification',
+            'multiclass',
+            'multilabel_classification',
+            'multilabel_regression',
+        ]:
             return data
         if task == 'regression':
             return self.anomaly_clean_regression(data, target_cols)
         else:
             raise ValueError('Unknown task: {}'.format(task))
-    
+
     def anomaly_clean_regression(self, data, target_cols):
         """
         Performs anomaly cleaning specifically for regression tasks using a 3-sigma threshold.
@@ -189,6 +208,11 @@ def anomaly_clean_regression(self, data, target_cols):
         sz = data.shape[0]
         target_col = target_cols[0]
         _mean, _std = data[target_col].mean(), data[target_col].std()
-        data = data[(data[target_col] > _mean - 3 * _std) & (data[target_col] < _mean + 3 * _std)]
-        logger.info('Anomaly clean with 3 sigma threshold: {} -> {}'.format(sz, data.shape[0]))
+        data = data[
+            (data[target_col] > _mean - 3 * _std)
+            & (data[target_col] < _mean + 3 * _std)
+        ]
+        logger.info(
+            'Anomaly clean with 3 sigma threshold: {} -> {}'.format(sz, data.shape[0])
+        )
         return data
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-from .model_config import MODEL_CONFIG, MODEL_CONFIG_V2`
	`1`	`+from .model_config import MODEL_CONFIG, MODEL_CONFIG_V2`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`from .datahub import DataHub`
`2`		`-from .dictionary import Dictionary`
	`2`	`+from .dictionary import Dictionary`