deepmodeling
diff --git a/‎README.md‎
Lines changed: 144 additions & 14 deletions b/‎README.md‎
Lines changed: 144 additions & 14 deletions
diff --git a/‎docs/source/quickstart.md‎
Lines changed: 75 additions & 0 deletions b/‎docs/source/quickstart.md‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 6 additions & 1 deletion b/‎requirements.txt‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎setup.py‎
Lines changed: 5 additions & 1 deletion b/‎setup.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎tests/test_multilabel_classification.py‎
Lines changed: 1 addition & 1 deletion b/‎tests/test_multilabel_classification.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tests/test_representation.py‎
Lines changed: 1 addition & 1 deletion b/‎tests/test_representation.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎unimol_tools/cli/__init__.py‎ b/‎unimol_tools/cli/__init__.py‎
diff --git a/‎unimol_tools/cli/run_predict.py‎
Lines changed: 15 additions & 0 deletions b/‎unimol_tools/cli/run_predict.py‎
Lines changed: 15 additions & 0 deletions
@@ -19,7 +19,6 @@ Unimol_tools is a easy-use wrappers for property prediction,representation and d
 
 ## Install
 - PyTorch is required; please install it according to your environment. If you are using CUDA, install a CUDA-enabled build of PyTorch. More details can be found at https://pytorch.org/get-started/locally/
-- currently, rdkit needs with numpy<2.0.0, please install rdkit with numpy<2.0.0.
 
 ### Option 1: Installing from PyPi (Recommended, for stable version)
 
@@ -38,13 +37,13 @@ pip install huggingface_hub
 ### Option 2: Installing from source (for latest version)
 
 ```python
-## Dependencies installation
-pip install -r requirements.txt
-
 ## Clone repository
 git clone https://github.com/deepmodeling/unimol_tools.git
 cd unimol_tools
 
+## Dependencies installation
+pip install -r requirements.txt
+
 ## Install
 python setup.py install
 ```
@@ -53,6 +52,10 @@ python setup.py install
 
 The UniMol pretrained models can be found at [dptech/Uni-Mol-Models](https://huggingface.co/dptech/Uni-Mol-Models/tree/main).
 
+If ``pretrained_model_path`` or ``pretrained_dict_path`` is left as ``None``, the
+toolkit automatically downloads the corresponding file from this
+Hugging Face repository at runtime.
+
 If the download is slow, you can use a mirror, such as:
 
 ```bash
@@ -70,6 +73,7 @@ export UNIMOL_WEIGHT_DIR=/path/to/your/weights/dir/
 ```
 
 ## News
+- 2025-09-22: Lightweight pre-training tools are now available in Unimol_tools!
 - 2025-05-26: Unimol_tools is now independent from the Uni-Mol repository!
 - 2025-03-28: Unimol_tools now supports Distributed Data Parallel (DDP)!
 - 2024-11-22: Unimol V2 has been added to Unimol_tools!
@@ -82,15 +86,40 @@ export UNIMOL_WEIGHT_DIR=/path/to/your/weights/dir/
 ### Molecule property prediction
 ```python
 from unimol_tools import MolTrain, MolPredict
-clf = MolTrain(task='classification', 
-                data_type='molecule', 
-                epochs=10, 
-                batch_size=16, 
-                metrics='auc',
-                )
+clf = MolTrain(
+    task='classification',
+    data_type='molecule',
+    epochs=10,
+    batch_size=16,
+    metrics='auc',
+    # pretrained weights are downloaded automatically when left as ``None``
+    # pretrained_model_path='/path/to/checkpoint.ckpt',
+    # pretrained_dict_path='/path/to/dict.txt',
+)
 clf.fit(data = train_data)
-# currently support data with smiles based csv/txt file, and
-# custom dict of {'atoms':[['C','C'],['C','H','O']], 'coordinates':[coordinates_1,coordinates_2]}
+# Supported inputs: a SMILES-based csv/txt file, an sdf file containing molecules,
+# or a custom dict such as {'atoms':[['C','C'],['C','H','O']], 'coordinates':[coordinates_1,coordinates_2]}
+
+# The dict can follow the format shown below or be built from an sdf file;
+# the loaded sdf data can also be passed to the model directly.
+# (PandasTools comes from RDKit: `from rdkit.Chem import PandasTools`.)
+train_sdf = PandasTools.LoadSDF('exp/unimol_conformers_train.sdf')
+train_dict = {
+    'atoms': [list(atom.GetSymbol() for atom in mol.GetAtoms()) for mol in train_sdf['ROMol']],
+    # atoms[0]: ['C', 'C', 'O', 'C', 'O', 'C', ...]
+    'coordinates': [mol.GetConformers()[0].GetPositions() for mol in train_sdf['ROMol']],
+    # coordinates[0]: array([[ 6.6462, -1.8268,  1.9275],
+    #                        [ 6.1552, -1.9367,  0.4873],
+    #                        [ 5.1832, -0.8757,  0.3007],
+    #                        [ 5.4651, -0.0272, -0.7266],
+    #                        [ 4.8586, -0.0844, -1.7917],
+    #                        [ 6.5362,  0.9767, -0.3742],
+    #                        ...,])
+    'TARGET': train_sdf['TARGET'].tolist()
+    # TARGET: [0, 1, 0, 0, 1, 0, ...]
+}
+# clf.fit(data = train_sdf)
+# clf.fit(data = train_dict)
+
 
 clf = MolPredict(load_model='../exp')
 res = clf.predict(data = test_data)
@@ -99,8 +128,14 @@ res = clf.predict(data = test_data)
 ```python
 import numpy as np
 from unimol_tools import UniMolRepr
-# single smiles unimol representation
-clf = UniMolRepr(data_type='molecule', remove_hs=False)
+# single SMILES UniMol representation. If no paths are provided the
+# pretrained model and dictionary are fetched from Hugging Face.
+clf = UniMolRepr(
+    data_type='molecule',
+    remove_hs=False,
+    # pretrained_model_path='/path/to/checkpoint.ckpt',
+    # pretrained_dict_path='/path/to/dict.txt',
+)
 smiles = 'c1ccc(cc1)C2=NCC(=O)Nc3c2cc(cc3)[N+](=O)[O]'
 smiles_list = [smiles]
 unimol_repr = clf.get_repr(smiles_list, return_atomic_reprs=True)
@@ -110,6 +145,101 @@ print(np.array(unimol_repr['cls_repr']).shape)
 print(np.array(unimol_repr['atomic_reprs']).shape)
 ```
 
+### Command-line utilities
+
+Hydra-powered entry points make training, prediction, and representation
+available from the command line. Key-value pairs override options from the
+YAML files in `unimol_tools/config`.
+
+#### Training
+```bash
+python -m unimol_tools.cli.run_train \
+    train_path=train.csv \
+    task=regression \
+    save_path=./exp \
+    smiles_col=smiles \
+    target_cols=[target1] \
+    epochs=10 \
+    learning_rate=1e-4 \
+    batch_size=16 \
+    kfold=5
+```
+
+#### Prediction
+```bash
+python -m unimol_tools.cli.run_predict load_model=./exp data_path=test.csv
+```
+
+#### Representation
+```bash
+python -m unimol_tools.cli.run_repr data_path=test.csv smiles_col=smiles
+```
+
+### Molecule pretraining
+
+`unimol_tools` provides a command-line utility for pretraining Uni-Mol models on
+your own dataset. The script uses
+[Hydra](https://hydra.cc/) so configuration values can be overridden at the
+command line. Two common invocation examples are shown below: one for LMDB data
+and one for a CSV of SMILES strings.
+
+#### LMDB dataset
+
+```bash
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+export HYDRA_FULL_ERROR=1
+export OMP_NUM_THREADS=1
+
+torchrun --standalone --nproc_per_node=NUM_GPUS \
+    -m unimol_tools.cli.run_pretrain \
+    dataset.train_path=train.lmdb \
+    dataset.valid_path=valid.lmdb \
+    dataset.data_type=lmdb \
+    dataset.dict_path=dict.txt \
+    training.total_steps=1000000 \
+    training.batch_size=16 \
+    training.update_freq=1
+```
+
+`dataset.dict_path` is optional. The effective batch size is
+`n_gpu * training.batch_size * training.update_freq`.
+
+#### CSV dataset
+
+```bash
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+export HYDRA_FULL_ERROR=1
+export OMP_NUM_THREADS=1
+
+torchrun --standalone --nproc_per_node=NUM_GPUS \
+    -m unimol_tools.cli.run_pretrain \
+    dataset.train_path=train.csv \
+    dataset.valid_path=valid.csv \
+    dataset.data_type=csv \
+    dataset.smiles_column=smiles \
+    training.total_steps=1000000 \
+    training.batch_size=16 \
+    training.update_freq=1
+```
+
+For multi-node training, specify additional arguments, for example:
+
+```bash
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+export HYDRA_FULL_ERROR=1
+export OMP_NUM_THREADS=1
+
+torchrun --nnodes=2 --nproc_per_node=8 --node_rank=0 \
+    --master_addr=<master-ip> --master_port=<port> \
+    -m unimol_tools.cli.run_pretrain ...
+```
+
+All available options are defined in
+[`pretrain_config.py`](unimol_tools/pretrain/pretrain_config.py), and checkpoints
+along with the dictionary are saved to the run directory. When GPU memory is
+limited, lower `training.batch_size` and raise `training.update_freq` by the
+same factor: gradients are accumulated over more steps, so the effective batch
+size `n_gpu * training.batch_size * training.update_freq` stays unchanged.
+
 ## Credits
 We thank all contributors from the community for their suggestions, bug reports and chemistry advice. Currently unimol-tools is maintained by Yaning Cui, Xiaohong Ji, and Zhifeng Gao from DP Technology and AI for Science Institute, Beijing.
 
 
@@ -39,6 +39,36 @@ pred = clf.fit(data = train_data)
 clf = MolPredict(load_model='../exp')
 res = clf.predict(data = test_data)
 ```
+
+### Command-line utilities
+
+Training, prediction, and representation can also be launched from the
+command line by overriding options in the YAML config files.
+
+#### Training
+```bash
+python -m unimol_tools.cli.run_train \
+    train_path=train.csv \
+    task=regression \
+    save_path=./exp \
+    smiles_col=smiles \
+    target_cols=[target1] \
+    epochs=10 \
+    learning_rate=1e-4 \
+    batch_size=16 \
+    kfold=5
+```
+
+#### Prediction
+```bash
+python -m unimol_tools.cli.run_predict load_model=./exp data_path=test.csv
+```
+
+#### Representation
+```bash
+python -m unimol_tools.cli.run_repr data_path=test.csv smiles_col=smiles
+```
+
 ## Uni-Mol molecule and atoms level representation
 
 The Uni-Mol representation can easily be obtained as follows.
@@ -60,6 +90,51 @@ print(np.array(unimol_repr['cls_repr']).shape)
 # atomic level repr, align with rdkit mol.GetAtoms()
 print(np.array(unimol_repr['atomic_reprs']).shape)
 ```
+## Molecule pretraining
+
+Uni-Mol can be pretrained from scratch using the ``run_pretrain`` utility. The
+script is driven by Hydra, so configuration options are supplied on the command
+line. The examples below demonstrate common setups for LMDB and CSV inputs.
+
+### LMDB dataset
+
+```bash
+torchrun --standalone --nproc_per_node=NUM_GPUS \
+    -m unimol_tools.cli.run_pretrain \
+    dataset.train_path=train.lmdb \
+    dataset.valid_path=valid.lmdb \
+    dataset.data_type=lmdb \
+    dataset.dict_path=dict.txt \
+    training.total_steps=10000 \
+    training.batch_size=16 \
+    training.update_freq=1
+```
+
+`dataset.dict_path` is optional. The effective batch size is
+`n_gpu * training.batch_size * training.update_freq`.
+
+### CSV dataset
+
+```bash
+torchrun --standalone --nproc_per_node=NUM_GPUS \
+    -m unimol_tools.cli.run_pretrain \
+    dataset.train_path=train.csv \
+    dataset.valid_path=valid.csv \
+    dataset.data_type=csv \
+    dataset.smiles_column=smiles \
+    training.total_steps=10000 \
+    training.batch_size=16 \
+    training.update_freq=1
+```
+
+To scale across multiple machines, include the appropriate `torchrun`
+arguments, e.g. `--nnodes`, `--node_rank`, `--master_addr` and
+`--master_port`.
+
+Checkpoints and the dictionary are written to the output directory. When GPU
+memory is limited, lower `training.batch_size` and raise `training.update_freq`
+by the same factor: gradients are accumulated over more steps, so the effective
+batch size `n_gpu * training.batch_size * training.update_freq` stays unchanged.
+
 ## Continue training (Re-train)
 
 ```python
 
@@ -1,3 +1,4 @@
+numpy<2.3.0
 numpy>=2.0.0
 pandas>=2.2.2
 scikit-learn>=1.5.0
@@ -6,4 +7,8 @@ joblib
 rdkit>=2024.3.4
 pyyaml
 addict
-tqdm
+tqdm
+hydra-core
+omegaconf
+tensorboard
+lmdb
@@ -5,7 +5,7 @@
 
 setup(
     name="unimol_tools",
-    version="0.1.4.post1",
+    version="0.1.5",
     description=(
         "unimol_tools is a Python package for property prediction with Uni-Mol in molecule, materials and protein."
     ),
@@ -33,6 +33,10 @@
         "scikit-learn>=1.5.0",
         "numba",
         "tqdm",
+        "hydra-core",
+        "omegaconf",
+        "tensorboard",
+        "lmdb",
     ],
     python_requires=">=3.9",
     include_package_data=True,
 
@@ -9,7 +9,7 @@
 from unimol_tools import MolTrain, MolPredict
 
 CSV_URL = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz'
-SDF_URL = 'https://tripod.nih.gov/tox21/challenge/download?id=tox21_10k_data_allsdf&sec='
+SDF_URL = 'https://tripod.nih.gov/tox21/challenge/download?id=tox21_10k_data_allsdf'
 
 
 @pytest.mark.network
 
@@ -10,7 +10,7 @@
 
 VQM24_URL = 'https://zenodo.org/records/15442257/files/DMC.npz?download=1'
 TOX21_CSV_URL = 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz'
-TOX21_SDF_URL = 'https://tripod.nih.gov/tox21/challenge/download?id=tox21_10k_data_allsdf&sec='
+TOX21_SDF_URL = 'https://tripod.nih.gov/tox21/challenge/download?id=tox21_10k_data_allsdf'
 
 
 @pytest.mark.network
 
@@ -0,0 +1,15 @@
+import hydra
+from omegaconf import DictConfig
+
+from ..predict import MolPredict
+
+
+@hydra.main(version_base=None, config_path="../config", config_name="predict_config")  # Hydra builds cfg from the "predict_config" config under ../config plus command-line overrides
+def main(cfg: DictConfig):  # CLI entry point: construct a MolPredict from the Hydra config and run prediction
+    data_path = cfg.get("data_path")  # read before the predictor is built; presumably the input data file (e.g. data_path=test.csv) -- TODO confirm
+    predictor = MolPredict(cfg=cfg)  # the whole config is handed to MolPredict; which keys it consumes is not visible from here
+    predictor.predict(data=data_path, save_path=cfg.get("save_path"))  # save_path is fetched with .get, so an absent key is forwarded as the .get default rather than raising
+
+
+if __name__ == "__main__":  # invoke the entry point only when this module is executed directly, not when imported
+    main()