
Commit fa0af9d

Adding simplified data wrangling files
1 parent 9e8057b commit fa0af9d

10 files changed, +133419 −0 lines changed

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Molecular Transformer Baseline

This is an encoder-decoder transformer on SMILES data.

## Preparing the data

The data can be prepared ahead of time to simplify the pipeline.

---

This example is a simplified version of the encoder-decoder transformer in the `application/nlp` directory.
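As a rough sketch of the ahead-of-time preparation (the pretokenizer module name below is an assumption; only the `QM9_smiles.txt` input, the `QM9_Pretokenized.npy` output, and the `QM9_DATA_DIR`/`QM9_SEQUENCE_LENGTH` environment variables come from the scripts below):

# Hypothetical preparation sketch; the pretokenizer module name is an assumption.
import os
import QM9_Pretokenize  # assumed name for the pretokenize script added below

# Run the SMILES pretokenizer once, ahead of training. It reads QM9_smiles.txt
# and writes QM9_Pretokenized.npy into the working directory.
QM9_Pretokenize.main()

# Point the dataset module at the pretokenized file and pick the sequence length.
os.environ['QM9_DATA_DIR'] = os.getcwd()
os.environ['QM9_SEQUENCE_LENGTH'] = '32'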
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
import argparse
import data_utils


# Command-line arguments
def add_transformer_architecture_arguments(args: argparse.ArgumentParser):
    """
    Adds the command-line arguments that specify the transformer architecture
    model parameters. This is only relevant for the encoder-decoder
    transformer model.
    """
    args.add_argument('--num-attention-heads',
                      action='store',
                      default=8,
                      type=int,
                      help='number of parallel attention layers (default: 8)',
                      metavar='NUM')
    args.add_argument('--embed-dim',
                      action='store',
                      default=512,
                      type=int,
                      help='embedding space dimension (default: 512)',
                      metavar='NUM')
    args.add_argument('--feedforward-dim',
                      action='store',
                      default=0,
                      type=int,
                      help='feedforward network dimension. If zero, set to be '
                           '4 times the embedding dimension (default: 0)',
                      metavar='NUM')
    args.add_argument('--num-layers',
                      action='store',
                      default=6,
                      type=int,
                      help='number of encoder and decoder layers (default: 6)',
                      metavar='NUM')


def add_dataset_arguments(args: argparse.ArgumentParser, default: str):
    """
    Adds dataset-related arguments to an existing argument parser.
    """
    args.add_argument('--dataset',
                      type=str,
                      default=default,
                      help=f'which dataset to use (default: {default})',
                      choices=data_utils.available_datasets())
    args.add_argument('--dataset-fraction',
                      action='store',
                      default=1.0,
                      type=float,
                      help='fraction of dataset to use (default: 1.0)',
                      metavar='NUM')
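A minimal usage sketch for these helpers, assuming this file is importable as `arguments` and that a `QM9` dataset module exists (both names are assumptions):

# Usage sketch: attach both argument groups to a parser (the module name
# `arguments` and the 'QM9' default dataset name are assumptions).
import argparse
import arguments

parser = argparse.ArgumentParser(description='Molecular transformer baseline')
arguments.add_transformer_architecture_arguments(parser)
arguments.add_dataset_arguments(parser, default='QM9')

args = parser.parse_args(['--embed-dim', '256', '--dataset-fraction', '0.1'])
print(args.embed_dim, args.num_layers, args.dataset, args.dataset_fraction)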
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import importlib
import os
import sys
from typing import List

dataset_dir = os.path.join(os.path.dirname(__file__), 'datasets')


def available_datasets() -> List[str]:
    """
    Returns the available datasets in the dataset folder.
    """
    result = []
    for file in os.listdir(dataset_dir):
        if file.endswith('.py'):
            result.append(os.path.basename(file)[:-3])
    return result


def load_dataset(name: str):
    """
    Loads a dataset by importing the requested module.
    """
    sys.path.append(dataset_dir)
    return importlib.import_module(name)
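A short usage sketch, assuming this file is importable as `data_utils` and that a `QM9` module sits in the `datasets` folder (both names are assumptions):

# Usage sketch for the dataset discovery helpers (names are assumptions).
import data_utils

print(data_utils.available_datasets())   # lists the .py modules in datasets/
qm9 = data_utils.load_dataset('QM9')     # imports datasets/QM9.py as a module
print(qm9.num_train_samples())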
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
"""
The QM9 dataset, stored as pre-tokenized binary files for optimized processing.
"""
import os
import os.path
import pickle

import numpy as np
from pretokenize.SMILES_tokenizer import MolTokenizer

sequence_length = int(os.getenv('QM9_SEQUENCE_LENGTH', default='32'))

# ----------------------------------------------
# Setup
# ----------------------------------------------

# Load the datasets
data_dir = os.getenv(
    'QM9_DATA_DIR',
    '/p/vast1/lbann/datasets/FLASK/qm9')

tokenizer = MolTokenizer("SMILES_vocab.json")
tokenizer.load_vocab_file()

# Pre-tokenized training data, as written by the pretokenize script
# (QM9_Pretokenized.npy).
dataset_train = np.load(os.path.join(data_dir, 'QM9_Pretokenized.npy'))

_vocab_size = 46

pad_index = tokenizer.token_to_id('<pad>')
bos_index = tokenizer.token_to_id('<bos>')
eos_index = tokenizer.token_to_id('<eos>')

# ----------------------------------------------
# Sample access functions
# ----------------------------------------------

def num_train_samples():
    return dataset_train.shape[0]

def get_train_sample(i):
    data = dataset_train[i]
    return data

def sample_dims():
    return (2 * sequence_length + 1, )

def vocab_size():
    return _vocab_size


if __name__ == '__main__':
    print('Training samples:', num_train_samples())
    print('Training sample 101:')
    print(get_train_sample(101))
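The sample-access functions above hand back one flattened sample at a time; a consumer can stack them into mini-batches. A rough sketch (the batching loop is illustrative and not part of this module; the 'QM9' module name is an assumption):

# Illustrative batching over the sample-access interface.
import numpy as np
import data_utils

qm9 = data_utils.load_dataset('QM9')  # assumed dataset module name

indices = np.arange(32)  # first 32 training samples
batch = np.stack([qm9.get_train_sample(i) for i in indices])
print(batch.shape, qm9.sample_dims(), qm9.vocab_size())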
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
import numpy as np
from SMILES_tokenizer import MolTokenizer


def random_zero_array(arr, probability, mask):
    """Randomly replaces entries of arr with the mask value at the given probability."""
    return np.where(np.random.random(arr.shape) < probability, mask, arr)


def main():
    tokenizer = MolTokenizer("SMILES_vocab.json")
    tokenizer.load_vocab_file()
    with open("QM9_smiles.txt", 'r') as f:
        smiles_data = f.readlines()
    num_samples = len(smiles_data)
    max_length = 32

    # Initialize every row with the pad token and set the first column to the
    # separator token
    tokenized_data = np.ones((num_samples, max_length)) * tokenizer.encode(tokenizer.pad_token)
    tokenized_data[:, 0] = tokenizer.encode(tokenizer.sep_token)

    # Tokenize each SMILES string, randomly replace ~15% of the tokens with the
    # mask token, and append a classification token after the sequence. Assumes
    # each tokenized SMILES string is shorter than max_length.
    for i, smiles in enumerate(smiles_data):
        tokens = tokenizer.tokenize(smiles)
        tokens = random_zero_array(tokens, 0.15, tokenizer.encode(tokenizer.mask_token))
        tokenized_data[i, :len(tokens)] = tokens
        tokenized_data[i, len(tokens)] = tokenizer.encode(tokenizer.cls_token)

    np.save('QM9_Pretokenized.npy', tokenized_data)


if __name__ == '__main__':
    main()
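A quick sanity check of the pretokenizer output, using only the tokenizer calls that appear above (run after the script has produced `QM9_Pretokenized.npy`):

# Sanity-check sketch for the pretokenized output.
import numpy as np
from SMILES_tokenizer import MolTokenizer

data = np.load('QM9_Pretokenized.npy')
print('shape:', data.shape)   # expected (num_samples, 32)

tokenizer = MolTokenizer("SMILES_vocab.json")
tokenizer.load_vocab_file()
pad_id = tokenizer.encode(tokenizer.pad_token)
print('fraction of padding:', float(np.mean(data == pad_id)))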
