Skip to content

Commit 34d5fca

Browse files
committed
Add research for getting tinystories working
1 parent ec1fada commit 34d5fca

17 files changed

Lines changed: 26370 additions & 0 deletions

demos/tinystories/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Demo executables for the TinyStories experiments. Each call passes the target
# name, the Chapel entry-point source, and the project's lib directory
# (presumably the module search path -- confirm against chai_add_executable).

# Built from testArgmaxDecode.chpl.
chai_add_executable(TinyStoriesTest
    ${CMAKE_CURRENT_SOURCE_DIR}/testArgmaxDecode.chpl
    ${PROJECT_ROOT_DIR}/lib
)

# Built from testConv2d.chpl.
chai_add_executable(TinyStoriesTestConv2d
    ${CMAKE_CURRENT_SOURCE_DIR}/testConv2d.chpl
    ${PROJECT_ROOT_DIR}/lib
)

# Built from oh2.chpl.
chai_add_executable(OliverHowTo
    ${CMAKE_CURRENT_SOURCE_DIR}/oh2.chpl
    ${PROJECT_ROOT_DIR}/lib
)

demos/tinystories/oh2.chpl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
use Tensor;
2+
3+
// Entry point: print the value returned by ndarray.arange(1, 2, 3).
proc main() {
  writeln(ndarray.arange(1, 2, 3));
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
cmake_minimum_required(VERSION 4.0)
project(libtorch_example)

# LibTorch must be discoverable by find_package (e.g. via CMAKE_PREFIX_PATH).
find_package(Torch REQUIRED)

add_executable(libtorch_example generate.cpp)

# Include path for nlohmann/json
target_include_directories(libtorch_example PRIVATE ${PROJECT_SOURCE_DIR}/external/json)

target_link_libraries(libtorch_example "${TORCH_LIBRARIES}")

# Require C++17 outright: without CXX_STANDARD_REQUIRED, CMake may silently
# fall back to an older standard when the compiler lacks C++17 support.
set_target_properties(libtorch_example PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import torch

# Show where the installed torch package lives, followed by the CMake prefix
# path it exposes (presumably the value to hand to CMake so that
# find_package(Torch) succeeds -- confirm against the build instructions).
print(torch.__path__)
print(torch.utils.cmake_prefix_path)
4+
5+
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import torch
2+
from collections import Counter
3+
from torch.utils.data import Dataset, DataLoader
4+
from datasets import load_dataset
5+
import spacy
6+
from torch.nn.utils.rnn import pad_sequence
7+
from tqdm import tqdm
8+
import random
9+
# Seed the global RNG at import time so the random story sample is repeatable.
random.seed(0)

# Maximum number of story tokens kept per example (before <SOS>/<EOS> are added).
MAX_LEN = 300
# Number of stories randomly sampled from the dataset split.
MAX_STORIES = 250_000
13+
14+
class Vocabulary:
    """Word-level vocabulary mapping lower-cased tokens to integer ids.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>`` (in that order); corpus tokens are numbered from
    4 upwards in first-seen order.
    """

    # Reserved tokens; the position in this tuple is the token's id.
    SPECIAL_TOKENS = ('<PAD>', '<SOS>', '<EOS>', '<UNK>')

    def __init__(self, corpus, tokenizer, min_count=30):
        """Build the vocabulary from *corpus*.

        Args:
            corpus: Iterable of raw text strings.
            tokenizer: Callable that splits a string into tokens; every token
                is converted with ``str()`` before use.
            min_count: Minimum number of occurrences a token needs in the
                corpus to receive its own id.
        """
        self.tokenizer = tokenizer
        self.word2idx, self.idx2word = self.build_vocab(corpus, min_count)

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.word2idx)

    def _tokenize(self, text):
        """Return the stripped, lower-cased string form of each token in *text*."""
        return [str(tok).strip().lower() for tok in self.tokenizer(text)]

    def text2idx(self, text):
        """Convert *text* to a list of ids; unknown tokens map to ``<UNK>``."""
        unk = self.word2idx['<UNK>']
        return [self.word2idx.get(tok, unk) for tok in self._tokenize(text)]

    def idx2text(self, idxs):
        """Convert an iterable of ids to tokens; unknown ids map to ``'<UNK>'``."""
        return [self.idx2word.get(i, '<UNK>') for i in idxs]

    def build_vocab(self, corpus, min_count=30):
        """Count tokens over *corpus* and build both lookup tables.

        Args:
            corpus: Iterable of raw text strings.
            min_count: Tokens seen fewer than this many times are left out
                (and therefore encode to ``<UNK>``).

        Returns:
            A ``(word2idx, idx2word)`` pair of dicts.
        """
        counts = Counter()
        for datapoint in tqdm(corpus):
            counts.update(self._tokenize(datapoint))

        # Corpus ids start after the block reserved for the special tokens.
        first_id = len(self.SPECIAL_TOKENS)
        kept = [tok for tok, n in counts.items() if n >= min_count]
        word2idx = {tok: i for i, tok in enumerate(kept, start=first_id)}
        idx2word = {i: tok for tok, i in word2idx.items()}

        for i, special in enumerate(self.SPECIAL_TOKENS):
            word2idx[special] = i
            idx2word[i] = special

        return word2idx, idx2word
53+
54+
class TinyStories(Dataset):
    """Random sample of the ``roneneldan/TinyStories`` corpus as id tensors.

    Each item is a 1-D tensor ``[<SOS>, tok_1, ..., tok_k, <EOS>]`` holding at
    most ``MAX_LEN`` story tokens between the two markers.
    """

    def __init__(self, split="train", vocab=None):
        """Load *split*, sample up to ``MAX_STORIES`` stories and set up the vocab.

        Args:
            split: Dataset split name passed to ``load_dataset``.
            vocab: Existing vocabulary to reuse. When ``None`` a new
                ``Vocabulary`` is built from the sampled stories using the
                spaCy ``en_core_web_sm`` tokenizer.
        """
        print("Loading data...")
        dataset = load_dataset("roneneldan/TinyStories", split=split)

        # Never ask for more stories than the split holds: random.sample
        # raises ValueError when the sample is larger than the population.
        n_stories = min(MAX_STORIES, len(dataset))
        # Sample row indices instead of materialising every row as a list.
        picked = random.sample(range(len(dataset)), n_stories)
        self.data = [dataset[i]["text"] for i in picked]

        if vocab is None:
            print("Building vocab...")
            self.vocab = Vocabulary(self.data, spacy.load('en_core_web_sm').tokenizer)
        else:
            self.vocab = vocab

    def __len__(self):
        """Return the number of sampled stories."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return story *idx* as a tensor of ids wrapped in ``<SOS>``/``<EOS>``."""
        # Slicing already handles stories shorter than MAX_LEN.
        ids = self.vocab.text2idx(self.data[idx])[:MAX_LEN]
        sos = self.vocab.word2idx['<SOS>']
        eos = self.vocab.word2idx['<EOS>']
        return torch.tensor([sos] + ids + [eos])

    @staticmethod
    def pad_collate(batch):
        """Stack variable-length id tensors into one batch-first tensor.

        Shorter sequences are right-padded with 0.
        """
        return pad_sequence(batch, batch_first=True, padding_value=0)
83+
84+
def getTinyStoriesDataloadersAndVocab(batch_size=128, num_workers=8):
    """Create the shuffled training DataLoader and return it with its vocabulary.

    Args:
        batch_size: Number of stories per batch; the last incomplete batch is
            dropped.
        num_workers: Number of DataLoader worker processes (0 loads in the
            main process).

    Returns:
        A ``(train_loader, vocab)`` tuple.
    """
    train = TinyStories(split="train")

    train_loader = DataLoader(
        train,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        collate_fn=TinyStories.pad_collate,
        drop_last=True,
    )

    return train_loader, train.vocab
91+
92+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import torch
2+
from models.TransformerLM import *
3+
from data.TinyStories import *
4+
from spacy.tokenizer import Tokenizer
5+
6+
# Allow-list the non-tensor classes stored in the checkpoint so that it can be
# read by the restricted (weights_only) unpickler.
torch.serialization.add_safe_globals([Vocabulary, Tokenizer])

# load checkpoint
CHKPT_PATH = "./chkpts/2ZJPbu_TinyStories"
# weights_only=True is passed explicitly: it only became the default in
# PyTorch 2.6, and without it the allow-list above is not enforced.
chkpt = torch.load(
    CHKPT_PATH,
    map_location=torch.device('cpu'),
    weights_only=True,
)

# rebuild model
config = chkpt["config"]
vocab = chkpt["vocab"]
vocab_size = len(vocab)

model = TransformerLM(vocab_size, config["d_model"], config["n_heads"], config["n_layers"])
model.load_state_dict(chkpt["model_state_dict"])
# Switch to inference mode before scripting.
model.eval()

# script and save model
scripted_model = torch.jit.script(model)
scripted_model.save("model.pt")

0 commit comments

Comments
 (0)