from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import random

def load_data(model_id, num_samples, seq_len, batch_size):
    """Sample fixed-length token windows from WikiText-2 and wrap them in a DataLoader."""
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    # Tokenize the whole corpus as one long sequence so a window can start anywhere.
    tokenized_dataset = tokenizer("\n\n".join(dataset["text"]), return_tensors="pt")
    sampled_data = []
    sampled_mask = []
    random.seed(42)  # fixed seed so the sampled windows are reproducible
    for _ in range(num_samples):
        # Pick a random contiguous window of seq_len tokens.
        i = random.randint(0, tokenized_dataset.input_ids.shape[1] - seq_len - 1)
        j = i + seq_len
        sampled_data.append(tokenized_dataset.input_ids[0, i:j].tolist())
        sampled_mask.append(tokenized_dataset.attention_mask[0, i:j].tolist())
    data = {"input_ids": sampled_data, "attention_mask": sampled_mask}
    ds = Dataset.from_dict(data)
    ds.set_format(type="torch", columns=["input_ids", "attention_mask"])
    loader = DataLoader(ds, batch_size=batch_size)
    return loader
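
# A minimal usage sketch for load_data (kept as a comment so importing this
# module has no side effects). "gpt2" is an illustrative model id, not one
# required by the script:
#
#   loader = load_data("gpt2", num_samples=16, seq_len=512, batch_size=4)
#   batch = next(iter(loader))
#   print(batch["input_ids"].shape)  # torch.Size([4, 512])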

def get_c4(model_id):
    """Tokenize 128 random C4 documents and return a DataLoader with dynamic padding."""
    dataset = load_dataset(
        "allenai/c4",
        "en",
        data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
        split="train",
    )
    dataset = dataset.shuffle(seed=42)  # fixed seed keeps the 128-sample subset reproducible
    dataset = dataset.select(range(128))
    print(dataset.shape)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # GPT-style tokenizers ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, max_length=2048)

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    def collate_fn(batch):
        # Pad each batch to its longest sequence; tokenizer.pad also builds the attention mask.
        input_ids = [item["input_ids"] for item in batch]
        padded = tokenizer.pad({"input_ids": input_ids}, return_tensors="pt", padding="longest")
        return padded

    dataloader = DataLoader(tokenized_dataset, collate_fn=collate_fn, batch_size=128, shuffle=True)
    return dataloader
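
# A small smoke test, guarded so it only runs when the file is executed
# directly. "facebook/opt-125m" is an arbitrary small checkpoint chosen for
# the demo; any causal-LM model id with a tokenizer should work.
if __name__ == "__main__":
    c4_loader = get_c4("facebook/opt-125m")
    batch = next(iter(c4_loader))
    # Sequences are padded to the longest in the batch, capped at 2048 tokens.
    print(batch["input_ids"].shape, batch["attention_mask"].shape)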