-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcombine_datasets.py
More file actions
23 lines (20 loc) · 894 Bytes
/
combine_datasets.py
File metadata and controls
23 lines (20 loc) · 894 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import datasets
import json
from random import shuffle
# Input files, in the order their records are concatenated before shuffling.
_SOURCE_FILES = (
    "alpaca-gpt4-data.json",
    "gpt4all-j-data.json",
    "dolly2.json",
    "sharegpt_split.json",
    "hh-data.json",
)


def _load_json_list(path):
    """Load one JSON file and return its top-level list.

    Args:
        path: Path to a JSON file whose top-level value is a list.

    Returns:
        The parsed list.

    Raises:
        TypeError: If the file's top-level JSON value is not a list (the
            lists are concatenated later, so anything else is an error).
    """
    # NOTE(review): JSON is normally UTF-8; decoding as latin1 never raises
    # but turns every multi-byte UTF-8 sequence into mojibake. Kept unchanged
    # so the pushed data stays the same -- confirm whether utf-8 was intended.
    with open(path, encoding='latin1') as f:
        records = json.load(f)
    if not isinstance(records, list):
        raise TypeError(
            f"{path}: expected a top-level JSON list, "
            f"got {type(records).__name__}"
        )
    return records


def main():
    """Merge all source files, shuffle them, and push the result to the Hub.

    Every entry of ``_SOURCE_FILES`` is loaded and appended in order, the
    combined list is shuffled in place with ``random.shuffle`` (unseeded, so
    the order differs between runs), and the result is uploaded as a
    single-column dataset (column ``'text'``) named ``'ChatCombined'``.
    """
    total = []
    for path in _SOURCE_FILES:
        total.extend(_load_json_list(path))
    shuffle(total)
    # The result of push_to_hub is intentionally not kept; nothing uses it.
    datasets.Dataset.from_dict({'text': total}).push_to_hub('ChatCombined')

    # Disabled variant: combine only the Alpaca and ShareGPT lists and push
    # them as 'InstructFollowing'.
    # combined = alpaca_list + sharegpt_list
    # shuffle(combined)
    # dataset = datasets.Dataset.from_dict({'text': combined}).push_to_hub('InstructFollowing')


if __name__ == '__main__':
    main()