-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
28 lines (25 loc) · 1.1 KB
/
utils.py
File metadata and controls
28 lines (25 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from datasets import Dataset, DatasetDict
def read_txt(input_txt: str):
    """Yield ``(schema, ontology)`` pairs parsed from a ``=``-delimited file.

    Each line is expected to hold three ``=``-separated fields. The first
    field is discarded; the second and third are yielded with surrounding
    whitespace removed. Only the first two ``=`` characters act as
    separators, so the third field may itself contain ``=``.

    Lines that do not split into exactly three fields (blank lines included)
    are reported on stdout with their 1-based line number and skipped.

    Args:
        input_txt: Path of the UTF-8 encoded text file to read.

    Yields:
        Tuples ``(schema, ontology)`` of stripped strings, in file order.
    """
    with open(input_txt, 'r', encoding='utf-8') as fin:
        for lineno, line in enumerate(fin, 1):
            # maxsplit=2 keeps any further '=' inside the last field.
            parts = line.strip().split('=', 2)
            if len(parts) != 3:
                print(f"⚠️ Skipping malformed line {lineno}")
                continue
            _, schema, ontology = (p.strip() for p in parts)
            yield schema, ontology
def build_examples(train_path, test_path, system_prompt) -> DatasetDict:
    """Build a chat-formatted ``DatasetDict`` from a train and a test file.

    Both files are parsed with ``read_txt``, which yields
    ``(schema, ontology)`` pairs. Every pair becomes one example holding the
    raw user text and a three-message conversation (system, user, assistant).

    Args:
        train_path: Path of the file whose pairs fill the ``"train"`` split.
        test_path: Path of the file whose pairs fill the ``"eval"`` split.
        system_prompt: Content of the system message placed in every example.

    Returns:
        A ``DatasetDict`` with the splits ``"train"`` and ``"eval"``.
    """

    def row(user_text: str, assistant_text: str):
        # One example: the first field of the pair is the user turn (also
        # stored on its own under "user_text"), the second is the reply.
        return {
            "user_text": user_text,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text},
                {"role": "assistant", "content": assistant_text},
            ],
        }

    # The training file is consumed completely before the test file is opened.
    train = [row(schema, ontology) for schema, ontology in read_txt(train_path)]
    eval_ = [row(schema, ontology) for schema, ontology in read_txt(test_path)]

    return DatasetDict({
        "train": Dataset.from_list(train),
        "eval": Dataset.from_list(eval_),
    })