Skip to content

Commit 0abcb9a

Browse files
IcyFeather233IcyFeather
authored and committed
add llm-benchmarks proposal
Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> add opencompass and llm singletask learning bench Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm single task learning bench readme Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> add government benchmark Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update government benchmark Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm government benchmark implementation Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm government benchmark implementation Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update government README Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm benchmark format Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update government benchmark Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update government benchmark dataset Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> add llm-benchmarks proposal Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm benchmark proposal Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm benchmark proposal Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm benchmark proposal Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> translate llm-benchmark proposal Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update proposal, add opencompass tutorial Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update government benchmark sedna package Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update government benchmark Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm benchmark format Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> update llm benchmark format Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> fix pylint check problem Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> fix pylint check problem Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> fix pylint check problem Signed-off-by: IcyFeather 
<mengzhuo.happy@gmail.com> trans Chinese comments to English Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com> add government llm benchmark Signed-off-by: IcyFeather <mengzhuo.happy@gmail.com>
1 parent 5c48872 commit 0abcb9a

32 files changed

Lines changed: 2114 additions & 9 deletions

File tree

core/common/constant.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ class DatasetFormat(Enum):
2525
CSV = "csv"
2626
TXT = "txt"
2727
JSON = "json"
28+
JSONL = "jsonl"
2829

2930

3031
class ParadigmType(Enum):

core/testcasecontroller/algorithm/paradigm/singletask_learning/singletask_learning.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,5 +84,8 @@ def _inference(self, job, trained_model):
8484
inference_output_dir = os.path.join(self.workspace, "output/inference/")
8585
os.environ["RESULT_SAVED_URL"] = inference_output_dir
8686
job.load(trained_model)
87-
infer_res = job.predict(inference_dataset.x)
87+
if hasattr(inference_dataset, 'need_other_info'):
88+
infer_res = job.predict(inference_dataset)
89+
else:
90+
infer_res = job.predict(inference_dataset.x)
8891
return infer_res

core/testenvmanager/dataset/dataset.py

Lines changed: 69 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,16 @@
1616

1717
import os
1818
import tempfile
19-
2019
import pandas as pd
21-
from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse
22-
20+
# pylint: disable=no-name-in-module
21+
# pylint: disable=too-many-instance-attributes
22+
from sedna.datasources import (
23+
CSVDataParse,
24+
TxtDataParse,
25+
JSONDataParse,
26+
JsonlDataParse,
27+
JSONMetaDataParse,
28+
)
2329
from core.common import utils
2430
from core.common.constant import DatasetFormat
2531

@@ -38,12 +44,28 @@ class Dataset:
3844
def __init__(self, config):
3945
self.train_url: str = ""
4046
self.test_url: str = ""
47+
self.train_index: str = ""
48+
self.test_index: str = ""
49+
self.train_data: str = ""
50+
self.test_data: str = ""
51+
self.train_data_info: str = ""
52+
self.test_data_info: str = ""
4153
self.label: str = ""
4254
self._parse_config(config)
4355

4456
def _check_fields(self):
45-
self._check_dataset_url(self.train_url)
46-
self._check_dataset_url(self.test_url)
57+
if self.train_index:
58+
self._check_dataset_url(self.train_index)
59+
if self.test_index:
60+
self._check_dataset_url(self.test_index)
61+
if self.train_data:
62+
self._check_dataset_url(self.train_data)
63+
if self.test_data:
64+
self._check_dataset_url(self.test_data)
65+
if self.train_data_info:
66+
self._check_dataset_url(self.train_data_info)
67+
if self.test_data_info:
68+
self._check_dataset_url(self.test_data_info)
4769

4870
def _parse_config(self, config):
4971
for attr, value in config.items():
@@ -103,6 +125,20 @@ def _process_index_file(self, file_url):
103125

104126
return None
105127

128+
def _process_data_file(self, file_url):
129+
file_format = utils.get_file_format(file_url)
130+
if file_format == DatasetFormat.JSONL.value:
131+
return file_url
132+
133+
return None
134+
135+
def _process_data_info_file(self, file_url):
136+
file_format = utils.get_file_format(file_url)
137+
if file_format == DatasetFormat.JSON.value:
138+
return file_url
139+
140+
return None
141+
106142
def process_dataset(self):
107143
"""
108144
process dataset:
@@ -111,9 +147,26 @@ def process_dataset(self):
111147
in the index file(e.g.: txt index file).
112148
113149
"""
150+
if self.train_index:
151+
self.train_url = self._process_index_file(self.train_index)
152+
elif self.train_data:
153+
self.train_url = self._process_data_file(self.train_data)
154+
elif self.train_data_info:
155+
self.train_url = self._process_data_info_file(self.train_data_info)
156+
# raise NotImplementedError('to be done')
157+
else:
158+
raise NotImplementedError('not one of train_index/train_data/train_data_info')
159+
160+
if self.test_index:
161+
self.test_url = self._process_index_file(self.test_index)
162+
elif self.test_data:
163+
self.test_url = self._process_data_file(self.test_data)
164+
elif self.test_data_info:
165+
self.test_url = self._process_data_info_file(self.test_data_info)
166+
# raise NotImplementedError('to be done')
167+
else:
168+
raise NotImplementedError('not one of test_index/test_data/test_data_info')
114169

115-
self.train_url = self._process_index_file(self.train_url)
116-
self.test_url = self._process_index_file(self.test_url)
117170

118171
# pylint: disable=too-many-arguments
119172
def split_dataset(self, dataset_url, dataset_format, ratio, method="default",
@@ -388,6 +441,11 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
388441
e.g.: TxtDataParse, CSVDataParse.
389442
390443
"""
444+
if file.split('/')[-1] == "metadata.json":
445+
data = JSONMetaDataParse(data_type=data_type, func=feature_process)
446+
data.parse(file)
447+
return data
448+
391449
data_format = utils.get_file_format(file)
392450

393451
data = None
@@ -397,11 +455,14 @@ def load_data(cls, file: str, data_type: str, label=None, use_raw=False, feature
397455

398456
if data_format == DatasetFormat.TXT.value:
399457
data = TxtDataParse(data_type=data_type, func=feature_process)
400-
#print(file)
401458
data.parse(file, use_raw=use_raw)
402459

403460
if data_format == DatasetFormat.JSON.value:
404461
data = JSONDataParse(data_type=data_type, func=feature_process)
405462
data.parse(file)
406463

464+
if data_format == DatasetFormat.JSONL.value:
465+
data = JsonlDataParse(data_type=data_type, func=feature_process)
466+
data.parse(file)
467+
407468
return data
16.2 KB
Loading
181 KB
Loading
53.6 KB
Loading
159 KB
Loading
64.3 KB
Loading

0 commit comments

Comments
 (0)