diff --git a/core/common/constant.py b/core/common/constant.py
index aa111ddb..d8723ae4 100644
--- a/core/common/constant.py
+++ b/core/common/constant.py
@@ -26,6 +26,7 @@ class DatasetFormat(Enum):
     CSV = "csv"
     TXT = "txt"
     JSON = "json"
+    JSONL = "jsonl"
 
 
 class ParadigmType(Enum):
diff --git a/core/testcasecontroller/algorithm/paradigm/singletask_learning/singletask_learning.py b/core/testcasecontroller/algorithm/paradigm/singletask_learning/singletask_learning.py
index 19972538..3194b7cd 100644
--- a/core/testcasecontroller/algorithm/paradigm/singletask_learning/singletask_learning.py
+++ b/core/testcasecontroller/algorithm/paradigm/singletask_learning/singletask_learning.py
@@ -84,5 +84,8 @@ def _inference(self, job, trained_model):
         inference_output_dir = os.path.join(self.workspace, "output/inference/")
         os.environ["RESULT_SAVED_URL"] = inference_output_dir
         job.load(trained_model)
-        infer_res = job.predict(inference_dataset.x)
+        if hasattr(inference_dataset, 'need_other_info'):
+            infer_res = job.predict(inference_dataset)
+        else:
+            infer_res = job.predict(inference_dataset.x)
         return infer_res
diff --git a/core/testenvmanager/dataset/dataset.py b/core/testenvmanager/dataset/dataset.py
index e07f5601..2edc960f 100644
--- a/core/testenvmanager/dataset/dataset.py
+++ b/core/testenvmanager/dataset/dataset.py
@@ -16,10 +16,16 @@
 
 import os
 import tempfile
-
 import pandas as pd
-from sedna.datasources import CSVDataParse, TxtDataParse, JSONDataParse
-
+# pylint: disable=no-name-in-module
+# pylint: disable=too-many-instance-attributes
+from sedna.datasources import (
+    CSVDataParse,
+    TxtDataParse,
+    JSONDataParse,
+    JsonlDataParse,
+    JSONMetaDataParse,
+)
 from core.common import utils
 from core.common.constant import DatasetFormat
 
@@ -38,12 +44,28 @@ class Dataset:
 
     def __init__(self, config):
         self.train_url: str = ""
         self.test_url: str = ""
+        self.train_index: str = ""
+        self.test_index: str = ""
+        self.train_data: str = ""
+        self.test_data: str = ""
+        self.train_data_info: str = ""
+        self.test_data_info: str = ""
         self.label: str = ""
         self._parse_config(config)
 
     def _check_fields(self):
-        self._check_dataset_url(self.train_url)
-        self._check_dataset_url(self.test_url)
+        if self.train_index:
+            self._check_dataset_url(self.train_index)
+        if self.test_index:
+            self._check_dataset_url(self.test_index)
+        if self.train_data:
+            self._check_dataset_url(self.train_data)
+        if self.test_data:
+            self._check_dataset_url(self.test_data)
+        if self.train_data_info:
+            self._check_dataset_url(self.train_data_info)
+        if self.test_data_info:
+            self._check_dataset_url(self.test_data_info)
 
     def _parse_config(self, config):
         for attr, value in config.items():
@@ -108,6 +130,20 @@ def _process_index_file(self, file_url):
 
         return None
 
+    def _process_data_file(self, file_url):
+        file_format = utils.get_file_format(file_url)
+        if file_format == DatasetFormat.JSONL.value:
+            return file_url
+
+        return None
+
+    def _process_data_info_file(self, file_url):
+        file_format = utils.get_file_format(file_url)
+        if file_format == DatasetFormat.JSON.value:
+            return file_url
+
+        return None
+
     def process_dataset(self):
         """
         process dataset:
         in the index file(e.g.: txt index file).
""" + if self.train_index: + self.train_url = self._process_index_file(self.train_index) + elif self.train_data: + self.train_url = self._process_data_file(self.train_data) + elif self.train_data_info: + self.train_url = self._process_data_info_file(self.train_data_info) + # raise NotImplementedError('to be done') + else: + raise NotImplementedError('not one of train_index/train_data/train_data_info') + + if self.test_index: + self.test_url = self._process_index_file(self.test_index) + elif self.test_data: + self.test_url = self._process_data_file(self.test_data) + elif self.test_data_info: + self.test_url = self._process_data_info_file(self.test_data_info) + # raise NotImplementedError('to be done') + else: + raise NotImplementedError('not one of test_index/test_data/test_data_info') - self.train_url = self._process_index_file(self.train_url) - self.test_url = self._process_index_file(self.test_url) # pylint: disable=too-many-arguments def split_dataset( @@ -514,6 +567,11 @@ def load_data( e.g.: TxtDataParse, CSVDataParse. """ + if file.split('/')[-1] == "metadata.json": + data = JSONMetaDataParse(data_type=data_type, func=feature_process) + data.parse(file) + return data + data_format = utils.get_file_format(file) data = None @@ -523,11 +581,14 @@ def load_data( if data_format == DatasetFormat.TXT.value: data = TxtDataParse(data_type=data_type, func=feature_process) - # print(file) data.parse(file, use_raw=use_raw) if data_format == DatasetFormat.JSON.value: data = JSONDataParse(data_type=data_type, func=feature_process) data.parse(file) + if data_format == DatasetFormat.JSONL.value: + data = JsonlDataParse(data_type=data_type, func=feature_process) + data.parse(file) + return data diff --git a/examples/government/singletask_learning_bench/README.md b/examples/government/singletask_learning_bench/README.md new file mode 100644 index 00000000..22dfbfed --- /dev/null +++ b/examples/government/singletask_learning_bench/README.md @@ -0,0 +1,104 @@ +# Government BenchMark + +## Introduction + +This is the work for Domain-specific Large Model Benchmark: + +Constructs a suite for the government sector, including test datasets, evaluation metrics, testing environments, and usage guidelines. + +This Benchmark consists of two parts: subjective evaluation data and objective evaluation data. + +## Design + +### Metadata Format + +| Name | Field Name | Option | Description | +| --- | --- | --- | --- | +| Data Name | dataset | Required | Name of the dataset | +| Data Description | description | Optional | Dataset description, such as usage scope, sample size, etc. | +| First-level Dimension | level_1_dim | Required | Should fill in "Single Modal" or "Multi-Modal" | +| Second-level Dimension | level_2_dim | Required | For "Single Modal", fill in "Text", "Image", or "Audio". For "Multi-Modal", fill in "Text-Image", "Text-Audio", "Image-Audio", or "Text-Image-Audio" | +| Third-level Dimension | level_3_dim | Optional | Should be filled if all samples in the dataset have the same third-level dimension. If filled, content should be based on the standards shown in the normative reference document | +| Fourth-level Dimension | level_4_dim | Optional | Should be filled if all samples in the dataset have the same third-level dimension. 
+
+### Data Format
+
+| Name | Option | Information |
+| --- | --- | --- |
+| prompt | Optional | the background of the LLM testing |
+| query | Required | the testing question |
+| response | Required | the answer to the question |
+| explanation | Optional | the explanation of the answer |
+| judge_prompt | Optional | the prompt of the judge model |
+| level_1_dim | Optional | single-modal or multi-modal |
+| level_2_dim | Optional | single-modal: text, image, video; multi-modal: text-image, text-video, text-image-video |
+| level_3_dim | Required | details |
+| level_4_dim | Required | details |
+
+data example:
+
+```json
+{
+    "prompt": "Please think step by step and answer the question.",
+    "question": "Which one is the correct answer of xxx? A. xxx B. xxx C. xxx D. xxx",
+    "response": "C",
+    "explanation": "xxx",
+    "level_1_dim": "single-modal",
+    "level_2_dim": "text",
+    "level_3_dim": "knowledge Q&A",
+    "level_4_dim": "medical knowledge"
+}
+```
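+
+Each line of `data.jsonl` holds one JSON object in this format. As an illustration only (the benchmark itself parses data through sedna's data parsers; `read_jsonl` is a hypothetical helper), such a file could be read into query/response pairs like this:
+
+```python
+import json
+
+def read_jsonl(path):
+    # one JSON object per line; skip blank lines
+    samples = []
+    with open(path, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                samples.append(json.loads(line))
+    return samples
+
+# e.g. collect the test questions and reference answers:
+# samples = read_jsonl("dataset/government/objective/test_data/data.jsonl")
+# x = [s["query"] for s in samples]
+# y = [s["response"] for s in samples]
+```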
+
+## Changes to Core Code
+
+![](./imgs/structure.png)
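+
+The diagram above summarizes the core-code changes: `testenv.yaml` may now point either at a raw `data.jsonl` (`train_data`/`test_data`) or at a `metadata.json` (`train_data_info`/`test_data_info`). A file named `metadata.json` is routed to `JSONMetaDataParse`, and a parser exposing a `need_other_info` attribute receives the whole dataset object in `predict` instead of only `dataset.x`. Below is a rough, illustrative sketch of such a parser; the real implementation lives in the modified sedna package and may differ, and `MetaDataParseSketch` is a hypothetical name:
+
+```python
+import json
+
+class MetaDataParseSketch:
+    """Hypothetical stand-in for sedna's JSONMetaDataParse."""
+    need_other_info = True  # signals the paradigm to pass the whole object to predict
+
+    def __init__(self, data_type, func=None):
+        self.data_type = data_type
+        self.x, self.y = [], []
+
+    def parse(self, metadata_file):
+        with open(metadata_file, "r", encoding="utf-8") as f:
+            meta = json.load(f)
+        self.dataset_name = meta["dataset"]
+        self.description = meta.get("description", "")
+        self.level_1_dim = meta.get("level_1_dim", "")
+        self.level_2_dim = meta.get("level_2_dim", "")
+        # the actual parser also loads the data.jsonl next to metadata.json into self.x / self.y
+```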
+
+## Prepare Datasets
+
+You can download the dataset from [Kaggle](https://www.kaggle.com/datasets/kubeedgeianvs/the-government-affairs-dataset-govaff/data?select=government_benchmark).
+
+```
+dataset/government
+├── objective
+│   ├── test_data
+│   │   ├── data.jsonl
+│   │   └── metadata.json
+│   └── train_data
+└── subjective
+    ├── test_data
+    │   ├── data_full.jsonl
+    │   ├── data.jsonl
+    │   └── metadata.json
+    └── train_data
+```
+
+## Prepare Environment
+
+You should modify your sedna package as in this commit: [my sedna repo commit](https://github.com/IcyFeather233/sedna/commit/e13b82363c03dc771fca4922a24798554ca32a9f)
+
+Or you can replace the files in `yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages/sedna` with the contents of `examples/resources/sedna-llm.zip`.
+
+## Run Ianvs
+
+### Objective
+
+`ianvs -f examples/government/singletask_learning_bench/objective/benchmarkingjob.yaml`
+
+### Subjective
+
+`ianvs -f examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml`
\ No newline at end of file
diff --git a/examples/government/singletask_learning_bench/imgs/structure.png b/examples/government/singletask_learning_bench/imgs/structure.png
new file mode 100644
index 00000000..22c1695e
Binary files /dev/null and b/examples/government/singletask_learning_bench/imgs/structure.png differ
diff --git a/examples/government/singletask_learning_bench/objective/benchmarkingjob.yaml b/examples/government/singletask_learning_bench/objective/benchmarkingjob.yaml
new file mode 100644
index 00000000..38c8f2c5
--- /dev/null
+++ b/examples/government/singletask_learning_bench/objective/benchmarkingjob.yaml
@@ -0,0 +1,72 @@
+benchmarkingjob:
+  # job name of benchmarking; string type;
+  name: "benchmarkingjob"
+  # the url address of job workspace that will reserve the output of tests; string type;
+  workspace: "/home/icyfeather/project/ianvs/workspace"
+
+  # the url address of test environment configuration file; string type;
+  # the file format supports yaml/yml;
+  testenv: "./examples/government/singletask_learning_bench/objective/testenv/testenv.yaml"
+
+  # the configuration of test object
+  test_object:
+    # test type; string type;
+    # currently the option of value is "algorithms"; the others will be added in succession.
+    type: "algorithms"
+    # test algorithm configuration files; list type;
+    algorithms:
+      # algorithm name; string type;
+      - name: "politic_bench_singletask_learning"
+        # the url address of test algorithm configuration file; string type;
+        # the file format supports yaml/yml;
+        url: "./examples/government/singletask_learning_bench/objective/testalgorithms/gen/gen_algorithm.yaml"
+
+  # the configuration of ranking leaderboard
+  rank:
+    # rank leaderboard with metric of test case's evaluation and order; list type;
+    # the sorting priority is based on the sequence of metrics in the list from front to back;
+    sort_by: [ { "acc": "descend" } ]
+
+  # visualization configuration
+  visualization:
+    # mode of visualization in the leaderboard; string type;
+    # There are quite a few possible dataitems in the leaderboard. Not all of them can be shown simultaneously on the screen.
+    # In the leaderboard, we provide the "selected_only" mode for the user to configure what is shown or is not shown.
+    mode: "selected_only"
+    # method of visualization for selected dataitems; string type;
+    # currently the options of value are as follows:
+    #   1> "print_table": print selected dataitems;
+    method: "print_table"
+
+  # selected dataitem configuration
+  # The user can add the dataitems he/she is interested in, in terms of "paradigms", "modules", "hyperparameters" and "metrics",
+  # so that the selected columns will be shown.
+  selected_dataitem:
+    # currently the options of value are as follows:
+    #   1> "all": select all paradigms in the leaderboard;
+    #   2> paradigms in the leaderboard, e.g., "singletasklearning"
+    paradigms: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all modules in the leaderboard;
+    #   2> modules in the leaderboard, e.g., "basemodel"
+    modules: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all hyperparameters in the leaderboard;
+    #   2> hyperparameters in the leaderboard, e.g., "momentum"
+    hyperparameters: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all metrics in the leaderboard;
+    #   2> metrics in the leaderboard, e.g., "f1_score"
+    metrics: [ "acc" ]
+
+  # mode of saving selected and all dataitems in workspace; string type;
+  # currently the options of value are as follows:
+  #   1> "selected_and_all": save selected and all dataitems;
+  #   2> "selected_only": save selected dataitems;
+  save_mode: "selected_and_all"
"./examples/government/singletask_learning_bench/objective/testalgorithms/gen/basemodel.py" \ No newline at end of file diff --git a/examples/government/singletask_learning_bench/objective/testenv/acc.py b/examples/government/singletask_learning_bench/objective/testenv/acc.py new file mode 100644 index 00000000..a4041f48 --- /dev/null +++ b/examples/government/singletask_learning_bench/objective/testenv/acc.py @@ -0,0 +1,39 @@ +# Copyright 2022 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from sedna.common.class_factory import ClassType, ClassFactory + +__all__ = ["acc"] + +def get_last_letter(input_string): + if not input_string or not any(char.isalpha() for char in input_string): + return None + + for char in reversed(input_string): + if 'A' <= char <= 'D': + return char + + return None + + +@ClassFactory.register(ClassType.GENERAL, alias="acc") +def acc(y_true, y_pred): + y_pred = [get_last_letter(pred) for pred in y_pred] + y_true = [get_last_letter(pred) for pred in y_true] + + same_elements = [y_pred[i] == y_true[i] for i in range(len(y_pred))] + + acc = sum(same_elements) / len(same_elements) + + return acc diff --git a/examples/government/singletask_learning_bench/objective/testenv/testenv.yaml b/examples/government/singletask_learning_bench/objective/testenv/testenv.yaml new file mode 100644 index 00000000..e3a13834 --- /dev/null +++ b/examples/government/singletask_learning_bench/objective/testenv/testenv.yaml @@ -0,0 +1,14 @@ +testenv: + # dataset configuration + dataset: + # the url address of train dataset index; string type; + train_data: "/home/icyfeather/Projects/ianvs/dataset/government/objective/train_data/data.jsonl" + # the url address of test dataset index; string type; + test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/objective/test_data/metadata.json" + + # metrics configuration for test case's evaluation; list type; + metrics: + # metric name; string type; + - name: "acc" + # the url address of python file + url: "./examples/government/singletask_learning_bench/objective/testenv/acc.py" diff --git a/examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml b/examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml new file mode 100644 index 00000000..26008c3c --- /dev/null +++ b/examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml @@ -0,0 +1,72 @@ +benchmarkingjob: + # job name of bechmarking; string type; + name: "benchmarkingjob" + # the url address of job workspace that will reserve the output of tests; string type; + workspace: "/home/icyfeather/project/ianvs/workspace" + + # the url address of test environment configuration file; string type; + # the file format supports yaml/yml; + testenv: "./examples/government/singletask_learning_bench/subjective/testenv/testenv.yaml" + + # the configuration of test object + test_object: + # test type; string type; + # currently the option of value is "algorithms",the others will be added in succession. 
+            indices = random.sample([i for i, l in enumerate(data.x) if l != line], 3)
+            history = []
+            for idx in indices:
+                history.append({"role": "user", "content": data.x[idx]})
+                history.append({"role": "assistant", "content": data.y[idx]})
+            history.append({"role": "user", "content": line})
+            response = self._infer(history)
+            answer_list.append(response)
+        return answer_list
+
+    def load(self, model_url=None):
+        LOGGER.info("BaseModel load")
+
+    def evaluate(self, data, model_path, **kwargs):
+        LOGGER.info("BaseModel evaluate")
+
+    def _infer(self, messages):
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(device)
+
+        generated_ids = self.model.generate(
+            model_inputs.input_ids,
+            max_new_tokens=512,
+            temperature=0.1,
+            top_p=0.9
+        )
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+
+        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return response
diff --git a/examples/government/singletask_learning_bench/objective/testalgorithms/gen/gen_algorithm.yaml b/examples/government/singletask_learning_bench/objective/testalgorithms/gen/gen_algorithm.yaml
new file mode 100644
index 00000000..3167cbe8
--- /dev/null
+++ b/examples/government/singletask_learning_bench/objective/testalgorithms/gen/gen_algorithm.yaml
@@ -0,0 +1,18 @@
+algorithm:
+  # paradigm name; string type;
+  # currently the options of value are as follows:
+  #   1> "singletasklearning"
+  #   2> "incrementallearning"
+  paradigm_type: "singletasklearning"
+
+  # algorithm module configuration in the paradigm; list type;
+  modules:
+    # kind of algorithm module; string type;
+    # currently the options of value are as follows:
+    #   1> "basemodel"
+    - type: "basemodel"
+      # name of python module; string type;
+      # example: basemodel.py has a BaseModel class whose alias is "gen" for this benchmarking;
+      name: "gen"
+      # the url address of python module; string type;
+      url: "./examples/government/singletask_learning_bench/objective/testalgorithms/gen/basemodel.py"
\ No newline at end of file
diff --git a/examples/government/singletask_learning_bench/objective/testenv/acc.py b/examples/government/singletask_learning_bench/objective/testenv/acc.py
new file mode 100644
index 00000000..a4041f48
--- /dev/null
+++ b/examples/government/singletask_learning_bench/objective/testenv/acc.py
@@ -0,0 +1,39 @@
+# Copyright 2022 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["acc"]
+
+def get_last_letter(input_string):
+    # return the last uppercase choice letter (A-D) in the string, or None
+    if not input_string or not any(char.isalpha() for char in input_string):
+        return None
+
+    for char in reversed(input_string):
+        if 'A' <= char <= 'D':
+            return char
+
+    return None
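+
+# examples (illustrative):
+#   get_last_letter("The answer is C.") -> 'C'
+#   get_last_letter("answer: none of the above") -> None (counts as wrong)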
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="acc")
+def acc(y_true, y_pred):
+    y_pred = [get_last_letter(pred) for pred in y_pred]
+    y_true = [get_last_letter(true) for true in y_true]
+
+    same_elements = [y_pred[i] == y_true[i] for i in range(len(y_pred))]
+
+    acc = sum(same_elements) / len(same_elements)
+
+    return acc
diff --git a/examples/government/singletask_learning_bench/objective/testenv/testenv.yaml b/examples/government/singletask_learning_bench/objective/testenv/testenv.yaml
new file mode 100644
index 00000000..e3a13834
--- /dev/null
+++ b/examples/government/singletask_learning_bench/objective/testenv/testenv.yaml
@@ -0,0 +1,14 @@
+testenv:
+  # dataset configuration
+  dataset:
+    # the url address of the train dataset file; string type;
+    train_data: "/home/icyfeather/Projects/ianvs/dataset/government/objective/train_data/data.jsonl"
+    # the url address of the test dataset metadata file; string type;
+    test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/objective/test_data/metadata.json"
+
+  # metrics configuration for test case's evaluation; list type;
+  metrics:
+    # metric name; string type;
+    - name: "acc"
+      # the url address of python file
+      url: "./examples/government/singletask_learning_bench/objective/testenv/acc.py"
diff --git a/examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml b/examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml
new file mode 100644
index 00000000..26008c3c
--- /dev/null
+++ b/examples/government/singletask_learning_bench/subjective/benchmarkingjob.yaml
@@ -0,0 +1,72 @@
+benchmarkingjob:
+  # job name of benchmarking; string type;
+  name: "benchmarkingjob"
+  # the url address of job workspace that will reserve the output of tests; string type;
+  workspace: "/home/icyfeather/project/ianvs/workspace"
+
+  # the url address of test environment configuration file; string type;
+  # the file format supports yaml/yml;
+  testenv: "./examples/government/singletask_learning_bench/subjective/testenv/testenv.yaml"
+
+  # the configuration of test object
+  test_object:
+    # test type; string type;
+    # currently the option of value is "algorithms"; the others will be added in succession.
+    type: "algorithms"
+    # test algorithm configuration files; list type;
+    algorithms:
+      # algorithm name; string type;
+      - name: "politic_bench_singletask_learning"
+        # the url address of test algorithm configuration file; string type;
+        # the file format supports yaml/yml;
+        url: "./examples/government/singletask_learning_bench/subjective/testalgorithms/gen/gen_algorithm.yaml"
+
+  # the configuration of ranking leaderboard
+  rank:
+    # rank leaderboard with metric of test case's evaluation and order; list type;
+    # the sorting priority is based on the sequence of metrics in the list from front to back;
+    sort_by: [ { "llm_judgement": "descend" } ]
+
+  # visualization configuration
+  visualization:
+    # mode of visualization in the leaderboard; string type;
+    # There are quite a few possible dataitems in the leaderboard. Not all of them can be shown simultaneously on the screen.
+    # In the leaderboard, we provide the "selected_only" mode for the user to configure what is shown or is not shown.
+    mode: "selected_only"
+    # method of visualization for selected dataitems; string type;
+    # currently the options of value are as follows:
+    #   1> "print_table": print selected dataitems;
+    method: "print_table"
+
+  # selected dataitem configuration
+  # The user can add the dataitems he/she is interested in, in terms of "paradigms", "modules", "hyperparameters" and "metrics",
+  # so that the selected columns will be shown.
+  selected_dataitem:
+    # currently the options of value are as follows:
+    #   1> "all": select all paradigms in the leaderboard;
+    #   2> paradigms in the leaderboard, e.g., "singletasklearning"
+    paradigms: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all modules in the leaderboard;
+    #   2> modules in the leaderboard, e.g., "basemodel"
+    modules: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all hyperparameters in the leaderboard;
+    #   2> hyperparameters in the leaderboard, e.g., "momentum"
+    hyperparameters: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all metrics in the leaderboard;
+    #   2> metrics in the leaderboard, e.g., "f1_score"
+    metrics: [ "llm_judgement" ]
+
+  # mode of saving selected and all dataitems in workspace; string type;
+  # currently the options of value are as follows:
+  #   1> "selected_and_all": save selected and all dataitems;
+  #   2> "selected_only": save selected dataitems;
+  save_mode: "selected_and_all"
diff --git a/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/basemodel.py b/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/basemodel.py
new file mode 100644
index 00000000..ee7f2585
--- /dev/null
+++ b/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/basemodel.py
@@ -0,0 +1,131 @@
+# Copyright 2022 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division
+
+import os
+import logging
+
+from tqdm import tqdm
+from sedna.common.class_factory import ClassType, ClassFactory
+from core.common.log import LOGGER
+from openai import OpenAI
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cuda"  # the device to load the model onto
+
+logging.disable(logging.WARNING)
+
+__all__ = ["BaseModel"]
+
+os.environ['BACKEND_TYPE'] = 'TORCH'
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="gen")
+class BaseModel:
+
+    def __init__(self, **kwargs):
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "/home/icyfeather/models/Qwen2-0.5B-Instruct",
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained("/home/icyfeather/models/Qwen2-0.5B-Instruct")
+
+    def train(self, train_data, valid_data=None, **kwargs):
+        LOGGER.info("BaseModel train")
+
+    def save(self, model_path):
+        LOGGER.info("BaseModel save")
+
+    def predict(self, data, input_shape=None, **kwargs):
+        LOGGER.info("BaseModel predict")
+        LOGGER.info(f"Dataset: {data.dataset_name}")
+        LOGGER.info(f"Description: {data.description}")
+        LOGGER.info(f"Data Level 1 Dim: {data.level_1_dim}")
+        LOGGER.info(f"Data Level 2 Dim: {data.level_2_dim}")
+
+        answer_list = []
+        for line in tqdm(data.x, desc="Processing", unit="question"):
+            history = []
+            history.append({"role": "user", "content": line})
+            response = self._infer(history)
+            answer_list.append(response)
+
+        judgement_list = []
+
+        # evaluate by llm: each sample's judge prompt is concatenated with the model's answer
+        for index in tqdm(range(len(answer_list)), desc="Evaluating", ascii=False, ncols=75):
+            prompt = data.judge_prompts[index] + answer_list[index]
+            judgement = self._openai_generate(prompt)
+            judgement_list.append(judgement)
+
+        return judgement_list
+
+    def load(self, model_url=None):
+        LOGGER.info("BaseModel load")
+
+    def evaluate(self, data, model_path, **kwargs):
+        LOGGER.info("BaseModel evaluate")
+
+    def _infer(self, messages):
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(device)
+
+        generated_ids = self.model.generate(
+            model_inputs.input_ids,
+            max_new_tokens=512,
+            temperature=0.1,
+            top_p=0.9
+        )
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+
+        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return response
+
+    def _openai_generate(self, user_question, system=None):
+        key = os.getenv("DEEPSEEK_API_KEY")
+        if not key:
+            raise ValueError("You should set DEEPSEEK_API_KEY in your env.")
+        client = OpenAI(api_key=key, base_url="https://api.deepseek.com")
+
+        messages = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": user_question})
+
+        response = client.chat.completions.create(
+            model="deepseek-chat",
+            messages=messages,
+            stream=False
+        )
+
+        res = response.choices[0].message.content
+
+        return res
\ No newline at end of file
diff --git a/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/gen_algorithm.yaml b/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/gen_algorithm.yaml
new file mode 100644
index 00000000..f20e9047
--- /dev/null
+++ b/examples/government/singletask_learning_bench/subjective/testalgorithms/gen/gen_algorithm.yaml
@@ -0,0 +1,18 @@
+algorithm:
+  # paradigm name; string type;
+  # currently the options of value are as follows:
+  #   1> "singletasklearning"
+  #   2> "incrementallearning"
+  paradigm_type: "singletasklearning"
+
+  # algorithm module configuration in the paradigm; list type;
+  modules:
+    # kind of algorithm module; string type;
+    # currently the options of value are as follows:
+    #   1> "basemodel"
+    - type: "basemodel"
+      # name of python module; string type;
+      # example: basemodel.py has a BaseModel class whose alias is "gen" for this benchmarking;
+      name: "gen"
+      # the url address of python module; string type;
+      url: "./examples/government/singletask_learning_bench/subjective/testalgorithms/gen/basemodel.py"
\ No newline at end of file
diff --git a/examples/government/singletask_learning_bench/subjective/testenv/llm_judgement.py b/examples/government/singletask_learning_bench/subjective/testenv/llm_judgement.py
new file mode 100644
index 00000000..97cbc72a
--- /dev/null
+++ b/examples/government/singletask_learning_bench/subjective/testenv/llm_judgement.py
@@ -0,0 +1,42 @@
+# Copyright 2022 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from sedna.common.class_factory import ClassType, ClassFactory
+from core.common.log import LOGGER
+
+__all__ = ["llm_judgement"]
+
+def extract_comprehensive_score(input_str):
+    # extract the overall points from the judge model's output
+    match = re.search(r"'Overall Points': (\d+)", input_str)
+    if match:
+        return int(match.group(1))
+    else:
+        return None
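+
+# e.g. extract_comprehensive_score("{'Relevance': 90, 'Overall Points': 85}") -> 85
+# (returns None when the judge output has no 'Overall Points' field)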
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="llm_judgement")
+def llm_judgement(y_true, y_pred):
+    y_pred = [extract_comprehensive_score(pred) for pred in y_pred]
+
+    valid_scores = [score for score in y_pred if score is not None]
+
+    LOGGER.info(f"Extracted {len(valid_scores)} valid scores from {len(y_pred)} predictions")
+
+    if valid_scores:
+        average_score = sum(valid_scores) / len(valid_scores)
+        return average_score
+    else:
+        return -1
diff --git a/examples/government/singletask_learning_bench/subjective/testenv/testenv.yaml b/examples/government/singletask_learning_bench/subjective/testenv/testenv.yaml
new file mode 100644
index 00000000..f197b2fb
--- /dev/null
+++ b/examples/government/singletask_learning_bench/subjective/testenv/testenv.yaml
@@ -0,0 +1,14 @@
+testenv:
+  # dataset configuration
+  dataset:
+    # the url address of the train dataset file; string type;
+    train_data: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/train_data/data.jsonl"
+    # the url address of the test dataset metadata file; string type;
+    test_data_info: "/home/icyfeather/Projects/ianvs/dataset/government/subjective/test_data/metadata.json"
+
+  # metrics configuration for test case's evaluation; list type;
+  metrics:
+    # metric name; string type;
+    - name: "llm_judgement"
+      # the url address of python file
+      url: "./examples/government/singletask_learning_bench/subjective/testenv/llm_judgement.py"
diff --git a/examples/llm_simple_qa/README.md b/examples/llm_simple_qa/README.md
new file mode 100644
index 00000000..dbaf845a
--- /dev/null
+++ b/examples/llm_simple_qa/README.md
@@ -0,0 +1,84 @@
+# README
+
+## Simple QA
+
+### Prepare Data
+
+The data structure of the simple-qa example is:
+
+```
+.
+├── test_data
+│   └── data.jsonl
+└── train_data
+    └── data.jsonl
+```
+
+`train_data/data.jsonl` is empty, and `test_data/data.jsonl` is as follows:
+
+```
+{
+    "question": "If Xiao Ming has 5 apples, and he gives 3 to Xiao Hua, how many apples does Xiao Ming have left?\nA. 2\nB. 3\nC. 4\nD. 5",
+    "answer": "A"
+}
+{
+    "question": "Which of the following numbers is the smallest prime number?\nA. 0\nB. 1\nC. 2\nD. 4",
+    "answer": "C"
+}
+{
+    "question": "A rectangle has a length of 10 centimeters and a width of 5 centimeters, what is its perimeter in centimeters?\nA. 20 centimeters\nB. 30 centimeters\nC. 40 centimeters\nD. 50 centimeters",
+    "answer": "B"
+}
+{
+    "question": "Which of the following fractions is closest to 1?\nA. 1/2\nB. 3/4\nC. 4/5\nD. 5/6",
+    "answer": "D"
+}
+{
+    "question": "If a number plus 10 equals 30, what is the number?\nA. 20\nB. 21\nC. 22\nD. 23",
+    "answer": "A"
+}
+{
+    "question": "Which of the following expressions has the largest result?\nA. 3 + 4\nB. 5 - 2\nC. 6 * 2\nD. 7 ÷ 2",
+    "answer": "C"
+}
+{
+    "question": "A class has 24 students, and if each student brings 2 books, how many books are there in total?\nA. 48\nB. 36\nC. 24\nD. 12",
+    "answer": "A"
+}
+{
+    "question": "Which of the following is the correct multiplication rhyme?\nA. Three threes are seven\nB. Four fours are sixteen\nC. Five fives are twenty-five\nD. Six sixes are thirty-six",
+    "answer": "B"
+}
+{
+    "question": "If one number is three times another number, and this number is 15, what is the other number?\nA. 5\nB. 10\nC. 15\nD. 45",
+    "answer": "A"
+}
+{
+    "question": "Which of the following shapes has the longest perimeter?\nA. Square\nB. Rectangle\nC. Circle\nD. Triangle",
+    "answer": "C"
+}
+```
+
+### Prepare Environment
+
+You need to install the modified sedna package, which adds `JsonlDataParse` to `sedna.datasources`.
+
+Replace the files in `yourpath/anaconda3/envs/ianvs/lib/python3.x/site-packages/sedna` with the contents of `examples/resources/sedna-with-jsonl.zip`.
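+
+For reference, here is a minimal sketch of what a `JsonlDataParse`-style parser does (illustrative only; the actual class ships with the modified sedna package and may differ, and `JsonlDataParseSketch` is a hypothetical name):
+
+```python
+import json
+
+class JsonlDataParseSketch:
+    """Hypothetical stand-in for sedna's JsonlDataParse."""
+
+    def __init__(self, data_type, func=None):
+        self.data_type = data_type
+        self.x, self.y = [], []
+
+    def parse(self, file):
+        # read one JSON object per line and split into inputs/labels
+        with open(file, "r", encoding="utf-8") as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                sample = json.loads(line)
+                self.x.append(sample["question"])
+                self.y.append(sample["answer"])
+```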
+
+### Run Ianvs
+
+Run the following command:
+
+`ianvs -f examples/llm_simple_qa/benchmarkingjob.yaml`
+
+## OpenCompass Evaluation
+
+### Prepare Environment
+
+`pip install examples/resources/opencompass-0.2.5-py3-none-any.whl`
+
+### Run Evaluation
+
+`python run_op.py examples/llm_simple_qa/testalgorithms/gen/op_eval.py`
+
diff --git a/examples/llm_simple_qa/benchmarkingjob.yaml b/examples/llm_simple_qa/benchmarkingjob.yaml
new file mode 100644
index 00000000..78961e52
--- /dev/null
+++ b/examples/llm_simple_qa/benchmarkingjob.yaml
@@ -0,0 +1,72 @@
+benchmarkingjob:
+  # job name of benchmarking; string type;
+  name: "benchmarkingjob"
+  # the url address of job workspace that will reserve the output of tests; string type;
+  workspace: "/home/icyfeather/project/ianvs/workspace"
+
+  # the url address of test environment configuration file; string type;
+  # the file format supports yaml/yml;
+  testenv: "./examples/llm_simple_qa/testenv/testenv.yaml"
+
+  # the configuration of test object
+  test_object:
+    # test type; string type;
+    # currently the option of value is "algorithms"; the others will be added in succession.
+    type: "algorithms"
+    # test algorithm configuration files; list type;
+    algorithms:
+      # algorithm name; string type;
+      - name: "simple_qa_singletask_learning"
+        # the url address of test algorithm configuration file; string type;
+        # the file format supports yaml/yml;
+        url: "./examples/llm_simple_qa/testalgorithms/gen/gen_algorithm.yaml"
+
+  # the configuration of ranking leaderboard
+  rank:
+    # rank leaderboard with metric of test case's evaluation and order; list type;
+    # the sorting priority is based on the sequence of metrics in the list from front to back;
+    sort_by: [ { "acc": "descend" } ]
+
+  # visualization configuration
+  visualization:
+    # mode of visualization in the leaderboard; string type;
+    # There are quite a few possible dataitems in the leaderboard. Not all of them can be shown simultaneously on the screen.
+    # In the leaderboard, we provide the "selected_only" mode for the user to configure what is shown or is not shown.
+    mode: "selected_only"
+    # method of visualization for selected dataitems; string type;
+    # currently the options of value are as follows:
+    #   1> "print_table": print selected dataitems;
+    method: "print_table"
+
+  # selected dataitem configuration
+  # The user can add the dataitems he/she is interested in, in terms of "paradigms", "modules", "hyperparameters" and "metrics",
+  # so that the selected columns will be shown.
+  selected_dataitem:
+    # currently the options of value are as follows:
+    #   1> "all": select all paradigms in the leaderboard;
+    #   2> paradigms in the leaderboard, e.g., "singletasklearning"
+    paradigms: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all modules in the leaderboard;
+    #   2> modules in the leaderboard, e.g., "basemodel"
+    modules: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all hyperparameters in the leaderboard;
+    #   2> hyperparameters in the leaderboard, e.g., "momentum"
+    hyperparameters: [ "all" ]
+    # currently the options of value are as follows:
+    #   1> "all": select all metrics in the leaderboard;
+    #   2> metrics in the leaderboard, e.g., "f1_score"
+    metrics: [ "acc" ]
+
+  # mode of saving selected and all dataitems in workspace; string type;
+  # currently the options of value are as follows:
+  #   1> "selected_and_all": save selected and all dataitems;
+  #   2> "selected_only": save selected dataitems;
+  save_mode: "selected_and_all"
diff --git a/examples/llm_simple_qa/testalgorithms/gen/basemodel.py b/examples/llm_simple_qa/testalgorithms/gen/basemodel.py
new file mode 100644
index 00000000..fdeedc98
--- /dev/null
+++ b/examples/llm_simple_qa/testalgorithms/gen/basemodel.py
@@ -0,0 +1,98 @@
+# Copyright 2022 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+
+import os
+import logging
+
+from sedna.common.class_factory import ClassType, ClassFactory
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cuda"  # the device to load the model onto
+
+logging.disable(logging.WARNING)
+
+__all__ = ["BaseModel"]
+
+os.environ['BACKEND_TYPE'] = 'TORCH'
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="gen")
+class BaseModel:
+
+    def __init__(self, **kwargs):
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "/home/icyfeather/models/Qwen2-0.5B-Instruct",
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained("/home/icyfeather/models/Qwen2-0.5B-Instruct")
+
+    def train(self, train_data, valid_data=None, **kwargs):
+        print("BaseModel doesn't need to train")
+
+    def save(self, model_path):
+        print("BaseModel doesn't need to save")
+
+    def predict(self, data, input_shape=None, **kwargs):
+        print("BaseModel predict")
+        answer_list = []
+        for line in data:
+            response = self._infer(line)
+            answer_list.append(response)
+        return answer_list
+
+    def load(self, model_url=None):
+        print("BaseModel load")
+
+    def evaluate(self, data, model_path, **kwargs):
+        print("BaseModel evaluate")
+
+    def _infer(self, prompt, system=None):
+        if system:
+            messages = [
+                {"role": "system", "content": system},
+                {"role": "user", "content": prompt}
+            ]
+        else:
+            messages = [
+                {"role": "user", "content": prompt}
+            ]
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(device)
+
+        generated_ids = self.model.generate(
+            model_inputs.input_ids,
+            max_new_tokens=512
+        )
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+
+        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        return response
diff --git a/examples/llm_simple_qa/testalgorithms/gen/gen_algorithm.yaml b/examples/llm_simple_qa/testalgorithms/gen/gen_algorithm.yaml
new file mode 100644
index 00000000..6536ceb9
--- /dev/null
+++ b/examples/llm_simple_qa/testalgorithms/gen/gen_algorithm.yaml
@@ -0,0 +1,18 @@
+algorithm:
+  # paradigm name; string type;
+  # currently the options of value are as follows:
+  #   1> "singletasklearning"
+  #   2> "incrementallearning"
+  paradigm_type: "singletasklearning"
+
+  # algorithm module configuration in the paradigm; list type;
+  modules:
+    # kind of algorithm module; string type;
+    # currently the options of value are as follows:
+    #   1> "basemodel"
+    - type: "basemodel"
+      # name of python module; string type;
+      # example: basemodel.py has a BaseModel class whose alias is "gen" for this benchmarking;
+      name: "gen"
+      # the url address of python module; string type;
+      url: "./examples/llm_simple_qa/testalgorithms/gen/basemodel.py"
\ No newline at end of file
diff --git a/examples/llm_simple_qa/testalgorithms/gen/op_eval.py b/examples/llm_simple_qa/testalgorithms/gen/op_eval.py
new file mode 100644
index 00000000..dc6d9c04
--- /dev/null
+++ b/examples/llm_simple_qa/testalgorithms/gen/op_eval.py
@@ -0,0 +1,21 @@
+from mmengine.config import read_base
+from opencompass.models import HuggingFacewithChatTemplate
+# import sys
+# sys.path.append('/home/icyfeather/project/ianvs')
+
+with read_base():
+    from core.op_extra.datasets.cmmlu.cmmlu_gen import cmmlu_datasets
+
+datasets = [*cmmlu_datasets]
+
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='qwen1.5-1.8b-chat-hf',
+        path='/home/icyfeather/models/Qwen1.5-1.8B-Chat',
+        max_out_len=1024,
+        batch_size=2,
+        run_cfg=dict(num_gpus=1),
+        stop_words=['<|im_end|>', '<|im_start|>'],
+    )
+]
diff --git a/examples/llm_simple_qa/testenv/acc.py b/examples/llm_simple_qa/testenv/acc.py
new file mode 100644
index 00000000..beccdadf
--- /dev/null
+++ b/examples/llm_simple_qa/testenv/acc.py
@@ -0,0 +1,40 @@
+# Copyright 2022 The KubeEdge Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sedna.common.class_factory import ClassType, ClassFactory
+
+__all__ = ["acc"]
+
+def get_last_letter(input_string):
+    # return the last uppercase choice letter (A-D) in the string, or None
+    if not input_string or not any(char.isalpha() for char in input_string):
+        return None
+
+    for char in reversed(input_string):
+        if 'A' <= char <= 'D':
+            return char
+
+    return None
+
+
+@ClassFactory.register(ClassType.GENERAL, alias="acc")
+def acc(y_true, y_pred):
+    y_pred = [get_last_letter(pred) for pred in y_pred]
+
+    same_elements = [y_pred[i] == y_true[i] for i in range(len(y_pred))]
+
+    acc = sum(same_elements) / len(same_elements)
+
+    return acc
diff --git a/examples/llm_simple_qa/testenv/testenv.yaml b/examples/llm_simple_qa/testenv/testenv.yaml
new file mode 100644
index 00000000..0bc7239f
--- /dev/null
+++ b/examples/llm_simple_qa/testenv/testenv.yaml
@@ -0,0 +1,14 @@
+testenv:
+  # dataset configuration
+  dataset:
+    # the url address of the train dataset file; string type;
+    train_data: "/home/icyfeather/Projects/ianvs/dataset/llm_simple_qa/train_data/data.jsonl"
+    # the url address of the test dataset file; string type;
+    test_data: "/home/icyfeather/Projects/ianvs/dataset/llm_simple_qa/test_data/data.jsonl"
+
+  # metrics configuration for test case's evaluation; list type;
+  metrics:
+    # metric name; string type;
+    - name: "acc"
+      # the url address of python file
+      url: "./examples/llm_simple_qa/testenv/acc.py"
diff --git a/examples/resources/sedna-llm.zip b/examples/resources/sedna-llm.zip
new file mode 100644
index 00000000..8ea3c0d3
Binary files /dev/null and b/examples/resources/sedna-llm.zip differ