@@ -21,7 +21,7 @@ In the dynamic landscape of generative NLP, traditional text processing pipeline
![license](https://img.shields.io/github/license/ibm/unitxt)
![python](https://img.shields.io/badge/python-3.8%20|%203.9-blue)
![tests](https://img.shields.io/github/actions/workflow/status/ibm/unitxt/library_tests.yml?branch=main&label=tests)
-[![codecov](https://codecov.io/gh/IBM/unitxt/branch/main/graph/badge.svg?token=mlrWq9cwz3)](https://codecov.io/gh/IBM/unitxt)
+[![Coverage Status](https://coveralls.io/repos/github/IBM/unitxt/badge.svg)](https://coveralls.io/github/IBM/unitxt)
![Read the Docs](https://img.shields.io/readthedocs/unitxt)
[![downloads](https://static.pepy.tech/personalized-badge/unitxt?period=total&units=international_system&left_color=grey&right_color=green&left_text=downloads)](https://pepy.tech/project/unitxt)
@@ -48,80 +48,61 @@ Then launch the ui by running:
unitxt-explore
```

-# 🦄 Example
+# 🦄 Example

This is a simple example of running end-to-end evaluation in self-contained Python code over user data.

See more examples in the examples subdirectory.

```python
-from unitxt import get_logger
-from unitxt.api import evaluate, load_dataset
-from unitxt.blocks import Task, TaskCard
-from unitxt.inference import HFPipelineBasedInferenceEngine
-from unitxt.loaders import LoadFromDictionary
-from unitxt.templates import InputOutputTemplate, TemplatesDict
-from unitxt.text_utils import print_dict
-
-logger = get_logger()
-
-# Set up question answer pairs in a dictionary
-data = {
-    "test": [
-        {"question": "What is the capital of Texas?", "answer": "Austin"},
-        {"question": "What is the color of the sky?", "answer": "Blue"},
-    ]
-}
-
-card = TaskCard(
-    # Load the data from the dictionary. Data can be also loaded from HF, CSV files, COS and other sources using different loaders.
-    loader=LoadFromDictionary(data=data),
-    # Define the QA task input and output and metrics.
-    task=Task(
-        input_fields={"question": str},
-        reference_fields={"answer": str},
-        prediction_type=str,
-        metrics=["metrics.accuracy"],
-    ),
+# Import required components
+from unitxt import evaluate, create_dataset
+from unitxt.blocks import Task, InputOutputTemplate
+from unitxt.inference import HFAutoModelInferenceEngine
+
+# Question-answer dataset
+data = [
+    {"question": "What is the capital of Texas?", "answer": "Austin"},
+    {"question": "What is the color of the sky?", "answer": "Blue"},
+]
+
+# Define the task and evaluation metric
+task = Task(
+    input_fields={"question": str},
+    reference_fields={"answer": str},
+    prediction_type=str,
+    metrics=["metrics.accuracy"],
)

-# Create a simple template that formats the input.
-# Add lowercase normalization as a post processor on the model prediction.
-
+# Create a template to format inputs and outputs
template = InputOutputTemplate(
    instruction="Answer the following question.",
    input_format="{question}",
    output_format="{answer}",
    postprocessors=["processors.lower_case"],
)
-# Verbalize the dataset using the template
-dataset = load_dataset(card=card, template=template)
-test_dataset = dataset["test"]

+# Prepare the dataset
+dataset = create_dataset(
+    task=task,
+    template=template,
+    format="formats.chat_api",
+    test_set=data,
+    split="test",
+)

-# Infer using flan t5 base using HF API
-# can be replaced with any prediction code,
-# including the built in WMLInferenceEngine and OpenAiInferenceEngine.
-model_name = "google/flan-t5-base"
-inference_model = HFPipelineBasedInferenceEngine(
-    model_name=model_name, max_new_tokens=32
+# Set up the model (supports Hugging Face, WatsonX, OpenAI, etc.)
+model = HFAutoModelInferenceEngine(
+    model_name="Qwen/Qwen1.5-0.5B-Chat", max_new_tokens=32
)
-predictions = inference_model.infer(test_dataset)
-evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

-# Print results
-for instance in evaluated_dataset:
-    print_dict(
-        instance,
-        keys_to_print=[
-            "source",  # input to the model
-            "prediction",  # model prediction
-            "processed_prediction",  # model prediction after post processing
-            "references",  # reference answer
-            "score",  # scores (per instance and global)
-        ],
-    )
+# Generate predictions and evaluate
+predictions = model(dataset)
+results = evaluate(predictions=predictions, data=dataset)

+# Print results
+print("Global Results:\n", results.global_scores.summary)
+print("Instance Results:\n", results.instance_scores.summary)
```

# 🦄 Contributors