Skip to content

Commit 67103ef

Browse files
authored
Merge branch 'main' into llm-judge-prepare
2 parents 6fa0ca5 + 5198d89 commit 67103ef

File tree

80 files changed

+962
-327
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

80 files changed

+962
-327
lines changed

.github/workflows/catalog_preparation.yml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,15 @@ jobs:
3030
- uses: actions/setup-python@v5
3131
with:
3232
python-version: '3.9'
33+
cache: 'pip'
3334

35+
- run: echo "blis==0" > constraints.txt
3436
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
35-
- run: uv pip install --system ".[tests]"
37+
- run: uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu
38+
- run: uv pip install --system -c constraints.txt -e ".[tests]"
39+
- run: |
40+
pip install --only-binary :all: spacy
41+
3642
- name: Hugging Face Login
3743
run: |
3844
for i in {1..5}; do

.github/workflows/library_eager_execution_tests.yml

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,10 +31,16 @@ jobs:
3131
- uses: actions/setup-python@v5
3232
with:
3333
python-version: '3.9'
34-
# cache: 'pip' # caching pip dependencies
34+
cache: 'pip'
35+
36+
- run: echo "blis==0" > constraints.txt
3537
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
36-
- run: uv pip install --system ".[tests]"
37-
- run: uv pip install --system coverage[toml]
38+
- run: uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu
39+
- run: uv pip install --system -c constraints.txt -e ".[tests]"
40+
- run: |
41+
pip install --only-binary :all: spacy
42+
43+
- run: pip install coverage[toml]
3844

3945
- name: Run Tests
4046
run: coverage run --omit=*/preparation -m unittest discover -s tests/library -p "test_*.py"

.github/workflows/library_tests.yml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,15 @@ jobs:
3030
- uses: actions/setup-python@v5
3131
with:
3232
python-version: '3.9'
33+
cache: 'pip'
3334

35+
- run: echo "blis==0" > constraints.txt
3436
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
35-
- run: uv pip install --system -e ".[tests]"
37+
- run: uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu
38+
- run: uv pip install --system -c constraints.txt -e ".[tests]"
39+
- run: |
40+
pip install --only-binary :all: spacy
41+
3642
- run: pip install coverage[toml]
3743

3844
- name: Run Tests

.github/workflows/test_helm.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ jobs:
2424
python-version: '3.9'
2525

2626
- run: curl -LsSf https://astral.sh/uv/install.sh | sh
27+
- run: uv pip install --upgrade --system "diskcache"
2728
- run: uv pip install --upgrade --system "crfm-helm[unitxt]>=0.5.3"
2829
- run: uv pip install --system "scikit-learn==1.5.2"
2930

prepare/benchmarks/llama_vision.py

Lines changed: 6 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,45 +1,28 @@
11
from unitxt.benchmark import Benchmark
22
from unitxt.catalog import add_to_catalog
33
from unitxt.standard import DatasetRecipe
4-
from unitxt.templates import MultipleChoiceTemplate, MultiReferenceTemplate
5-
6-
ai2d_template = MultipleChoiceTemplate(
7-
input_format="{context} Look at the scientific diagram carefully and answer the following question: {question}\n{choices}\nRespond only with the correct option digit.",
8-
choices_separator="\n",
9-
target_field="answer",
10-
enumerator="capitals",
11-
)
12-
doc_vqa_template = MultiReferenceTemplate(
13-
input_format="{context} Read the text in the image carefully and answer the question with the text as seen exactly in the image."
14-
" For yes/no questions, just respond Yes or No. If the answer is numeric, just respond with the number and nothing else. "
15-
"If the answer has multiple words, just respond with the words and absolutely nothing else. Never respond in a sentence or a phrase.\n Question: {question}",
16-
references_field="answers",
17-
)
18-
chart_qa_template = MultiReferenceTemplate(
19-
input_format="{context} {question}\nAnswer the question with a single word.",
20-
references_field="answers",
21-
__description__="lmms-evals default template for chartqa.",
22-
)
234

245
benchmark = Benchmark(
256
subsets={
267
"doc_vqa": DatasetRecipe(
278
card="cards.doc_vqa.lmms_eval",
28-
template=doc_vqa_template,
9+
template="templates.qa.llama_vision.with_context.doc_vqa",
2910
format="formats.chat_api",
3011
),
3112
"info_vqa": DatasetRecipe(
3213
card="cards.info_vqa_lmms_eval",
33-
template=doc_vqa_template,
14+
template="templates.qa.llama_vision.with_context.info_vqa",
3415
format="formats.chat_api",
3516
),
3617
"chart_qa": DatasetRecipe(
3718
card="cards.chart_qa_lmms_eval",
38-
template=chart_qa_template,
19+
template="templates.qa.llama_vision.with_context.chart_qa",
3920
format="formats.chat_api",
4021
),
4122
"ai2d": DatasetRecipe(
42-
card="cards.ai2d", template=ai2d_template, format="formats.chat_api"
23+
card="cards.ai2d",
24+
template="templates.qa.llama_vision.multiple_choice.with_context.ai2d",
25+
format="formats.chat_api"
4326
),
4427
},
4528
)

prepare/benchmarks/vision_full.py

Lines changed: 4 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,7 @@
11
from unitxt.benchmark import Benchmark
22
from unitxt.catalog import add_to_catalog
33
from unitxt.standard import DatasetRecipe
4-
from unitxt.templates import MultipleChoiceTemplate, MultiReferenceTemplate
54

6-
ai2d_llama_vision_template = MultipleChoiceTemplate(
7-
input_format="{context} Look at the scientific diagram carefully and answer the following question: {question}\n{choices}\nRespond only with the correct option digit.",
8-
choices_separator="\n",
9-
target_field="answer",
10-
enumerator="capitals",
11-
)
12-
doc_vqa_llama_vision_template = MultiReferenceTemplate(
13-
input_format="{context} Read the text in the image carefully and answer the question with the text as seen exactly in the image."
14-
" For yes/no questions, just respond Yes or No. If the answer is numeric, just respond with the number and nothing else. "
15-
"If the answer has multiple words, just respond with the words and absolutely nothing else. Never respond in a sentence or a phrase.\n Question: {question}",
16-
references_field="answers",
17-
)
18-
chart_qa_llama_vision_template = MultiReferenceTemplate(
19-
input_format="{context} {question}\nAnswer the question with a single word.",
20-
references_field="answers",
21-
__description__="lmms-evals default template for chartqa.",
22-
)
235
benchmark = Benchmark(
246
subsets={
257
"doc_vqa_default": DatasetRecipe(
@@ -39,22 +21,22 @@
3921
),
4022
"doc_vqa_llama_vision_template": DatasetRecipe(
4123
card="cards.doc_vqa.lmms_eval",
42-
template=doc_vqa_llama_vision_template,
24+
template="templates.qa.llama_vision.with_context.doc_vqa",
4325
format="formats.chat_api",
4426
),
4527
"info_vqa_llama_vision_template": DatasetRecipe(
4628
card="cards.info_vqa_lmms_eval",
47-
template=doc_vqa_llama_vision_template,
29+
template="templates.qa.llama_vision.with_context.info_vqa",
4830
format="formats.chat_api",
4931
),
5032
"chart_qa_llama_vision_template": DatasetRecipe(
5133
card="cards.chart_qa_lmms_eval",
52-
template=chart_qa_llama_vision_template,
34+
template="templates.qa.llama_vision.with_context.chart_qa",
5335
format="formats.chat_api",
5436
),
5537
"ai2d_llama_vision_template": DatasetRecipe(
5638
card="cards.ai2d",
57-
template=ai2d_llama_vision_template,
39+
template="templates.qa.llama_vision.multiple_choice.with_context.ai2d",
5840
format="formats.chat_api",
5941
),
6042
},

prepare/cards/ai2d.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,9 @@
33
from unitxt.catalog import add_to_catalog
44
from unitxt.image_operators import ToImage
55
from unitxt.operators import Cast, Rename, Shuffle
6-
from unitxt.templates import MultipleChoiceTemplate
76
from unitxt.test_utils.card import test_card
87

98
templates = get_from_catalog("templates.qa.multiple_choice.with_context.no_intro.all")
10-
template = MultipleChoiceTemplate(
11-
input_format="{context}\n{question}\n{choices}\nAnswer with the option's letter from the given choices directly.",
12-
choices_separator="\n",
13-
target_field="answer",
14-
enumerator="capitals",
15-
)
169

1710
card = TaskCard(
1811
loader=LoadHF(path="lmms-lab/ai2d"),
@@ -24,7 +17,7 @@
2417
Cast(field="answer", to="int"),
2518
],
2619
task="tasks.qa.multiple_choice.with_context[metrics=[metrics.exact_match_mm]]",
27-
templates=[template, *templates.items],
20+
templates=["templates.qa.multiple_choice.with_context.ai2d", *templates.items],
2821
__tags__={},
2922
__description__=(
3023
"AI2 Diagrams (AI2D) is a dataset of over 5000 grade school science diagrams with over 150000 rich annotations, their ground truth syntactic parses, and more than 15000 corresponding multiple choice questions."

prepare/cards/chart_qa.py

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,15 +5,9 @@
55
from unitxt.image_operators import ToImage
66
from unitxt.operators import Rename, Shuffle
77
from unitxt.splitters import RenameSplits
8-
from unitxt.templates import MultiReferenceTemplate
98
from unitxt.test_utils.card import test_card
109

1110
templates = get_from_catalog("templates.qa.with_context.all")
12-
template = MultiReferenceTemplate(
13-
input_format="{context}\n{question}\nAnswer the question using a single word.",
14-
references_field="answers",
15-
__description__="lmms-evals default template for chartqa.",
16-
)
1711

1812
card = TaskCard(
1913
loader=LoadHF(path="HuggingFaceM4/ChartQA"),
@@ -26,7 +20,7 @@
2620
Set(fields={"context_type": "image"}),
2721
],
2822
task="tasks.qa.with_context",
29-
templates=[template, *templates.items],
23+
templates=["templates.qa.with_context.chart_qa", *templates.items],
3024
__tags__={
3125
"license": "GPL-3.0",
3226
"multilinguality": "monolingual",
@@ -53,7 +47,7 @@
5347
Set(fields={"context_type": "image"}),
5448
],
5549
task="tasks.qa.with_context.with_type[metrics=[metrics.relaxed_correctness]]",
56-
templates=[template, *templates.items],
50+
templates=["templates.qa.with_context.chart_qa", *templates.items],
5751
__tags__={
5852
"license": "GPL-3.0",
5953
"multilinguality": "monolingual",

prepare/cards/cola.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from unitxt.test_utils.card import test_card
1010

1111
card = TaskCard(
12-
loader=LoadHF(path="glue", name="cola"),
12+
loader=LoadHF(path="nyu-mll/glue", name="cola"),
1313
preprocess_steps=[
1414
"splitters.small_no_test",
1515
MapInstanceValues(mappers={"label": {"0": "unacceptable", "1": "acceptable"}}),

prepare/cards/doc_vqa.py

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,9 @@
44
from unitxt.image_operators import ToImage
55
from unitxt.operators import Copy, Shuffle
66
from unitxt.splitters import RenameSplits
7-
from unitxt.templates import MultiReferenceTemplate
87
from unitxt.test_utils.card import test_card
98

109
templates = get_from_catalog("templates.qa.with_context.all")
11-
template = MultiReferenceTemplate(
12-
input_format="{context}\n{question}\nAnswer the question using a single word or phrase.",
13-
references_field="answers",
14-
__description__="lmms-evals default template for docvqa.",
15-
)
1610

1711
for language in ["en", "fr"]:
1812
card = TaskCard(
@@ -28,7 +22,7 @@
2822
Set(fields={"context_type": "image"}),
2923
],
3024
task="tasks.qa.with_context.abstractive[metrics=[metrics.anls]]",
31-
templates=[template, *templates.items],
25+
templates=["templates.qa.with_context.doc_vqa", *templates.items],
3226
__tags__={
3327
"license": "apache-2.0",
3428
"multilinguality": "monolingual",
@@ -57,7 +51,7 @@
5751
Set(fields={"context_type": "image"}),
5852
],
5953
task="tasks.qa.with_context.abstractive[metrics=[metrics.anls]]",
60-
templates=[template, *templates.items],
54+
templates=["templates.qa.with_context.doc_vqa", *templates.items],
6155
__tags__={
6256
"license": "apache-2.0",
6357
"multilinguality": "monolingual",

0 commit comments

Comments
 (0)