
Commit ab37ba6 (1 parent: e74a0d6)

fixed a split list

Signed-off-by: dafnapension <[email protected]>

File tree

  prepare/cards/chart_qa.py (+1 -1)
  prepare/cards/doc_vqa.py (+33)
  prepare/cards/doc_vqa_lmms.py (deleted, -43)
  src/unitxt/catalog/cards/chart_qa_lmms_eval.json (-2)
  src/unitxt/catalog/cards/doc_vqa/lmms_eval.json (+4)

5 files changed: +38 -46 lines changed

prepare/cards/chart_qa.py (+1 -1)

@@ -45,7 +45,7 @@
 
 
 card = TaskCard(
-    loader=LoadHF(path="lmms-lab/ChartQA", splits=["train", "val", "test"]),
+    loader=LoadHF(path="lmms-lab/ChartQA", splits=["test"]),
     preprocess_steps=[
         Wrap(field="answer", inside="list", to_field="answers"),
         ToImage(field="image", to_field="context"),
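The narrowed `splits` argument means the loader only materializes the splits it is asked for, so train/val are no longer downloaded at all. A minimal sketch of how to check this, assuming a unitxt version where a loader can be invoked directly as a source operator (the exact call pattern is an assumption, not guaranteed API):

# Sketch: inspect which splits the updated loader exposes.
# Invoking the loader as a source operator is illustrative; the call
# pattern may differ across unitxt versions.
from unitxt.loaders import LoadHF

loader = LoadHF(path="lmms-lab/ChartQA", splits=["test"])
multi_stream = loader()  # yields a split-name -> stream mapping
print(list(multi_stream.keys()))  # expected: ['test'] only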

prepare/cards/doc_vqa.py (+33)

@@ -3,6 +3,7 @@
 from unitxt.collections_operators import Explode, Wrap
 from unitxt.image_operators import ToImage
 from unitxt.operators import Copy
+from unitxt.splitters import RenameSplits
 from unitxt.templates import MultiReferenceTemplate
 from unitxt.test_utils.card import test_card
 

@@ -47,3 +48,35 @@
 
     test_card(card)
     add_to_catalog(card, f"cards.doc_vqa.{language}", overwrite=True)
+
+
+card = TaskCard(
+    loader=LoadHF(
+        path="lmms-lab/DocVQA",
+        name="DocVQA",
+        data_classification_policy=["public"],
+        splits=["test", "validation"],
+    ),
+    preprocess_steps=[
+        RenameSplits(mapper={"validation": "test"}),
+        ToImage(field="image", to_field="context"),
+        Set(fields={"context_type": "image"}),
+    ],
+    task="tasks.qa.with_context.abstractive[metrics=[metrics.anls]]",
+    templates=[template, *templates.items],
+    default_template=template,
+    __tags__={
+        "license": "apache-2.0",
+        "multilinguality": "monolingual",
+        "modalities": ["image", "text"],
+        "size_categories": "10K<n<100K",
+        "task_categories": "question-answering",
+        "task_ids": "extractive-qa",
+    },
+    __description__=(
+        "The doc-vqa Dataset integrates images from the Infographic_vqa dataset sourced from HuggingFaceM4 The Cauldron dataset, as well as images from the dataset AFTDB (Arxiv Figure Table Database) curated by cmarkea. This dataset consists of pairs of images and corresponding text, with each image linked to an average of five questions and answers available in both English and French. These questions and answers were generated using Gemini 1.5 Pro, thereby rendering the dataset well-suited for multimodal tasks involving image-text pairing and multilingual question answering."
+    ),
+)
+
+test_card(card)
+add_to_catalog(card, "cards.doc_vqa.lmms_eval", overwrite=True)
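The new `RenameSplits(mapper={"validation": "test"})` step relabels streams before the rest of the pipeline runs, so the card's consumers see the validation data under the name "test". A plain-dict sketch of the mapper semantics (illustrative only; the real operator works on lazy unitxt streams, not dicts):

# Illustrative sketch of RenameSplits' mapper semantics: streams named in
# the mapper are relabeled, all others keep their original names.
def rename_splits_sketch(streams, mapper):
    return {mapper.get(name, name): stream for name, stream in streams.items()}

print(rename_splits_sketch({"validation": [{"q": "..."}]}, {"validation": "test"}))
# {'test': [{'q': '...'}]}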

prepare/cards/doc_vqa_lmms.py (-43)

This file was deleted.

src/unitxt/catalog/cards/chart_qa_lmms_eval.json (-2)

@@ -4,8 +4,6 @@
     "__type__": "load_hf",
     "path": "lmms-lab/ChartQA",
     "splits": [
-      "train",
-      "val",
       "test"
     ]
   },
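One way to confirm the regenerated catalog entry matches the card definition is to fetch it back out of the local catalog. A sketch assuming unitxt's fetch_artifact helper (its exact return shape may vary by version):

# Sketch: pull the card back from the catalog and check the loader's splits.
from unitxt.artifact import fetch_artifact

card, _ = fetch_artifact("cards.chart_qa_lmms_eval")
print(card.loader.splits)  # expected: ['test']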

src/unitxt/catalog/cards/doc_vqa/lmms_eval.json (+4)

@@ -6,6 +6,10 @@
     "name": "DocVQA",
     "data_classification_policy": [
       "public"
+    ],
+    "splits": [
+      "test",
+      "validation"
     ]
   },
   "preprocess_steps": [
