
Commit ab37ba6 (1 parent: e74a0d6)

fixed a split list

Signed-off-by: dafnapension <[email protected]>

File tree

  prepare/cards/chart_qa.py (+1 -1)
  prepare/cards/doc_vqa.py (+33)
  prepare/cards/doc_vqa_lmms.py (deleted, -43)
  src/unitxt/catalog/cards/chart_qa_lmms_eval.json (-2)
  src/unitxt/catalog/cards/doc_vqa/lmms_eval.json (+4)

5 files changed: +38 -46 lines changed

prepare/cards/chart_qa.py (+1 -1)

@@ -45,7 +45,7 @@
 
 
 card = TaskCard(
-    loader=LoadHF(path="lmms-lab/ChartQA", splits=["train", "val", "test"]),
+    loader=LoadHF(path="lmms-lab/ChartQA", splits=["test"]),
     preprocess_steps=[
         Wrap(field="answer", inside="list", to_field="answers"),
         ToImage(field="image", to_field="context"),
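The narrowed `splits` argument means the loader only materializes the splits it is asked for, so train/val are no longer downloaded at all. A minimal sketch of how to check this, assuming a unitxt version where a loader can be invoked directly as a source operator (the exact call pattern is an assumption, not guaranteed API):

# Sketch: inspect which splits the updated loader exposes.
# Invoking the loader as a source operator is illustrative; the call
# pattern may differ across unitxt versions.
from unitxt.loaders import LoadHF

loader = LoadHF(path="lmms-lab/ChartQA", splits=["test"])
multi_stream = loader()  # yields a split-name -> stream mapping
print(list(multi_stream.keys()))  # expected: ['test'] only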

prepare/cards/doc_vqa.py (+33)

@@ -3,6 +3,7 @@
 from unitxt.collections_operators import Explode, Wrap
 from unitxt.image_operators import ToImage
 from unitxt.operators import Copy
+from unitxt.splitters import RenameSplits
 from unitxt.templates import MultiReferenceTemplate
 from unitxt.test_utils.card import test_card
 

@@ -47,3 +48,35 @@
 
     test_card(card)
     add_to_catalog(card, f"cards.doc_vqa.{language}", overwrite=True)
+
+
+card = TaskCard(
+    loader=LoadHF(
+        path="lmms-lab/DocVQA",
+        name="DocVQA",
+        data_classification_policy=["public"],
+        splits=["test", "validation"],
+    ),
+    preprocess_steps=[
+        RenameSplits(mapper={"validation": "test"}),
+        ToImage(field="image", to_field="context"),
+        Set(fields={"context_type": "image"}),
+    ],
+    task="tasks.qa.with_context.abstractive[metrics=[metrics.anls]]",
+    templates=[template, *templates.items],
+    default_template=template,
+    __tags__={
+        "license": "apache-2.0",
+        "multilinguality": "monolingual",
+        "modalities": ["image", "text"],
+        "size_categories": "10K<n<100K",
+        "task_categories": "question-answering",
+        "task_ids": "extractive-qa",
+    },
+    __description__=(
+        "The doc-vqa Dataset integrates images from the Infographic_vqa dataset sourced from HuggingFaceM4 The Cauldron dataset, as well as images from the dataset AFTDB (Arxiv Figure Table Database) curated by cmarkea. This dataset consists of pairs of images and corresponding text, with each image linked to an average of five questions and answers available in both English and French. These questions and answers were generated using Gemini 1.5 Pro, thereby rendering the dataset well-suited for multimodal tasks involving image-text pairing and multilingual question answering."
+    ),
+)
+
+test_card(card)
+add_to_catalog(card, "cards.doc_vqa.lmms_eval", overwrite=True)
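The new `RenameSplits(mapper={"validation": "test"})` step relabels streams before the rest of the pipeline runs, so the card's consumers see the validation data under the name "test". A plain-dict sketch of the mapper semantics (illustrative only; the real operator works on lazy unitxt streams, not dicts):

# Illustrative sketch of RenameSplits' mapper semantics: streams named in
# the mapper are relabeled, all others keep their original names.
def rename_splits_sketch(streams, mapper):
    return {mapper.get(name, name): stream for name, stream in streams.items()}

print(rename_splits_sketch({"validation": [{"q": "..."}]}, {"validation": "test"}))
# {'test': [{'q': '...'}]}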

prepare/cards/doc_vqa_lmms.py (-43)

This file was deleted.

src/unitxt/catalog/cards/chart_qa_lmms_eval.json (-2)

@@ -4,8 +4,6 @@
     "__type__": "load_hf",
     "path": "lmms-lab/ChartQA",
     "splits": [
-      "train",
-      "val",
       "test"
     ]
   },
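One way to confirm the regenerated catalog entry matches the card definition is to fetch it back out of the local catalog. A sketch assuming unitxt's fetch_artifact helper (its exact return shape may vary by version):

# Sketch: pull the card back from the catalog and check the loader's splits.
from unitxt.artifact import fetch_artifact

card, _ = fetch_artifact("cards.chart_qa_lmms_eval")
print(card.loader.splits)  # expected: ['test']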

src/unitxt/catalog/cards/doc_vqa/lmms_eval.json (+4)

@@ -6,6 +6,10 @@
     "name": "DocVQA",
     "data_classification_policy": [
       "public"
+    ],
+    "splits": [
+      "test",
+      "validation"
     ]
   },
   "preprocess_steps": [
