IBM
diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
Lines changed: 7 additions & 1 deletion
diff --git a/.github/workflows/library_eager_execution_tests.yml b/.github/workflows/library_eager_execution_tests.yml
Lines changed: 9 additions & 3 deletions
diff --git a/.github/workflows/library_tests.yml b/.github/workflows/library_tests.yml
Lines changed: 7 additions & 1 deletion
diff --git a/.github/workflows/test_helm.yml b/.github/workflows/test_helm.yml
Lines changed: 1 addition & 0 deletions
diff --git a/prepare/benchmarks/llama_vision.py b/prepare/benchmarks/llama_vision.py
Lines changed: 6 additions & 23 deletions
diff --git a/prepare/benchmarks/vision_full.py b/prepare/benchmarks/vision_full.py
Lines changed: 4 additions & 22 deletions
diff --git a/prepare/cards/ai2d.py b/prepare/cards/ai2d.py
Lines changed: 1 addition & 8 deletions
diff --git a/prepare/cards/chart_qa.py b/prepare/cards/chart_qa.py
Lines changed: 2 additions & 8 deletions
diff --git a/prepare/cards/cola.py b/prepare/cards/cola.py
Lines changed: 1 addition & 1 deletion
diff --git a/prepare/cards/doc_vqa.py b/prepare/cards/doc_vqa.py
Lines changed: 2 additions & 8 deletions
@@ -30,9 +30,15 @@ jobs:
     - uses: actions/setup-python@v5
       with:
         python-version: '3.9'
+        cache: 'pip'
 
+    - run: echo "blis==0" > constraints.txt
     - run: curl -LsSf https://astral.sh/uv/install.sh | sh
-    - run: uv pip install --system ".[tests]"
+    - run: uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu
+    - run: uv pip install --system -c constraints.txt -e ".[tests]"
+    - run: |
+        pip install --only-binary :all: spacy
+
     - name:  Hugging Face Login
       run: |
         for i in {1..5}; do
 
@@ -31,10 +31,16 @@ jobs:
     - uses: actions/setup-python@v5
       with:
         python-version: '3.9'
-        # cache: 'pip' # caching pip dependencies
+        cache: 'pip'
+
+    - run: echo "blis==0" > constraints.txt
     - run: curl -LsSf https://astral.sh/uv/install.sh | sh
-    - run: uv pip install --system ".[tests]"
-    - run: uv pip install --system coverage[toml]
+    - run: uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu
+    - run: uv pip install --system -c constraints.txt -e ".[tests]"
+    - run: |
+        pip install --only-binary :all: spacy
+
+    - run: pip install coverage[toml]
 
     - name: Run Tests
       run:  coverage run --omit=*/preparation -m unittest discover -s tests/library -p "test_*.py"
@@ -30,9 +30,15 @@ jobs:
     - uses: actions/setup-python@v5
       with:
         python-version: '3.9'
+        cache: 'pip'
 
+    - run: echo "blis==0" > constraints.txt
     - run: curl -LsSf https://astral.sh/uv/install.sh | sh
-    - run: uv pip install --system -e ".[tests]"
+    - run: uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu
+    - run: uv pip install --system -c constraints.txt -e ".[tests]"
+    - run: |
+        pip install --only-binary :all: spacy
+
     - run: pip install coverage[toml]
 
     - name: Run Tests
 
@@ -24,6 +24,7 @@ jobs:
           python-version: '3.9'
 
       - run: curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run: uv pip install --upgrade --system "diskcache"
       - run: uv pip install --upgrade --system "crfm-helm[unitxt]>=0.5.3"
       - run: uv pip install --system "scikit-learn==1.5.2"
 
 
@@ -1,45 +1,28 @@
 from unitxt.benchmark import Benchmark
 from unitxt.catalog import add_to_catalog
 from unitxt.standard import DatasetRecipe
-from unitxt.templates import MultipleChoiceTemplate, MultiReferenceTemplate
-
-ai2d_template = MultipleChoiceTemplate(
-    input_format="{context} Look at the scientific diagram carefully and answer the following question: {question}\n{choices}\nRespond only with the correct option digit.",
-    choices_separator="\n",
-    target_field="answer",
-    enumerator="capitals",
-)
-doc_vqa_template = MultiReferenceTemplate(
-    input_format="{context} Read the text in the image carefully and answer the question with the text as seen exactly in the image."
-    " For yes/no questions, just respond Yes or No. If the answer is numeric, just respond with the number and nothing else. "
-    "If the answer has multiple words, just respond with the words and absolutely nothing else. Never respond in a sentence or a phrase.\n Question: {question}",
-    references_field="answers",
-)
-chart_qa_template = MultiReferenceTemplate(
-    input_format="{context} {question}\nAnswer the question with a single word.",
-    references_field="answers",
-    __description__="lmms-evals default template for chartqa.",
-)
 
 benchmark = Benchmark(
     subsets={
         "doc_vqa": DatasetRecipe(
             card="cards.doc_vqa.lmms_eval",
-            template=doc_vqa_template,
+            template="templates.qa.llama_vision.with_context.doc_vqa",
             format="formats.chat_api",
         ),
         "info_vqa": DatasetRecipe(
             card="cards.info_vqa_lmms_eval",
-            template=doc_vqa_template,
+            template="templates.qa.llama_vision.with_context.info_vqa",
             format="formats.chat_api",
         ),
         "chart_qa": DatasetRecipe(
             card="cards.chart_qa_lmms_eval",
-            template=chart_qa_template,
+            template="templates.qa.llama_vision.with_context.chart_qa",
             format="formats.chat_api",
         ),
         "ai2d": DatasetRecipe(
-            card="cards.ai2d", template=ai2d_template, format="formats.chat_api"
+            card="cards.ai2d",
+            template="templates.qa.llama_vision.multiple_choice.with_context.ai2d",
+            format="formats.chat_api"
         ),
     },
 )
 
@@ -1,25 +1,7 @@
 from unitxt.benchmark import Benchmark
 from unitxt.catalog import add_to_catalog
 from unitxt.standard import DatasetRecipe
-from unitxt.templates import MultipleChoiceTemplate, MultiReferenceTemplate
 
-ai2d_llama_vision_template = MultipleChoiceTemplate(
-    input_format="{context} Look at the scientific diagram carefully and answer the following question: {question}\n{choices}\nRespond only with the correct option digit.",
-    choices_separator="\n",
-    target_field="answer",
-    enumerator="capitals",
-)
-doc_vqa_llama_vision_template = MultiReferenceTemplate(
-    input_format="{context} Read the text in the image carefully and answer the question with the text as seen exactly in the image."
-                 " For yes/no questions, just respond Yes or No. If the answer is numeric, just respond with the number and nothing else. "
-                 "If the answer has multiple words, just respond with the words and absolutely nothing else. Never respond in a sentence or a phrase.\n Question: {question}",
-    references_field="answers",
-)
-chart_qa_llama_vision_template = MultiReferenceTemplate(
-    input_format="{context} {question}\nAnswer the question with a single word.",
-    references_field="answers",
-    __description__="lmms-evals default template for chartqa.",
-)
 benchmark = Benchmark(
     subsets={
         "doc_vqa_default": DatasetRecipe(
@@ -39,22 +21,22 @@
         ),
         "doc_vqa_llama_vision_template": DatasetRecipe(
             card="cards.doc_vqa.lmms_eval",
-            template=doc_vqa_llama_vision_template,
+            template="templates.qa.llama_vision.with_context.doc_vqa",
             format="formats.chat_api",
         ),
         "info_vqa_llama_vision_template": DatasetRecipe(
             card="cards.info_vqa_lmms_eval",
-            template=doc_vqa_llama_vision_template,
+            template="templates.qa.llama_vision.with_context.info_vqa",
             format="formats.chat_api",
         ),
         "chart_qa_llama_vision_template": DatasetRecipe(
             card="cards.chart_qa_lmms_eval",
-            template=chart_qa_llama_vision_template,
+            template="templates.qa.llama_vision.with_context.chart_qa",
             format="formats.chat_api",
         ),
         "ai2d_llama_vision_template": DatasetRecipe(
             card="cards.ai2d",
-            template=ai2d_llama_vision_template,
+            template="templates.qa.llama_vision.multiple_choice.with_context.ai2d",
             format="formats.chat_api",
         ),
     },
 
@@ -3,16 +3,9 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.image_operators import ToImage
 from unitxt.operators import Cast, Rename, Shuffle
-from unitxt.templates import MultipleChoiceTemplate
 from unitxt.test_utils.card import test_card
 
 templates = get_from_catalog("templates.qa.multiple_choice.with_context.no_intro.all")
-template = MultipleChoiceTemplate(
-    input_format="{context}\n{question}\n{choices}\nAnswer with the option's letter from the given choices directly.",
-    choices_separator="\n",
-    target_field="answer",
-    enumerator="capitals",
-)
 
 card = TaskCard(
     loader=LoadHF(path="lmms-lab/ai2d"),
@@ -24,7 +17,7 @@
         Cast(field="answer", to="int"),
     ],
     task="tasks.qa.multiple_choice.with_context[metrics=[metrics.exact_match_mm]]",
-    templates=[template, *templates.items],
+    templates=["templates.qa.multiple_choice.with_context.ai2d", *templates.items],
     __tags__={},
     __description__=(
         "AI2 Diagrams (AI2D) is a dataset of over 5000 grade school science diagrams with over 150000 rich annotations, their ground truth syntactic parses, and more than 15000 corresponding multiple choice questions."
 
@@ -5,15 +5,9 @@
 from unitxt.image_operators import ToImage
 from unitxt.operators import Rename, Shuffle
 from unitxt.splitters import RenameSplits
-from unitxt.templates import MultiReferenceTemplate
 from unitxt.test_utils.card import test_card
 
 templates = get_from_catalog("templates.qa.with_context.all")
-template = MultiReferenceTemplate(
-    input_format="{context}\n{question}\nAnswer the question using a single word.",
-    references_field="answers",
-    __description__="lmms-evals default template for chartqa.",
-)
 
 card = TaskCard(
     loader=LoadHF(path="HuggingFaceM4/ChartQA"),
@@ -26,7 +20,7 @@
         Set(fields={"context_type": "image"}),
     ],
     task="tasks.qa.with_context",
-    templates=[template, *templates.items],
+    templates=["templates.qa.with_context.chart_qa", *templates.items],
     __tags__={
         "license": "GPL-3.0",
         "multilinguality": "monolingual",
@@ -53,7 +47,7 @@
         Set(fields={"context_type": "image"}),
     ],
     task="tasks.qa.with_context.with_type[metrics=[metrics.relaxed_correctness]]",
-    templates=[template, *templates.items],
+    templates=["templates.qa.with_context.chart_qa", *templates.items],
     __tags__={
         "license": "GPL-3.0",
         "multilinguality": "monolingual",
 
@@ -9,7 +9,7 @@
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
-    loader=LoadHF(path="glue", name="cola"),
+    loader=LoadHF(path="nyu-mll/glue", name="cola"),
     preprocess_steps=[
         "splitters.small_no_test",
         MapInstanceValues(mappers={"label": {"0": "unacceptable", "1": "acceptable"}}),
 
@@ -4,15 +4,9 @@
 from unitxt.image_operators import ToImage
 from unitxt.operators import Copy, Shuffle
 from unitxt.splitters import RenameSplits
-from unitxt.templates import MultiReferenceTemplate
 from unitxt.test_utils.card import test_card
 
 templates = get_from_catalog("templates.qa.with_context.all")
-template = MultiReferenceTemplate(
-    input_format="{context}\n{question}\nAnswer the question using a single word or phrase.",
-    references_field="answers",
-    __description__="lmms-evals default template for docvqa.",
-)
 
 for language in ["en", "fr"]:
     card = TaskCard(
@@ -28,7 +22,7 @@
             Set(fields={"context_type": "image"}),
         ],
         task="tasks.qa.with_context.abstractive[metrics=[metrics.anls]]",
-        templates=[template, *templates.items],
+        templates=["templates.qa.with_context.doc_vqa", *templates.items],
         __tags__={
             "license": "apache-2.0",
             "multilinguality": "monolingual",
@@ -57,7 +51,7 @@
         Set(fields={"context_type": "image"}),
     ],
     task="tasks.qa.with_context.abstractive[metrics=[metrics.anls]]",
-    templates=[template, *templates.items],
+    templates=["templates.qa.with_context.doc_vqa", *templates.items],
     __tags__={
         "license": "apache-2.0",
         "multilinguality": "monolingual",