Revisit huggingface cache policy - BREAKING CHANGE #1564

Merged Feb 2, 2025 (35 commits)

Commits
All 35 commits are by elronbandel, authored between Jan 29 and Feb 2, 2025.

db2b74b  Jan 29  Revisit huggingface cache policy
c863ee7  Jan 29  Enable streaming for LoadFromHFSpace and clean up commented code
0e36f1a  Jan 29  Disable Hugging Face datasets cache in CatalogPreparationTestCase
e672ca1  Jan 29  Enable streaming for wiki_bio loader in TaskCard and update JSON conf…
81873b7  Jan 29  Merge branch 'main' into hf-cache
3d91e20  Jan 29  Add conditional test card execution for 'doqa_travel' subset in chat_…
0968633  Jan 29  Merge branch 'hf-cache' of https://github.com/IBM/unitxt into hf-cache
119d07e  Jan 29  Enhance memory and performance logging in catalog preparation tests
6b84f81  Jan 29  Return parallel execution to 1 and adjust modulo for deterministic te…
a43910c  Jan 30  Try 1
b5a5ff0  Jan 30  try 1 fixed
1a421af  Jan 30  trial 2
db75df8  Jan 30  Stop testing social iqa until problem resolved
412e90b  Jan 30  Update social iqa card to use specific revision and enable testing
f6e5388  Jan 30  Refactor translation card testing logic and remove unused dataset loa…
a0e7d0d  Jan 30  Update head_qa card loader path and streamline dataset configuration
a6fd3dd  Jan 30  Enable streaming for websrc card loader in configuration
700b26a  Jan 30  Add revision reference to Winogrande card loaders
4e5fd67  Jan 30  Add revision reference to PIQA card loader
edc0ae7  Jan 30  Update
5e3e4cf  Jan 30  Another trial
a94be8f  Feb 2   Refactor dataset loading to support dynamic streaming and improve con…
95db421  Feb 2   Add streaming support to turl_col_type configuration
f20529b  Feb 2   Remove unused skip files from test preparation
2d53e07  Feb 2   Merge branch 'main' into hf-cache
1935ef0  Feb 2   Refactor LoadHF class to improve dataset filtering and add streaming …
c2ed1b6  Feb 2   Merge branch 'hf-cache' of https://github.com/IBM/unitxt into hf-cache
e94b812  Feb 2   Update load_dataset function documentation to clarify caching behavio…
4c0b494  Feb 2   Update dataset loading to support caching and streaming options
fdd1a2e  Feb 2   Import UnitxtDataset in load_dataset function for improved dataset ha…
0c31189  Feb 2   Remove unused load function import from __init__.py
7f1f762  Feb 2   Remove streaming option from SEED-Bench loader configuration
060af54  Feb 2   Refactor dataset loading to utilize caching and improve dataset handling
937552b  Feb 2   Add missing imports for dataset module functionality
4dfb45f  Feb 2   Increase loader cache size to improve performance and update test set…
Files changed
2 changes: 1 addition & 1 deletion examples/evaluate_benchmark_with_custom_provider.py
@@ -4,7 +4,7 @@
 data = load_dataset(
     "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api, system_prompt=system_prompts.general.be_concise]",
     split="test",
-    disable_cache=False,
+    use_cache=True,
 )

 model = CrossProviderInferenceEngine(
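The user-facing breaking change is visible right here: load_dataset no longer takes disable_cache and instead takes use_cache. A minimal sketch of the new call, assuming the post-PR unitxt API and reusing (in shortened form) the recipe string from this diff:

from unitxt import load_dataset

data = load_dataset(
    "benchmarks.glue[max_samples_per_subset=5]",  # shortened recipe string
    split="test",
    use_cache=True,  # replaces disable_cache=False from before this PR
)
print(len(data))

Callers still passing disable_cache will have to migrate, which is why the PR title carries the BREAKING CHANGE label.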
6 changes: 3 additions & 3 deletions examples/evaluate_bluebench.py
@@ -4,14 +4,14 @@
 )

 with settings.context(
-    disable_hf_datasets_cache=False,
     allow_unverified_code=True,
     mock_inference_mode=True,
 ):
     test_dataset = load_dataset(
-        "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]", split="test"
+        "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]",
+        split="test",
+        use_cache=True,
     )

     # Infer
     model = CrossProviderInferenceEngine(
         model="llama-3-8b-instruct",
@@ -55,7 +55,7 @@
     demos_pool_size=10,
     loader_limit=1000,
     max_test_instances=10,
-    disable_cache=False,
+    use_cache=True,
     split="test",
 )
21 changes: 6 additions & 15 deletions performance/bluebench_profiler.py
@@ -7,20 +7,22 @@
 from io import StringIO
 from typing import Any, Dict, List, Union

-from unitxt.api import evaluate, load_recipe
+from unitxt.api import _source_to_dataset, evaluate, load_recipe
 from unitxt.benchmark import Benchmark
 from unitxt.inference import (
     CrossProviderInferenceEngine,
     InferenceEngine,
     TextGenerationInferenceOutput,
 )
 from unitxt.logging_utils import get_logger
-from unitxt.schema import UNITXT_DATASET_SCHEMA, loads_instance
 from unitxt.settings_utils import get_settings

 logger = get_logger()
 settings = get_settings()

+settings.allow_unverified_code = True
+settings.disable_hf_datasets_cache = False
+settings.mock_inference_mode = True


 class BlueBenchProfiler:
@@ -65,19 +67,8 @@ def profiler_instantiate_benchmark_recipe(
     def profiler_generate_benchmark_dataset(
         self, benchmark_recipe: Benchmark, split: str, **kwargs
     ) -> List[Dict[str, Any]]:
-        with settings.context(
-            disable_hf_datasets_cache=False,
-            allow_unverified_code=True,
-            mock_inference_mode=True,
-        ):
-            stream = benchmark_recipe()[split]
-
-            dataset = stream.to_dataset(
-                features=UNITXT_DATASET_SCHEMA, disable_cache=False
-            ).with_transform(loads_instance)
-
-            # to charge here for the time of generating all instances
-            return list(dataset)
+        dataset = _source_to_dataset(benchmark_recipe, split=split)
+        return list(dataset)

     def profiler_instantiate_model(self) -> InferenceEngine:
         return CrossProviderInferenceEngine(
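For the profiler, the per-call settings.context(...) block is replaced by module-level settings plus the shared _source_to_dataset helper from unitxt.api. Both configuration styles appear in this diff; a small sketch of the difference, using only the attribute names shown above:

from unitxt.settings_utils import get_settings

settings = get_settings()

# Process-wide, as the profiler now does at import time:
settings.allow_unverified_code = True

# Scoped, as the removed code did; the value reverts when the block exits:
with settings.context(allow_unverified_code=True):
    ...  # dataset generation would happen here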
2 changes: 1 addition & 1 deletion prepare/cards/CFPB_product.py
@@ -40,7 +40,7 @@
 }
 for subset, url in subset_and_urls.items():
     card = TaskCard(
-        loader=LoadCSV(files={"train": url}),
+        loader=LoadCSV(files={"train": url}, streaming=False),
         preprocess_steps=[
             SplitRandomMix(
                 {
11 changes: 6 additions & 5 deletions prepare/cards/chat_rag_bench.py
@@ -57,11 +57,12 @@
         "metrics.rouge",
     ]

-    test_card(
-        card_for_test,
-        strict=True,
-        demos_taken_from="test",
-    )
+    if subset == "doqa_travel":
+        test_card(
+            card_for_test,
+            strict=True,
+            demos_taken_from="test",
+        )
     add_to_catalog(
         card,
         f"cards.rag.response_generation.chat_rag_bench.{'train.' if split == 'train' else ''}user_assistant_format.{subset}",
87 changes: 40 additions & 47 deletions prepare/cards/head_qa.py
@@ -1,57 +1,50 @@
-from datasets import get_dataset_config_names
+import unitxt
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
     Rename,
     Set,
     TaskCard,
 )
-from unitxt.settings_utils import get_settings
 from unitxt.test_utils.card import test_card

-settings = get_settings()
-
-dataset_name = "head_qa"
-
-categories = [
-    "biology",
-    "chemistry",
-    "medicine",
-    "nursery",
-    "pharmacology",
-    "psychology",
-]
-for subset in get_dataset_config_names(
-    dataset_name, trust_remote_code=settings.allow_unverified_code
-):
-    card = TaskCard(
-        loader=LoadHF(path=f"{dataset_name}", name=subset),
-        preprocess_steps=[
-            Rename(field_to_field={"qtext": "text", "category": "label"}),
-            Set(
-                fields={
-                    "classes": categories,
-                    "text_type": "question",
-                }
-            ),
-        ],
-        task="tasks.classification.multi_class.topic_classification",
-        templates="templates.classification.multi_class.all",
-        __description__=(
-            "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology… See the full description on the dataset page: https://huggingface.co/datasets/head_qa"
-        ),
-        __tags__={
-            "annotations_creators": "no-annotation",
-            "language": ["en", "es"],
-            "language_creators": "expert-generated",
-            "license": "mit",
-            "multilinguality": "monolingual",
-            "region": "us",
-            "size_categories": "1K<n<10K",
-            "source_datasets": "original",
-            "task_categories": "question-answering",
-            "task_ids": "multiple-choice-qa",
-        },
-    )
-    test_card(card, debug=False)
-    add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)
+with unitxt.settings.context(allow_unverified_code=True):
+    for subset in ["es", "en"]:
+        card = TaskCard(
+            loader=LoadHF(path="dvilares/head_qa", name=subset),
+            preprocess_steps=[
+                Rename(field_to_field={"qtext": "text", "category": "label"}),
+                Set(
+                    fields={
+                        "classes": [
+                            "biology",
+                            "chemistry",
+                            "medicine",
+                            "nursery",
+                            "pharmacology",
+                            "psychology",
+                        ],
+                        "text_type": "question",
+                    }
+                ),
+            ],
+            task="tasks.classification.multi_class.topic_classification",
+            templates="templates.classification.multi_class.all",
+            __description__=(
+                "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology… See the full description on the dataset page: https://huggingface.co/datasets/head_qa"
+            ),
+            __tags__={
+                "annotations_creators": "no-annotation",
+                "language": ["en", "es"],
+                "language_creators": "expert-generated",
+                "license": "mit",
+                "multilinguality": "monolingual",
+                "region": "us",
+                "size_categories": "1K<n<10K",
+                "source_datasets": "original",
+                "task_categories": "question-answering",
+                "task_ids": "multiple-choice-qa",
+            },
+        )
+        test_card(card, debug=False)
+        add_to_catalog(card, f"cards.head_qa.{subset}", overwrite=True)
2 changes: 1 addition & 1 deletion prepare/cards/piqa.py
@@ -4,7 +4,7 @@
 from unitxt.test_utils.card import test_card

 card = TaskCard(
-    loader=LoadHF(path="piqa"),
+    loader=LoadHF(path="piqa", revision="refs/pr/9"),
     preprocess_steps=[
         ListFieldValues(fields=["sol1", "sol2"], to_field="choices"),
         Rename(
4 changes: 3 additions & 1 deletion prepare/cards/social_iqa.py
@@ -14,7 +14,9 @@
 with unitxt.settings.context(allow_unverified_code=True):
     card = TaskCard(
         loader=LoadHF(
-            path="allenai/social_i_qa", data_classification_policy=["public"]
+            path="allenai/social_i_qa",
+            data_classification_policy=["public"],
+            revision="refs/pr/3",
         ),
         preprocess_steps=[
             Deduplicate(by=["context", "question", "answerA", "answerB", "answerC"]),
13 changes: 2 additions & 11 deletions prepare/cards/translation/flores101.py
@@ -150,17 +150,8 @@
         task="tasks.translation.directed",
         templates="templates.translation.directed.all",
     )
-
-    test_card(card, demos_taken_from="test")
+    if pair == pairs[0]:
+        test_card(card, demos_taken_from="test")
     add_to_catalog(
         card, f"cards.mt.flores_101.{pair['src']}_{pair['tgt']}", overwrite=True
     )
-
-if __name__ == "__main__":
-    from unitxt import load_dataset
-
-    ds = load_dataset(
-        "card=cards.mt.flores_101.eng_deu,template_card_index=0",
-    )
-
-    ds["test"][0]
1 change: 1 addition & 0 deletions prepare/cards/turl_col_type.py
@@ -14,6 +14,7 @@
     loader=LoadHF(
         path="ibm/turl_table_col_type",
         data_classification_policy=["public"],
+        streaming=True,
     ),
     task=Task(
         input_fields={
2 changes: 1 addition & 1 deletion prepare/cards/websrc.py
@@ -15,7 +15,7 @@
 )

 card = TaskCard(
-    loader=LoadHF(path="rootsautomation/websrc"),
+    loader=LoadHF(path="rootsautomation/websrc", streaming=True),
     preprocess_steps=[
         RenameSplits(mapper={"train": "train", "dev": "test"}),
         "splitters.small_no_dev",
2 changes: 1 addition & 1 deletion prepare/cards/wiki_bio.py
@@ -11,7 +11,7 @@
 from unitxt.test_utils.card import test_card

 card = TaskCard(
-    loader=LoadHF(path="wiki_bio"),
+    loader=LoadHF(path="wiki_bio", streaming=True),
     preprocess_steps=[
         SplitRandomMix({"train": "train", "validation": "val", "test": "test"}),
         ListToKeyValPairs(
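Several cards in this PR (turl_col_type, websrc, and wiki_bio above, plus CFPB_product with streaming=False) now set streaming explicitly on the loader instead of relying on a global default. A minimal sketch of the knob, using the import style these cards already use:

from unitxt.blocks import LoadHF

# streaming=True: iterate the Hugging Face dataset lazily instead of
# materializing the full split up front; the usual HF streaming trade-off
# of lower memory and faster start-up in exchange for no random access.
loader = LoadHF(path="wiki_bio", streaming=True)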
4 changes: 3 additions & 1 deletion prepare/cards/winogrande.py
@@ -5,7 +5,9 @@

 for subtask in ["debiased", "l", "m", "s", "xl", "xs"]:
     card = TaskCard(
-        loader=LoadHF(path="winogrande", name=f"winogrande_{subtask}"),
+        loader=LoadHF(
+            path="winogrande", name=f"winogrande_{subtask}", revision="refs/pr/6"
+        ),
         preprocess_steps=[
             "splitters.small_no_test",
             ListFieldValues(fields=["option1", "option2"], to_field="choices"),
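PIQA, social_i_qa, and winogrande are now pinned to specific Hub revisions; "refs/pr/N" is a pull-request ref on the Hugging Face Hub. Pinning keeps catalog preparation reproducible and cache-friendly, since the loader always resolves to the same snapshot. A sketch, assuming LoadHF forwards revision to the underlying datasets library as the diffs suggest:

from unitxt.blocks import LoadHF

loader = LoadHF(
    path="winogrande",
    name="winogrande_xl",
    revision="refs/pr/6",  # a Hub PR ref; a commit SHA or tag also works
)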
1 change: 1 addition & 0 deletions pyproject.toml
@@ -184,6 +184,7 @@ keep-runtime-typing = true
 ".vscode/*" = ["TID251"]
 "tests/*" = ["TID251"]
 "utils/*" = ["TID251"]
+"src/unitxt/api.py" = ["B904"]
 "src/unitxt/__init__.py" = ["F811", "F401"]
 "src/unitxt/metric.py" = ["F811", "F401"]
 "src/unitxt/dataset.py" = ["F811", "F401"]
1 change: 0 additions & 1 deletion src/unitxt/__init__.py
@@ -5,7 +5,6 @@
     create_dataset,
     evaluate,
     infer,
-    load,
     load_dataset,
     post_process,
     produce,
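Note that load disappears from the package's public exports (the commit message calls it unused), while load_dataset remains the supported entry point. Any downstream code that imported the removed name would now fail, consistent with the BREAKING CHANGE label:

# from unitxt import load        # would raise ImportError after this PR
from unitxt import load_dataset  # still exported, as shown above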