IBM
diff --git a/examples/evaluate_benchmark_with_custom_provider.py b/examples/evaluate_benchmark_with_custom_provider.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/evaluate_bluebench.py b/examples/evaluate_bluebench.py
Lines changed: 3 additions & 3 deletions
diff --git a/examples/evaluate_same_datasets_and_models_with_multiple_providers.py b/examples/evaluate_same_datasets_and_models_with_multiple_providers.py
Lines changed: 1 addition & 1 deletion
diff --git a/performance/bluebench_profiler.py b/performance/bluebench_profiler.py
Lines changed: 6 additions & 15 deletions
diff --git a/prepare/cards/CFPB_product.py b/prepare/cards/CFPB_product.py
Lines changed: 1 addition & 1 deletion
diff --git a/prepare/cards/chat_rag_bench.py b/prepare/cards/chat_rag_bench.py
Lines changed: 6 additions & 5 deletions
diff --git a/prepare/cards/head_qa.py b/prepare/cards/head_qa.py
Lines changed: 40 additions & 47 deletions
diff --git a/prepare/cards/piqa.py b/prepare/cards/piqa.py
Lines changed: 1 addition & 1 deletion
diff --git a/prepare/cards/social_iqa.py b/prepare/cards/social_iqa.py
Lines changed: 3 additions & 1 deletion
diff --git a/prepare/cards/translation/flores101.py b/prepare/cards/translation/flores101.py
Lines changed: 2 additions & 11 deletions
diff --git a/prepare/cards/turl_col_type.py b/prepare/cards/turl_col_type.py
Lines changed: 1 addition & 0 deletions
diff --git a/prepare/cards/websrc.py b/prepare/cards/websrc.py
Lines changed: 1 addition & 1 deletion
diff --git a/prepare/cards/wiki_bio.py b/prepare/cards/wiki_bio.py
Lines changed: 1 addition & 1 deletion
diff --git a/prepare/cards/winogrande.py b/prepare/cards/winogrande.py
Lines changed: 3 additions & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 0 deletions
diff --git a/src/unitxt/__init__.py b/src/unitxt/__init__.py
Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,7 @@
 data = load_dataset(
     "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api, system_prompt=system_prompts.general.be_concise]",
     split="test",
-    disable_cache=False,
+    use_cache=True,
 )
 
 model = CrossProviderInferenceEngine(
 
@@ -4,14 +4,14 @@
 )
 
 with settings.context(
-    disable_hf_datasets_cache=False,
     allow_unverified_code=True,
     mock_inference_mode=True,
 ):
     test_dataset = load_dataset(
-        "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]", split="test"
+        "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]",
+        split="test",
+        use_cache=True,
     )
-
 # Infer
 model = CrossProviderInferenceEngine(
     model="llama-3-8b-instruct",
 
@@ -55,7 +55,7 @@
                 demos_pool_size=10,
                 loader_limit=1000,
                 max_test_instances=10,
-                disable_cache=False,
+                use_cache=True,
                 split="test",
             )
 
 
@@ -7,20 +7,22 @@
 from io import StringIO
 from typing import Any, Dict, List, Union
 
-from unitxt.api import evaluate, load_recipe
+from unitxt.api import _source_to_dataset, evaluate, load_recipe
 from unitxt.benchmark import Benchmark
 from unitxt.inference import (
     CrossProviderInferenceEngine,
     InferenceEngine,
     TextGenerationInferenceOutput,
 )
 from unitxt.logging_utils import get_logger
-from unitxt.schema import UNITXT_DATASET_SCHEMA, loads_instance
 from unitxt.settings_utils import get_settings
 
 logger = get_logger()
 settings = get_settings()
+
 settings.allow_unverified_code = True
+settings.disable_hf_datasets_cache = False
+settings.mock_inference_mode = True
 
 
 class BlueBenchProfiler:
@@ -65,19 +67,8 @@ def profiler_instantiate_benchmark_recipe(
     def profiler_generate_benchmark_dataset(
         self, benchmark_recipe: Benchmark, split: str, **kwargs
     ) -> List[Dict[str, Any]]:
-        with settings.context(
-            disable_hf_datasets_cache=False,
-            allow_unverified_code=True,
-            mock_inference_mode=True,
-        ):
-            stream = benchmark_recipe()[split]
-
-            dataset = stream.to_dataset(
-                features=UNITXT_DATASET_SCHEMA, disable_cache=False
-            ).with_transform(loads_instance)
-
-            # to charge here for the time of generating all instances
-            return list(dataset)
+        dataset = _source_to_dataset(benchmark_recipe, split=split)
+        return list(dataset)
 
     def profiler_instantiate_model(self) -> InferenceEngine:
         return CrossProviderInferenceEngine(
 
@@ -40,7 +40,7 @@
 }
 for subset, url in subset_and_urls.items():
     card = TaskCard(
-        loader=LoadCSV(files={"train": url}),
+        loader=LoadCSV(files={"train": url}, streaming=False),
         preprocess_steps=[
             SplitRandomMix(
                 {
 
@@ -57,11 +57,12 @@
             "metrics.rouge",
         ]
 
-        test_card(
-            card_for_test,
-            strict=True,
-            demos_taken_from="test",
-        )
+        if subset == "doqa_travel":
+            test_card(
+                card_for_test,
+                strict=True,
+                demos_taken_from="test",
+            )
         add_to_catalog(
             card,
             f"cards.rag.response_generation.chat_rag_bench.{'train.' if split == 'train' else ''}user_assistant_format.{subset}",
 
@@ -1,57 +1,50 @@
-from datasets import get_dataset_config_names
+import unitxt
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
     Rename,
     Set,
     TaskCard,
 )
-from unitxt.settings_utils import get_settings
 from unitxt.test_utils.card import test_card
 
-settings = get_settings()
-
-dataset_name = "head_qa"
-
-categories = [
-    "biology",
-    "chemistry",
-    "medicine",
-    "nursery",
-    "pharmacology",
-    "psychology",
-]
-for subset in get_dataset_config_names(
-    dataset_name, trust_remote_code=settings.allow_unverified_code
-):
-    card = TaskCard(
-        loader=LoadHF(path=f"{dataset_name}", name=subset),
-        preprocess_steps=[
-            Rename(field_to_field={"qtext": "text", "category": "label"}),
-            Set(
-                fields={
-                    "classes": categories,
-                    "text_type": "question",
-                }
+with unitxt.settings.context(allow_unverified_code=True):
+    for subset in ["es", "en"]:
+        card = TaskCard(
+            loader=LoadHF(path="dvilares/head_qa", name=subset),
+            preprocess_steps=[
+                Rename(field_to_field={"qtext": "text", "category": "label"}),
+                Set(
+                    fields={
+                        "classes": [
+                            "biology",
+                            "chemistry",
+                            "medicine",
+                            "nursery",
+                            "pharmacology",
+                            "psychology",
+                        ],
+                        "text_type": "question",
+                    }
+                ),
+            ],
+            task="tasks.classification.multi_class.topic_classification",
+            templates="templates.classification.multi_class.all",
+            __description__=(
+                "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology… See the full description on the dataset page: https://huggingface.co/datasets/head_qa"
             ),
-        ],
-        task="tasks.classification.multi_class.topic_classification",
-        templates="templates.classification.multi_class.all",
-        __description__=(
-            "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology… See the full description on the dataset page: https://huggingface.co/datasets/head_qa"
-        ),
-        __tags__={
-            "annotations_creators": "no-annotation",
-            "language": ["en", "es"],
-            "language_creators": "expert-generated",
-            "license": "mit",
-            "multilinguality": "monolingual",
-            "region": "us",
-            "size_categories": "1K<n<10K",
-            "source_datasets": "original",
-            "task_categories": "question-answering",
-            "task_ids": "multiple-choice-qa",
-        },
-    )
-    test_card(card, debug=False)
-    add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)
+            __tags__={
+                "annotations_creators": "no-annotation",
+                "language": ["en", "es"],
+                "language_creators": "expert-generated",
+                "license": "mit",
+                "multilinguality": "monolingual",
+                "region": "us",
+                "size_categories": "1K<n<10K",
+                "source_datasets": "original",
+                "task_categories": "question-answering",
+                "task_ids": "multiple-choice-qa",
+            },
+        )
+        test_card(card, debug=False)
+        add_to_catalog(card, f"cards.head_qa.{subset}", overwrite=True)
@@ -4,7 +4,7 @@
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
-    loader=LoadHF(path="piqa"),
+    loader=LoadHF(path="piqa", revision="refs/pr/9"),
     preprocess_steps=[
         ListFieldValues(fields=["sol1", "sol2"], to_field="choices"),
         Rename(
 
@@ -14,7 +14,9 @@
 with unitxt.settings.context(allow_unverified_code=True):
     card = TaskCard(
         loader=LoadHF(
-            path="allenai/social_i_qa", data_classification_policy=["public"]
+            path="allenai/social_i_qa",
+            data_classification_policy=["public"],
+            revision="refs/pr/3",
         ),
         preprocess_steps=[
             Deduplicate(by=["context", "question", "answerA", "answerB", "answerC"]),
 
@@ -150,17 +150,8 @@
         task="tasks.translation.directed",
         templates="templates.translation.directed.all",
     )
-
-    test_card(card, demos_taken_from="test")
+    if pair == pairs[0]:
+        test_card(card, demos_taken_from="test")
     add_to_catalog(
         card, f"cards.mt.flores_101.{pair['src']}_{pair['tgt']}", overwrite=True
     )
-
-if __name__ == "__main__":
-    from unitxt import load_dataset
-
-    ds = load_dataset(
-        "card=cards.mt.flores_101.eng_deu,template_card_index=0",
-    )
-
-    ds["test"][0]
@@ -14,6 +14,7 @@
     loader=LoadHF(
         path="ibm/turl_table_col_type",
         data_classification_policy=["public"],
+        streaming=True,
     ),
     task=Task(
         input_fields={
 
@@ -15,7 +15,7 @@
 )
 
 card = TaskCard(
-    loader=LoadHF(path="rootsautomation/websrc"),
+    loader=LoadHF(path="rootsautomation/websrc", streaming=True),
     preprocess_steps=[
         RenameSplits(mapper={"train": "train", "dev": "test"}),
         "splitters.small_no_dev",
 
@@ -11,7 +11,7 @@
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
-    loader=LoadHF(path="wiki_bio"),
+    loader=LoadHF(path="wiki_bio", streaming=True),
     preprocess_steps=[
         SplitRandomMix({"train": "train", "validation": "val", "test": "test"}),
         ListToKeyValPairs(
 
@@ -5,7 +5,9 @@
 
 for subtask in ["debiased", "l", "m", "s", "xl", "xs"]:
     card = TaskCard(
-        loader=LoadHF(path="winogrande", name=f"winogrande_{subtask}"),
+        loader=LoadHF(
+            path="winogrande", name=f"winogrande_{subtask}", revision="refs/pr/6"
+        ),
         preprocess_steps=[
             "splitters.small_no_test",
             ListFieldValues(fields=["option1", "option2"], to_field="choices"),
 
@@ -184,6 +184,7 @@ keep-runtime-typing = true
 ".vscode/*" = ["TID251"]
 "tests/*" = ["TID251"]
 "utils/*" = ["TID251"]
+"src/unitxt/api.py" = ["B904"]
 "src/unitxt/__init__.py" = ["F811", "F401"]
 "src/unitxt/metric.py" = ["F811", "F401"]
 "src/unitxt/dataset.py" = ["F811", "F401"]
 
@@ -5,7 +5,6 @@
     create_dataset,
     evaluate,
     infer,
-    load,
     load_dataset,
     post_process,
     produce,
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@`
`4`	`4`	`data = load_dataset(`
`5`	`5`	`"benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api, system_prompt=system_prompts.general.be_concise]",`
`6`	`6`	`split="test",`
`7`		`- disable_cache=False,`
	`7`	`+ use_cache=True,`
`8`	`8`	`)`
`9`	`9`
`10`	`10`	`model = CrossProviderInferenceEngine(`
Original file line number	Diff line number	Diff line change
`@@ -55,7 +55,7 @@`
`55`	`55`	`demos_pool_size=10,`
`56`	`56`	`loader_limit=1000,`
`57`	`57`	`max_test_instances=10,`
`58`		`- disable_cache=False,`
	`58`	`+ use_cache=True,`
`59`	`59`	`split="test",`
`60`	`60`	`)`
`61`	`61`
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@`
`40`	`40`	`}`
`41`	`41`	`for subset, url in subset_and_urls.items():`
`42`	`42`	`card = TaskCard(`
`43`		`- loader=LoadCSV(files={"train": url}),`
	`43`	`+ loader=LoadCSV(files={"train": url}, streaming=False),`
`44`	`44`	`preprocess_steps=[`
`45`	`45`	`SplitRandomMix(`
`46`	`46`	`{`
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`)`
`16`	`16`
`17`	`17`	`card = TaskCard(`
`18`		`- loader=LoadHF(path="rootsautomation/websrc"),`
	`18`	`+ loader=LoadHF(path="rootsautomation/websrc", streaming=True),`
`19`	`19`	`preprocess_steps=[`
`20`	`20`	`RenameSplits(mapper={"train": "train", "dev": "test"}),`
`21`	`21`	`"splitters.small_no_dev",`