Revisit huggingface cache policy - BREAKING CHANGE #1564

Merged Feb 2, 2025 (35 commits)

Commits
All 35 commits are by elronbandel, authored between Jan 29 and Feb 2, 2025.

db2b74b  Jan 29  Revisit huggingface cache policy
c863ee7  Jan 29  Enable streaming for LoadFromHFSpace and clean up commented code
0e36f1a  Jan 29  Disable Hugging Face datasets cache in CatalogPreparationTestCase
e672ca1  Jan 29  Enable streaming for wiki_bio loader in TaskCard and update JSON conf…
81873b7  Jan 29  Merge branch 'main' into hf-cache
3d91e20  Jan 29  Add conditional test card execution for 'doqa_travel' subset in chat_…
0968633  Jan 29  Merge branch 'hf-cache' of https://github.com/IBM/unitxt into hf-cache
119d07e  Jan 29  Enhance memory and performance logging in catalog preparation tests
6b84f81  Jan 29  Return parallel execution to 1 and adjust modulo for deterministic te…
a43910c  Jan 30  Try 1
b5a5ff0  Jan 30  try 1 fixed
1a421af  Jan 30  trial 2
db75df8  Jan 30  Stop testing social iqa until problem resolved
412e90b  Jan 30  Update social iqa card to use specific revision and enable testing
f6e5388  Jan 30  Refactor translation card testing logic and remove unused dataset loa…
a0e7d0d  Jan 30  Update head_qa card loader path and streamline dataset configuration
a6fd3dd  Jan 30  Enable streaming for websrc card loader in configuration
700b26a  Jan 30  Add revision reference to Winogrande card loaders
4e5fd67  Jan 30  Add revision reference to PIQA card loader
edc0ae7  Jan 30  Update
5e3e4cf  Jan 30  Another trial
a94be8f  Feb 2   Refactor dataset loading to support dynamic streaming and improve con…
95db421  Feb 2   Add streaming support to turl_col_type configuration
f20529b  Feb 2   Remove unused skip files from test preparation
2d53e07  Feb 2   Merge branch 'main' into hf-cache
1935ef0  Feb 2   Refactor LoadHF class to improve dataset filtering and add streaming …
c2ed1b6  Feb 2   Merge branch 'hf-cache' of https://github.com/IBM/unitxt into hf-cache
e94b812  Feb 2   Update load_dataset function documentation to clarify caching behavio…
4c0b494  Feb 2   Update dataset loading to support caching and streaming options
fdd1a2e  Feb 2   Import UnitxtDataset in load_dataset function for improved dataset ha…
0c31189  Feb 2   Remove unused load function import from __init__.py
7f1f762  Feb 2   Remove streaming option from SEED-Bench loader configuration
060af54  Feb 2   Refactor dataset loading to utilize caching and improve dataset handling
937552b  Feb 2   Add missing imports for dataset module functionality
4dfb45f  Feb 2   Increase loader cache size to improve performance and update test set…
Files changed
2 changes: 1 addition & 1 deletion examples/evaluate_benchmark_with_custom_provider.py
@@ -4,7 +4,7 @@
 data = load_dataset(
     "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api, system_prompt=system_prompts.general.be_concise]",
     split="test",
-    disable_cache=False,
+    use_cache=True,
 )

 model = CrossProviderInferenceEngine(
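The user-facing breaking change is visible right here: load_dataset no longer takes disable_cache and instead takes use_cache. A minimal sketch of the new call, assuming the post-PR unitxt API and reusing (in shortened form) the recipe string from this diff:

from unitxt import load_dataset

data = load_dataset(
    "benchmarks.glue[max_samples_per_subset=5]",  # shortened recipe string
    split="test",
    use_cache=True,  # replaces disable_cache=False from before this PR
)
print(len(data))

Callers still passing disable_cache will have to migrate, which is why the PR title carries the BREAKING CHANGE label.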
6 changes: 3 additions & 3 deletions examples/evaluate_bluebench.py
@@ -4,14 +4,14 @@
 )

 with settings.context(
-    disable_hf_datasets_cache=False,
     allow_unverified_code=True,
     mock_inference_mode=True,
 ):
     test_dataset = load_dataset(
-        "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]", split="test"
+        "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]",
+        split="test",
+        use_cache=True,
     )

     # Infer
     model = CrossProviderInferenceEngine(
         model="llama-3-8b-instruct",
@@ -55,7 +55,7 @@
     demos_pool_size=10,
     loader_limit=1000,
     max_test_instances=10,
-    disable_cache=False,
+    use_cache=True,
     split="test",
 )
21 changes: 6 additions & 15 deletions performance/bluebench_profiler.py
@@ -7,20 +7,22 @@
 from io import StringIO
 from typing import Any, Dict, List, Union

-from unitxt.api import evaluate, load_recipe
+from unitxt.api import _source_to_dataset, evaluate, load_recipe
 from unitxt.benchmark import Benchmark
 from unitxt.inference import (
     CrossProviderInferenceEngine,
     InferenceEngine,
     TextGenerationInferenceOutput,
 )
 from unitxt.logging_utils import get_logger
-from unitxt.schema import UNITXT_DATASET_SCHEMA, loads_instance
 from unitxt.settings_utils import get_settings

 logger = get_logger()
 settings = get_settings()

+settings.allow_unverified_code = True
+settings.disable_hf_datasets_cache = False
+settings.mock_inference_mode = True


 class BlueBenchProfiler:
@@ -65,19 +67,8 @@ def profiler_instantiate_benchmark_recipe(
     def profiler_generate_benchmark_dataset(
         self, benchmark_recipe: Benchmark, split: str, **kwargs
     ) -> List[Dict[str, Any]]:
-        with settings.context(
-            disable_hf_datasets_cache=False,
-            allow_unverified_code=True,
-            mock_inference_mode=True,
-        ):
-            stream = benchmark_recipe()[split]
-
-            dataset = stream.to_dataset(
-                features=UNITXT_DATASET_SCHEMA, disable_cache=False
-            ).with_transform(loads_instance)
-
-            # to charge here for the time of generating all instances
-            return list(dataset)
+        dataset = _source_to_dataset(benchmark_recipe, split=split)
+        return list(dataset)

     def profiler_instantiate_model(self) -> InferenceEngine:
         return CrossProviderInferenceEngine(
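For the profiler, the per-call settings.context(...) block is replaced by module-level settings plus the shared _source_to_dataset helper from unitxt.api. Both configuration styles appear in this diff; a small sketch of the difference, using only the attribute names shown above:

from unitxt.settings_utils import get_settings

settings = get_settings()

# Process-wide, as the profiler now does at import time:
settings.allow_unverified_code = True

# Scoped, as the removed code did; the value reverts when the block exits:
with settings.context(allow_unverified_code=True):
    ...  # dataset generation would happen here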
2 changes: 1 addition & 1 deletion prepare/cards/CFPB_product.py
@@ -40,7 +40,7 @@
 }
 for subset, url in subset_and_urls.items():
     card = TaskCard(
-        loader=LoadCSV(files={"train": url}),
+        loader=LoadCSV(files={"train": url}, streaming=False),
         preprocess_steps=[
             SplitRandomMix(
                 {
11 changes: 6 additions & 5 deletions prepare/cards/chat_rag_bench.py
@@ -57,11 +57,12 @@
         "metrics.rouge",
     ]

-    test_card(
-        card_for_test,
-        strict=True,
-        demos_taken_from="test",
-    )
+    if subset == "doqa_travel":
+        test_card(
+            card_for_test,
+            strict=True,
+            demos_taken_from="test",
+        )
     add_to_catalog(
         card,
         f"cards.rag.response_generation.chat_rag_bench.{'train.' if split == 'train' else ''}user_assistant_format.{subset}",
87 changes: 40 additions & 47 deletions prepare/cards/head_qa.py
@@ -1,57 +1,50 @@
-from datasets import get_dataset_config_names
+import unitxt
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
     Rename,
     Set,
     TaskCard,
 )
-from unitxt.settings_utils import get_settings
 from unitxt.test_utils.card import test_card

-settings = get_settings()
-
-dataset_name = "head_qa"
-
-categories = [
-    "biology",
-    "chemistry",
-    "medicine",
-    "nursery",
-    "pharmacology",
-    "psychology",
-]
-for subset in get_dataset_config_names(
-    dataset_name, trust_remote_code=settings.allow_unverified_code
-):
-    card = TaskCard(
-        loader=LoadHF(path=f"{dataset_name}", name=subset),
-        preprocess_steps=[
-            Rename(field_to_field={"qtext": "text", "category": "label"}),
-            Set(
-                fields={
-                    "classes": categories,
-                    "text_type": "question",
-                }
-            ),
-        ],
-        task="tasks.classification.multi_class.topic_classification",
-        templates="templates.classification.multi_class.all",
-        __description__=(
-            "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology… See the full description on the dataset page: https://huggingface.co/datasets/head_qa"
-        ),
-        __tags__={
-            "annotations_creators": "no-annotation",
-            "language": ["en", "es"],
-            "language_creators": "expert-generated",
-            "license": "mit",
-            "multilinguality": "monolingual",
-            "region": "us",
-            "size_categories": "1K<n<10K",
-            "source_datasets": "original",
-            "task_categories": "question-answering",
-            "task_ids": "multiple-choice-qa",
-        },
-    )
-    test_card(card, debug=False)
-    add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)
+with unitxt.settings.context(allow_unverified_code=True):
+    for subset in ["es", "en"]:
+        card = TaskCard(
+            loader=LoadHF(path="dvilares/head_qa", name=subset),
+            preprocess_steps=[
+                Rename(field_to_field={"qtext": "text", "category": "label"}),
+                Set(
+                    fields={
+                        "classes": [
+                            "biology",
+                            "chemistry",
+                            "medicine",
+                            "nursery",
+                            "pharmacology",
+                            "psychology",
+                        ],
+                        "text_type": "question",
+                    }
+                ),
+            ],
+            task="tasks.classification.multi_class.topic_classification",
+            templates="templates.classification.multi_class.all",
+            __description__=(
+                "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio de Sanidad, Consumo y Bienestar Social. The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology… See the full description on the dataset page: https://huggingface.co/datasets/head_qa"
+            ),
+            __tags__={
+                "annotations_creators": "no-annotation",
+                "language": ["en", "es"],
+                "language_creators": "expert-generated",
+                "license": "mit",
+                "multilinguality": "monolingual",
+                "region": "us",
+                "size_categories": "1K<n<10K",
+                "source_datasets": "original",
+                "task_categories": "question-answering",
+                "task_ids": "multiple-choice-qa",
+            },
+        )
+        test_card(card, debug=False)
+        add_to_catalog(card, f"cards.head_qa.{subset}", overwrite=True)
2 changes: 1 addition & 1 deletion prepare/cards/piqa.py
@@ -4,7 +4,7 @@
 from unitxt.test_utils.card import test_card

 card = TaskCard(
-    loader=LoadHF(path="piqa"),
+    loader=LoadHF(path="piqa", revision="refs/pr/9"),
     preprocess_steps=[
         ListFieldValues(fields=["sol1", "sol2"], to_field="choices"),
         Rename(
4 changes: 3 additions & 1 deletion prepare/cards/social_iqa.py
@@ -14,7 +14,9 @@
 with unitxt.settings.context(allow_unverified_code=True):
     card = TaskCard(
         loader=LoadHF(
-            path="allenai/social_i_qa", data_classification_policy=["public"]
+            path="allenai/social_i_qa",
+            data_classification_policy=["public"],
+            revision="refs/pr/3",
         ),
         preprocess_steps=[
             Deduplicate(by=["context", "question", "answerA", "answerB", "answerC"]),
13 changes: 2 additions & 11 deletions prepare/cards/translation/flores101.py
@@ -150,17 +150,8 @@
         task="tasks.translation.directed",
         templates="templates.translation.directed.all",
     )
-
-    test_card(card, demos_taken_from="test")
+    if pair == pairs[0]:
+        test_card(card, demos_taken_from="test")
     add_to_catalog(
         card, f"cards.mt.flores_101.{pair['src']}_{pair['tgt']}", overwrite=True
     )
-
-if __name__ == "__main__":
-    from unitxt import load_dataset
-
-    ds = load_dataset(
-        "card=cards.mt.flores_101.eng_deu,template_card_index=0",
-    )
-
-    ds["test"][0]
1 change: 1 addition & 0 deletions prepare/cards/turl_col_type.py
@@ -14,6 +14,7 @@
     loader=LoadHF(
         path="ibm/turl_table_col_type",
         data_classification_policy=["public"],
+        streaming=True,
     ),
     task=Task(
         input_fields={
2 changes: 1 addition & 1 deletion prepare/cards/websrc.py
@@ -15,7 +15,7 @@
 )

 card = TaskCard(
-    loader=LoadHF(path="rootsautomation/websrc"),
+    loader=LoadHF(path="rootsautomation/websrc", streaming=True),
     preprocess_steps=[
         RenameSplits(mapper={"train": "train", "dev": "test"}),
         "splitters.small_no_dev",
2 changes: 1 addition & 1 deletion prepare/cards/wiki_bio.py
@@ -11,7 +11,7 @@
 from unitxt.test_utils.card import test_card

 card = TaskCard(
-    loader=LoadHF(path="wiki_bio"),
+    loader=LoadHF(path="wiki_bio", streaming=True),
     preprocess_steps=[
         SplitRandomMix({"train": "train", "validation": "val", "test": "test"}),
         ListToKeyValPairs(
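Several cards in this PR (turl_col_type, websrc, and wiki_bio above, plus CFPB_product with streaming=False) now set streaming explicitly on the loader instead of relying on a global default. A minimal sketch of the knob, using the import style these cards already use:

from unitxt.blocks import LoadHF

# streaming=True: iterate the Hugging Face dataset lazily instead of
# materializing the full split up front; the usual HF streaming trade-off
# of lower memory and faster start-up in exchange for no random access.
loader = LoadHF(path="wiki_bio", streaming=True)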
4 changes: 3 additions & 1 deletion prepare/cards/winogrande.py
@@ -5,7 +5,9 @@

 for subtask in ["debiased", "l", "m", "s", "xl", "xs"]:
     card = TaskCard(
-        loader=LoadHF(path="winogrande", name=f"winogrande_{subtask}"),
+        loader=LoadHF(
+            path="winogrande", name=f"winogrande_{subtask}", revision="refs/pr/6"
+        ),
         preprocess_steps=[
             "splitters.small_no_test",
             ListFieldValues(fields=["option1", "option2"], to_field="choices"),
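PIQA, social_i_qa, and winogrande are now pinned to specific Hub revisions; "refs/pr/N" is a pull-request ref on the Hugging Face Hub. Pinning keeps catalog preparation reproducible and cache-friendly, since the loader always resolves to the same snapshot. A sketch, assuming LoadHF forwards revision to the underlying datasets library as the diffs suggest:

from unitxt.blocks import LoadHF

loader = LoadHF(
    path="winogrande",
    name="winogrande_xl",
    revision="refs/pr/6",  # a Hub PR ref; a commit SHA or tag also works
)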
1 change: 1 addition & 0 deletions pyproject.toml
@@ -184,6 +184,7 @@ keep-runtime-typing = true
 ".vscode/*" = ["TID251"]
 "tests/*" = ["TID251"]
 "utils/*" = ["TID251"]
+"src/unitxt/api.py" = ["B904"]
 "src/unitxt/__init__.py" = ["F811", "F401"]
 "src/unitxt/metric.py" = ["F811", "F401"]
 "src/unitxt/dataset.py" = ["F811", "F401"]
1 change: 0 additions & 1 deletion src/unitxt/__init__.py
@@ -5,7 +5,6 @@
     create_dataset,
     evaluate,
     infer,
-    load,
     load_dataset,
     post_process,
     produce,
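Note that load disappears from the package's public exports (the commit message calls it unused), while load_dataset remains the supported entry point. Any downstream code that imported the removed name would now fail, consistent with the BREAKING CHANGE label:

# from unitxt import load        # would raise ImportError after this PR
from unitxt import load_dataset  # still exported, as shown above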