Use Lazy Loaders #1536

Merged
merged 21 commits into main from lazy_loadHF on Feb 10, 2025
Commits (21)
242d1b9
try lazy loadHF first
dafnapension Jan 21, 2025
bc772b8
reduce benchmark profiling to generating the dataset only. Not inferr…
dafnapension Jan 21, 2025
b9c3c4f
try procrastination for load csv too
dafnapension Jan 21, 2025
39a73a4
added split cache for the generators, and log limit once per data and…
dafnapension Jan 22, 2025
709ac74
make sklearn loader too - a lazy loader
dafnapension Jan 22, 2025
a4121c7
adjust to new readers for csv
dafnapension Jan 23, 2025
e741ac2
Merge branch 'main' into lazy_loadHF
elronbandel Jan 30, 2025
8ce505d
Merge remote-tracking branch 'origin/main' into lazy_loadHF
elronbandel Feb 3, 2025
24d8f49
Enhance LoadHF class to support optional splits and improve dataset l…
elronbandel Feb 4, 2025
151218d
Refactor LoadHF class to improve dataset loading and implement limit …
elronbandel Feb 4, 2025
42e1ad3
Refactor LoadHF class to streamline dataset loading and enhance split…
elronbandel Feb 4, 2025
a83be91
Merge branch 'main' into lazy_loadHF
elronbandel Feb 4, 2025
493b28c
Merge branch 'main' into lazy_loadHF
elronbandel Feb 6, 2025
27f9c4f
Remove unused import and update line number in secrets baseline
elronbandel Feb 6, 2025
294eabf
Refactor load_data method to simplify error handling and remove unnec…
elronbandel Feb 6, 2025
9ba5f1b
Merge origin/main
elronbandel Feb 10, 2025
931138f
Refactor loaders to implement LazyLoader class and update load_iterab…
elronbandel Feb 10, 2025
d21f16e
Merge remote-tracking branch 'origin/main' into lazy_loadHF
elronbandel Feb 10, 2025
85c2cab
Update exception handling in test_failed_load_csv to catch general ex…
elronbandel Feb 10, 2025
692297c
Refactor LoadHF class to streamline data loading and enhance error ha…
elronbandel Feb 10, 2025
102034b
Merge branch 'main' into lazy_loadHF
elronbandel Feb 10, 2025
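
Taken together, the commits implement one pattern: loaders (HF, CSV, sklearn) stop materializing datasets at load time and instead hand back per-split generators, cached so each split's stream is built once, with any loader limit reported once per split. A minimal self-contained sketch of that pattern follows; the class and method names (`LazyLoader`, `split_generator`, `load_iterables`) mirror the commit messages but are illustrative rather than the exact unitxt API.

```python
import csv
from typing import Any, Dict, Iterator, List, Optional


class LazyLoader:
    """Sketch of generator-based lazy loading (illustrative, not the exact unitxt API)."""

    def __init__(self, loader_limit: Optional[int] = None):
        self.loader_limit = loader_limit
        # Split cache: each split's generator is built once and reused.
        self._cache: Dict[str, Iterator[Dict[str, Any]]] = {}

    def get_splits(self) -> List[str]:
        raise NotImplementedError

    def split_generator(self, split: str) -> Iterator[Dict[str, Any]]:
        """Yield one split's instances; no I/O happens before the first next()."""
        raise NotImplementedError

    def _limited(self, split: str) -> Iterator[Dict[str, Any]]:
        if self.loader_limit is not None:
            # Reported once per split, when its generator first runs.
            print(f"limiting split '{split}' to {self.loader_limit} instances")
        for i, instance in enumerate(self.split_generator(split)):
            if self.loader_limit is not None and i >= self.loader_limit:
                return
            yield instance

    def load_iterables(self) -> Dict[str, Iterator[Dict[str, Any]]]:
        for split in self.get_splits():
            if split not in self._cache:
                self._cache[split] = self._limited(split)
        return self._cache


class LazyCSVLoader(LazyLoader):
    """Toy CSV-backed loader: files are opened only when iteration starts."""

    def __init__(self, files: Dict[str, str], loader_limit: Optional[int] = None):
        super().__init__(loader_limit)
        self.files = files  # split name -> path to a CSV file

    def get_splits(self) -> List[str]:
        return list(self.files)

    def split_generator(self, split: str) -> Iterator[Dict[str, Any]]:
        with open(self.files[split], newline="") as f:
            yield from csv.DictReader(f)
```

Since `load_iterables` only wires up generators, it returns immediately; I/O cost is paid when a split is first iterated, and splits that are never consumed cost nothing. That is what lets the profiler changes below drop inference and evaluation and still measure a meaningful load time.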
28 changes: 3 additions & 25 deletions performance/bluebench_profiler.py
@@ -93,14 +93,7 @@ def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
             benchmark_recipe=benchmark_recipe, split=split, **kwargs
         )

-        model = self.profiler_instantiate_model()
-
-        predictions = self.profiler_infer_predictions(model=model, dataset=dataset)
-
-        evaluation_result = self.profiler_evaluate_predictions(
-            predictions=predictions, dataset=dataset
-        )
-        logger.critical(f"length of evaluation_result: {len(evaluation_result)}")
+        logger.critical(f"length of bluebench generated dataset: {len(dataset)}")


 dataset_query = "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]"
@@ -154,44 +147,29 @@ def main():
     pst.strip_dirs()
     pst.sort_stats("name")  # sort by function name
     pst.print_stats(
-        "profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|profiler_instantiate_model|profiler_infer_predictions|profiler_evaluate_predictions|load_data|load_iterables"
+        "profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|load_data|load_iterables"
     )
     s = f.getvalue()
     assert s.split("\n")[7].split()[3] == "cumtime"
     overall_tot_time = find_cummtime_of(
         "profile_benchmark_blue_bench", "bluebench_profiler.py", s
     )
     load_time = find_cummtime_of("load_data", "loaders.py", s)
-    just_load_no_initial_ms_time = find_cummtime_of(
-        "load_iterables", "loaders.py", s
-    )

     instantiate_benchmark_time = find_cummtime_of(
         "profiler_instantiate_benchmark_recipe", "bluebench_profiler.py", s
     )
     generate_benchmark_dataset_time = find_cummtime_of(
         "profiler_generate_benchmark_dataset", "bluebench_profiler.py", s
     )
-    instantiate_model_time = find_cummtime_of(
-        "profiler_instantiate_model", "bluebench_profiler.py", s
-    )
-    inference_time = find_cummtime_of(
-        "profiler_infer_predictions", "bluebench_profiler.py", s
-    )
-    evaluation_time = find_cummtime_of(
-        "profiler_evaluate_predictions", "bluebench_profiler.py", s
-    )

     # Data to be written
     dictionary = {
         "dataset_query": dataset_query,
         "total_time": overall_tot_time,
         "load_time": load_time,
-        "load_time_no_initial_ms": just_load_no_initial_ms_time,
         "instantiate_benchmark_time": instantiate_benchmark_time,
         "generate_benchmark_dataset_time": generate_benchmark_dataset_time,
-        "instantiate_model_time": instantiate_model_time,
-        "inference_time": inference_time,
-        "evaluation_time": evaluation_time,
         "used_eager_mode": settings.use_eager_execution,
         "performance.prof file": temp_prof_file_path,
     }
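
With inference and evaluation out of the profile, the report boils down to generating the dataset under cProfile and filtering the pstats output for the functions of interest. A runnable toy version of that flow; the profiled function is a stand-in for the real bluebench dataset generation:

```python
import cProfile
import io
import pstats


def profiler_generate_benchmark_dataset():
    # Stand-in for the real bluebench dataset generation being timed.
    return [{"source": f"instance {i}"} for i in range(1000)]


with cProfile.Profile() as profile:
    dataset = profiler_generate_benchmark_dataset()

buffer = io.StringIO()
stats = pstats.Stats(profile, stream=buffer)
stats.strip_dirs().sort_stats("name")
# Only dataset generation and loading are reported now; model instantiation,
# inference, and evaluation are no longer part of the profile.
stats.print_stats("profiler_generate_benchmark_dataset|load_data|load_iterables")
print(buffer.getvalue())
```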
41 changes: 10 additions & 31 deletions performance/compare_benchmark_performance_results.py
@@ -1,6 +1,5 @@
 import argparse
 import json
-import os
 import sys

# Argument parser to get file paths from the command line
@@ -23,24 +22,11 @@
 print(f'dataset_query = "{main_perf["dataset_query"]}"')
 print(f"used_eager_mode in main = {main_perf['used_eager_mode']}")
 print(f"used_eager_mode in PR = {pr_perf['used_eager_mode']}")
-print(f"use Mocked inference = {os.environ['UNITXT_MOCK_INFERENCE_MODE']}")

 ratio1 = (
-    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time_no_initial_ms"])
-    / (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    if (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    > 0
-    else 1
-)
-ratio2 = (
-    pr_perf["evaluation_time"] / main_perf["evaluation_time"]
-    if main_perf["evaluation_time"] > 0
+    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time"])
+    / (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"])
+    if (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"]) > 0
     else 1
 )
# Markdown table formatting
@@ -49,26 +35,19 @@
 line2 = "--------------------|-------------|-------------|---------------\n"
 line3 = f" Total time | {main_perf['total_time']:>11} | {pr_perf['total_time']:>11} | {pr_perf['total_time'] / main_perf['total_time']:.2f}\n"
 ratio_line4 = (
-    pr_perf["load_time_no_initial_ms"] / main_perf["load_time_no_initial_ms"]
-    if main_perf["load_time_no_initial_ms"] > 0
-    else 1
+    pr_perf["load_time"] / main_perf["load_time"] if main_perf["load_time"] > 0 else 1
 )
-line4 = f" Load time | {main_perf['load_time_no_initial_ms']:>11} | {pr_perf['load_time_no_initial_ms']:>11} | {ratio_line4:.2f}\n"
+line4 = f" Load time | {main_perf['load_time']:>11} | {pr_perf['load_time']:>11} | {ratio_line4:.2f}\n"
 line5 = f" DS Gen. inc. Load | {main_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time'] / main_perf['generate_benchmark_dataset_time']:.2f}\n"
-line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time_no_initial_ms'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time_no_initial_ms'], 3):>11} | {ratio1:.2f}\n"
-line7 = f" Inference time | {main_perf['inference_time']:>11} | {pr_perf['inference_time']:>11} | {pr_perf['inference_time'] / main_perf['inference_time']:.2f}\n"
-line8 = f" Evaluate time | {main_perf['evaluation_time']:>11} | {pr_perf['evaluation_time']:>11} | {ratio2:.2f}\n"
-line9 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
-line10 = f" Model Instantiation| {main_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time'] / main_perf['instantiate_model_time']:.2f}\n"
+line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time'], 3):>11} | {ratio1:.2f}\n"
+line7 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"

 print("### Performance Comparison Results, time expressed in seconds:\n")
-print(line1 + line2 + line3 + line4 + line5 + line6 + line7 + line8 + line9 + line10)
+print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
 print("\n\n")
 # Performance degradation check (5% threshold)
-if ratio1 > 1.05 or ratio2 > 1.05:
-    print(
-        "\n**Warning**: Performance degradation in Dataset Generation and/or Evaluation exceeds 5%!"
-    )
+if ratio1 > 1.05:
+    print("\n**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
 print(
     "Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
     "followed by 'snakeviz <the performance.prof file specified in the output json file>'."
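
The surviving check guards its ratio against a zero (or negative) baseline before applying the 5% threshold. A condensed sketch of that pattern, with made-up timings and an illustrative `safe_ratio` helper that is not part of the script:

```python
def safe_ratio(pr_value: float, main_value: float) -> float:
    """Return pr/main, falling back to 1 when the baseline is not positive."""
    return pr_value / main_value if main_value > 0 else 1


# Made-up timings in seconds, standing in for the two profiler JSON files.
main_perf = {"generate_benchmark_dataset_time": 12.4, "load_time": 7.1}
pr_perf = {"generate_benchmark_dataset_time": 9.8, "load_time": 5.0}

ratio1 = safe_ratio(
    pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time"],
    main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"],
)
if ratio1 > 1.05:  # fail the check on >5% degradation
    print("**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
```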
2 changes: 1 addition & 1 deletion prepare/cards/universal_ner.py
@@ -48,7 +48,7 @@
         loader=LoadHF(
             path="universalner/universal_ner",
             name=sub_task,
-            requirements_list=["conllu"],
+            requirements=["conllu"],
         ),
         preprocess_steps=[
             # The dataset is sorted by classes
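
The same rename flows through every Universal NER catalog entry below. For card authors, the updated loader usage looks roughly like this; the import path is assumed from unitxt's loaders module, and `requirements` is the field this PR introduces in place of `requirements_list`:

```python
from unitxt.loaders import LoadHF

# `conllu` is needed to parse Universal NER; the loader now declares such
# extra dependencies via `requirements` (formerly `requirements_list`).
loader = LoadHF(
    path="universalner/universal_ner",
    name="en_ewt",
    requirements=["conllu"],
)
```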
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -190,7 +190,7 @@ keep-runtime-typing = true
 "src/unitxt/metric.py" = ["F811", "F401"]
 "src/unitxt/dataset.py" = ["F811", "F401"]
 "src/unitxt/blocks.py" = ["F811", "F401"]
-"tests/library/test_loaders.py" = ["N802", "N803"]
+"tests/library/test_loaders.py" = ["N802", "N803", "RUF015"]
 "tests/library/test_dataclass.py" = ["F811", "E731"]
 "src/unitxt/validate.py" = ["B024"]
 "src/unitxt/standard.py" = ["C901"]
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/ceb/gja.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "ceb_gja",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/da/ddt.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "da_ddt",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/de/pud.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "de_pud",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/en/ewt.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "en_ewt",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/en/pud.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "en_pud",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/hr/set.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "hr_set",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/pt/bosque.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "pt_bosque",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/pt/pud.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "pt_pud",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/ru/pud.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "ru_pud",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sk/snk.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "sk_snk",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sr/set.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "sr_set",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sv/pud.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "sv_pud",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/sv/talbanken.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "sv_talbanken",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/tl/trg.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "tl_trg",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/tl/ugnayan.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "tl_ugnayan",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/zh/gsd.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "zh_gsd",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/zh/gsdsimp.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "zh_gsdsimp",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/universal_ner/zh/pud.json
@@ -4,7 +4,7 @@
   "__type__": "load_hf",
   "path": "universalner/universal_ner",
   "name": "zh_pud",
-  "requirements_list": [
+  "requirements": [
     "conllu"
   ]
 },
6 changes: 1 addition & 5 deletions src/unitxt/fusion.py
@@ -34,11 +34,7 @@ def prepare_subsets(self):
             for i in range(len(self.subsets)):
                 self.named_subsets[i] = self.subsets[i]
         else:
-            for name, origin in self.subsets.items():
-                try:
-                    self.named_subsets[name] = origin
-                except Exception as e:
-                    raise RuntimeError(f"Exception in subset: {name}") from e
+            self.named_subsets = self.subsets

     def splits(self) -> List[str]:
         self.prepare_subsets()
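
The removed try/except wrapped a plain dict assignment that could not realistically raise, so the dict branch collapses to a single assignment. A standalone sketch of the resulting logic, with the surrounding class stripped away:

```python
from typing import Any, Dict, List, Union


def prepare_subsets(subsets: Union[List[Any], Dict[str, Any]]) -> Dict[Any, Any]:
    # Lists get integer keys; dicts of name -> origin are used as-is.
    if isinstance(subsets, list):
        return {i: subset for i, subset in enumerate(subsets)}
    return subsets
```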