Skip to content

Commit e8db73c

Browse files
committed
all passing
1 parent 3edf658 commit e8db73c

8 files changed

Lines changed: 340 additions & 186 deletions

File tree

benchmarks/benchmark_objdet.py

Lines changed: 21 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,8 @@
99
import requests
1010
from tqdm import tqdm
1111

12-
from valor_lite.object_detection import BoundingBox, DataLoader, Detection
12+
from valor_lite.object_detection import BoundingBox, Detection
13+
from valor_lite.object_detection.loader import Loader
1314

1415

1516
def _get_bbox_extrema(
@@ -156,7 +157,7 @@ def write_results_to_file(write_path: Path, results: list[dict]):
156157

157158
@time_it
158159
def ingest(
159-
manager: DataLoader,
160+
loader: Loader,
160161
gt_path: Path,
161162
pd_path: Path,
162163
limit: int,
@@ -184,12 +185,12 @@ def ingest(
184185
elif len(detections) < chunk_size or chunk_size == -1:
185186
continue
186187

187-
timer, _ = time_it(manager.add_bounding_boxes)(detections)
188+
timer, _ = time_it(loader.add_bounding_boxes)(detections)
188189
accumulated_time += timer
189190
detections = []
190191

191192
if detections:
192-
timer, _ = time_it(manager.add_bounding_boxes)(detections)
193+
timer, _ = time_it(loader.add_bounding_boxes)(detections)
193194
accumulated_time += timer
194195

195196
return accumulated_time
@@ -297,18 +298,25 @@ def run_benchmarking_analysis(
297298
pd_filename = prediction_caches[pd_type]
298299

299300
# === Base Evaluation ===
300-
manager = DataLoader()
301+
loader = Loader.create(
302+
".valor/objdet_benchmark",
303+
batch_size=1_000,
304+
rows_per_file=10_000,
305+
delete_if_exists=True,
306+
)
301307

302308
# ingest + preprocess
303309
(ingest_time, preprocessing_time,) = ingest(
304-
manager=manager,
310+
loader=loader,
305311
gt_path=current_directory / Path(gt_filename),
306312
pd_path=current_directory / Path(pd_filename),
307313
limit=limit,
308314
chunk_size=chunk_size,
309315
) # type: ignore - time_it wrapper
310316

311-
finalization_time, evaluator = time_it(manager.finalize)()
317+
finalization_time, evaluator = time_it(loader.finalize)(
318+
batch_size=10_000
319+
)
312320

313321
if ingest_time > ingestion_timeout and ingestion_timeout != -1:
314322
raise TimeoutError(
@@ -322,7 +330,7 @@ def run_benchmarking_analysis(
322330
)
323331
if eval_time > evaluation_timeout and evaluation_timeout != -1:
324332
raise TimeoutError(
325-
f"Base evaluation timed out with {evaluator.metadata.number_of_datums} datums."
333+
f"Base evaluation timed out with {evaluator.info.number_of_datums} datums."
326334
)
327335

328336
# evaluate - base metrics + detailed
@@ -337,16 +345,16 @@ def run_benchmarking_analysis(
337345
and evaluation_timeout != -1
338346
):
339347
raise TimeoutError(
340-
f"Detailed evaluation timed out with {evaluator.metadata.number_of_datums} datums."
348+
f"Detailed evaluation timed out with {evaluator.info.number_of_datums} datums."
341349
)
342350

343351
results.append(
344352
Benchmark(
345353
limit=limit,
346-
n_datums=evaluator.metadata.number_of_datums,
347-
n_groundtruths=evaluator.metadata.number_of_ground_truths,
348-
n_predictions=evaluator.metadata.number_of_predictions,
349-
n_labels=evaluator.metadata.number_of_labels,
354+
n_datums=evaluator.info.number_of_datums,
355+
n_groundtruths=evaluator.info.number_of_groundtruth_annotations,
356+
n_predictions=evaluator.info.number_of_prediction_annotations,
357+
n_labels=evaluator.info.number_of_labels,
350358
gt_type=gt_type,
351359
pd_type=pd_type,
352360
chunk_size=chunk_size,

src/valor_lite/cache.py

Lines changed: 79 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -68,21 +68,41 @@ def __init__(self, path: str | Path):
6868
self._path = Path(path)
6969

7070
@property
71-
def files(self) -> list[str]:
71+
def path(self) -> Path:
72+
return self._path
73+
74+
@property
75+
def files(self) -> list[Path]:
76+
if not self.path.exists():
77+
return []
78+
elif not self.path.is_dir():
79+
raise NotADirectoryError(
80+
f"Path exists but is not a directory: {self._path}"
81+
)
82+
7283
files = []
7384
for entry in os.listdir(self._path):
7485
full_path = os.path.join(self._path, entry)
7586
if os.path.isfile(full_path):
76-
files.append(full_path)
87+
files.append(Path(full_path))
7788
return files
7889

7990
@property
8091
def num_files(self) -> int:
8192
return len(self.files)
8293

8394
@property
84-
def dataset_files(self) -> list[str]:
85-
return glob.glob(f"{self._path}/*.parquet")
95+
def dataset_files(self) -> list[Path]:
96+
if not self.path.exists():
97+
return []
98+
elif not self.path.is_dir():
99+
raise NotADirectoryError(
100+
f"Path exists but is not a directory: {self._path}"
101+
)
102+
103+
return [
104+
Path(filepath) for filepath in glob.glob(f"{self._path}/*.parquet")
105+
]
86106

87107
@property
88108
def num_dataset_files(self) -> int:
@@ -92,16 +112,19 @@ def num_dataset_files(self) -> int:
92112
def _generate_config_path(path: str | Path) -> Path:
93113
return Path(path) / ".cfg"
94114

95-
@staticmethod
96-
def _get_dataset_from_path(path: str | Path) -> ds.Dataset:
97-
return ds.dataset(path, format="parquet")
98-
99115

100116
class CacheReader(CacheFiles):
101-
def __init__(self, path: str | Path):
117+
def __init__(
118+
self,
119+
path: str | Path,
120+
batch_size: int,
121+
rows_per_file: int,
122+
compression: str,
123+
):
102124
self._path = Path(path)
103-
self._cfg = None
104-
self._dataset = None
125+
self._batch_size = batch_size
126+
self._rows_per_file = rows_per_file
127+
self._compression = compression
105128

106129
# validate path
107130
if not self._path.exists():
@@ -111,45 +134,48 @@ def __init__(self, path: str | Path):
111134
f"Path exists but is not a directory: {self._path}"
112135
)
113136

137+
@classmethod
138+
def load(cls, path: str | Path):
139+
def _retrieve(config: dict, key: str):
140+
if value := config.get(key, None):
141+
return value
142+
raise KeyError(
143+
f"'{key}' is not defined within {cls._generate_config_path(path)}"
144+
)
145+
146+
cfg_path = cls._generate_config_path(path)
147+
with open(cfg_path, "r") as f:
148+
cfg = json.load(f)
149+
batch_size = _retrieve(cfg, "batch_size")
150+
rows_per_file = _retrieve(cfg, "rows_per_file")
151+
compression = _retrieve(cfg, "compression")
152+
153+
return cls(
154+
path=path,
155+
batch_size=batch_size,
156+
rows_per_file=rows_per_file,
157+
compression=compression,
158+
)
159+
114160
@property
115161
def dataset(self) -> ds.Dataset:
116-
if not self._dataset:
117-
self._dataset = ds.dataset(
118-
self._path,
119-
format="parquet",
120-
)
121-
return self._dataset
162+
return ds.dataset(self._path, format="parquet")
122163

123164
@property
124165
def schema(self) -> pa.Schema:
125166
return self.dataset.schema
126167

127-
@property
128-
def config(self) -> dict:
129-
if self._cfg is None:
130-
cfg_path = self._generate_config_path(self._path)
131-
with open(cfg_path, "r") as f:
132-
self._cfg = json.load(f)
133-
return self._cfg
134-
135-
def _read_config(self, key: str):
136-
if value := self.config.get(key, None):
137-
return value
138-
raise KeyError(
139-
f"'{key}' is not defined within {self._generate_config_path(self._path)}"
140-
)
141-
142168
@property
143169
def batch_size(self) -> int:
144-
return int(self._read_config("batch_size"))
170+
return self._batch_size
145171

146172
@property
147173
def rows_per_file(self) -> int:
148-
return int(self._read_config("rows_per_file"))
174+
return self._rows_per_file
149175

150176
@property
151177
def compression(self) -> str:
152-
return str(self._read_config("compression"))
178+
return self._compression
153179

154180

155181
class CacheWriter(CacheFiles):
@@ -209,7 +235,7 @@ def create(
209235
@classmethod
210236
def load(cls, path: str | Path):
211237
cfg_path = cls._generate_config_path(path)
212-
dataset = cls._get_dataset_from_path(path)
238+
dataset = ds.dataset(path, format="parquet")
213239
with open(cfg_path, "r") as f:
214240
cfg = json.load(f)
215241
return cls(
@@ -218,6 +244,23 @@ def load(cls, path: str | Path):
218244
**cfg,
219245
)
220246

247+
@classmethod
248+
def delete(cls, path: str | Path):
249+
path = Path(path)
250+
if not path.exists():
251+
return
252+
cache = cls.load(path)
253+
# delete config file
254+
cfg_path = cls._generate_config_path(path)
255+
if cfg_path.exists() and cfg_path.is_file():
256+
cfg_path.unlink()
257+
# delete parquet files
258+
for file in cache.dataset_files:
259+
if file.exists() and file.is_file() and file.suffix == ".parquet":
260+
file.unlink()
261+
# delete empty cache directory
262+
path.rmdir()
263+
221264
def write_rows(
222265
self,
223266
rows: list[dict[str, Any]],
@@ -286,10 +329,6 @@ def flush(self):
286329
self._count = 0
287330
self._close_writer()
288331

289-
def delete(self):
290-
for file in self.files:
291-
Path(file).unlink()
292-
293332
def _next_filename(self) -> Path:
294333
files = self.dataset_files
295334
if not files:

src/valor_lite/object_detection/evaluator.py

Lines changed: 28 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -68,30 +68,16 @@ def __init__(
6868
number_of_groundtruths_per_label
6969
)
7070

71-
@property
72-
def path(self) -> Path:
73-
return self._path
74-
75-
@property
76-
def detailed(self) -> CacheReader:
77-
return self._detailed_cache
78-
79-
@property
80-
def ranked(self) -> CacheReader:
81-
return self._ranked_cache
82-
83-
@property
84-
def info(self) -> EvaluatorInfo:
85-
return self._info
86-
8771
@classmethod
8872
def load(
8973
cls,
9074
path: str | Path,
9175
index_to_label_override: dict[int, str] | None = None,
9276
):
93-
detailed_cache = CacheReader(cls._generate_detailed_cache_path(path))
94-
ranked_cache = CacheReader(cls._generate_ranked_cache_path(path))
77+
detailed_cache = CacheReader.load(
78+
cls._generate_detailed_cache_path(path)
79+
)
80+
ranked_cache = CacheReader.load(cls._generate_ranked_cache_path(path))
9581

9682
# build evaluator meta
9783
(
@@ -215,6 +201,30 @@ def filter(
215201
index_to_label_override=self._index_to_label,
216202
)
217203

204+
def delete(self):
205+
"""
206+
Delete evaluator cache.
207+
"""
208+
from valor_lite.object_detection.loader import Loader
209+
210+
Loader.delete(self.path)
211+
212+
@property
213+
def path(self) -> Path:
214+
return self._path
215+
216+
@property
217+
def detailed(self) -> CacheReader:
218+
return self._detailed_cache
219+
220+
@property
221+
def ranked(self) -> CacheReader:
222+
return self._ranked_cache
223+
224+
@property
225+
def info(self) -> EvaluatorInfo:
226+
return self._info
227+
218228
@staticmethod
219229
def generate_meta(
220230
dataset: ds.Dataset,

src/valor_lite/object_detection/legacy.py

Lines changed: 0 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import tempfile
22
from dataclasses import asdict, dataclass
3-
from pathlib import Path
43

54
import numpy as np
65
import pyarrow.compute as pc
@@ -334,20 +333,3 @@ class DataLoader(CachedLoader):
334333
def finalize(self) -> Evaluator: # type: ignore - switching type
335334
evaluator = super().finalize()
336335
return Evaluator.load(evaluator.path)
337-
338-
@classmethod
339-
def filter(
340-
cls,
341-
path: str | Path,
342-
evaluator: CachedEvaluator,
343-
filter_expr: Filter,
344-
) -> Evaluator:
345-
evaluator = super().filter(
346-
path=path,
347-
evaluator=evaluator,
348-
filter_expr=filter_expr,
349-
)
350-
return Evaluator.load(
351-
path=path,
352-
index_to_label_override=evaluator._index_to_label,
353-
)

0 commit comments

Comments
 (0)