Skip to content

Commit 8847d1b

Browse files
authored
Batch results file based on file size (#283)
* add ChunkMap dict wrapper class * add batch size and chunk size args * implement new chunk management * estimate size from existing number of rows * [copilot] estimate and write across multiple files * replace get with defaultdict * make bytes per row a method of chunkmap * adjust logic of chunking * add tests * fix char * formatting * fix typing * change chunk size to output file size * replace comment * rename concat_batch_results * formatting * rearrange metrics in from_existing_results * revert that last commit * simplify file searching * change most references to 'chunks' to 'output file' or 'chunked output file' * store bytes per row in outputfilemap * format * fix bug * fix test bug * formatting * change to megabytes * switch to mb * remove pre-emptive rotation and adjust limit * adjust multiple chunk test * format * address single row bug * fix naming * Update CHANGELOG for version 2.2.0
1 parent 746e50d commit 8847d1b

File tree

8 files changed

+604
-96
lines changed

8 files changed

+604
-96
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
**2.2.0 - 01/02/26**
2+
3+
- Add output data file splitting
4+
15
**2.1.26 - 12/24/25**
26

37
- Fix build failure due to unsafe import of private psutil types

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
"vivarium_dependencies[lint]",
6060
"types-setuptools",
6161
"types-psutil",
62+
"pyarrow-stubs",
6263
]
6364

6465
test_requirements = [

src/vivarium_cluster_tools/psimulate/cli.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ def psimulate() -> None:
5050
cluster.with_hardware,
5151
redis_dbs.with_max_workers,
5252
redis_dbs.with_redis,
53+
results.with_batch_size,
54+
results.with_output_file_size,
5355
results.with_no_batch,
5456
results.backup_freq,
5557
cli_tools.with_verbose_and_pdb,
@@ -133,6 +135,8 @@ def run(
133135
),
134136
max_workers=options["max_workers"],
135137
redis_processes=options["redis"],
138+
batch_size=options["batch_size"],
139+
output_file_size=options["output_file_size"],
136140
no_batch=options["no_batch"],
137141
backup_freq=options["backup_freq"],
138142
extra_args={
@@ -178,6 +182,8 @@ def restart(
178182
),
179183
max_workers=options["max_workers"],
180184
redis_processes=options["redis"],
185+
batch_size=options["batch_size"],
186+
output_file_size=options["output_file_size"],
181187
no_batch=options["no_batch"],
182188
backup_freq=options["backup_freq"],
183189
extra_args={
@@ -238,6 +244,8 @@ def expand(
238244
),
239245
max_workers=options["max_workers"],
240246
redis_processes=options["redis"],
247+
batch_size=options["batch_size"],
248+
output_file_size=options["output_file_size"],
241249
no_batch=options["no_batch"],
242250
backup_freq=options["backup_freq"],
243251
extra_args={
@@ -316,6 +324,8 @@ def test(
316324
),
317325
max_workers=options["max_workers"],
318326
redis_processes=options["redis"],
327+
batch_size=options["batch_size"],
328+
output_file_size=options["output_file_size"],
319329
no_batch=options["no_batch"],
320330
backup_freq=options["backup_freq"],
321331
extra_args={

src/vivarium_cluster_tools/psimulate/results/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,15 @@
44
==================
55
66
"""
7+
78
from vivarium_cluster_tools.psimulate.results.cli_options import (
89
backup_freq,
10+
with_batch_size,
911
with_no_batch,
1012
with_no_cleanup,
13+
with_output_file_size,
14+
)
15+
from vivarium_cluster_tools.psimulate.results.processing import (
16+
OutputFileMap,
17+
write_results_batch,
1118
)
12-
from vivarium_cluster_tools.psimulate.results.processing import write_results_batch

src/vivarium_cluster_tools/psimulate/results/cli_options.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,34 @@
66
Command line options for configuring results handling in psimulate runs.
77
88
"""
9+
910
import click
1011

1112
from vivarium_cluster_tools.cli_tools import MINUTES_OR_NONE
1213

14+
DEFAULT_BATCH_SIZE = 200
15+
DEFAULT_OUTPUT_FILE_SIZE_MB = 500.0
16+
17+
with_batch_size = click.option(
18+
"--batch-size",
19+
type=int,
20+
default=DEFAULT_BATCH_SIZE,
21+
show_default=True,
22+
help="Number of simulation jobs to accumulate before writing results to disk.",
23+
)
24+
with_output_file_size = click.option(
25+
"--output-file-size",
26+
"output_file_size",
27+
type=float,
28+
default=DEFAULT_OUTPUT_FILE_SIZE_MB,
29+
show_default=True,
30+
help="Maximum file size in MB per result file (within each type of result). When exceeded, a new file is started.",
31+
callback=lambda ctx, param, value: int(value * 1024 * 1024), # Convert MB to bytes
32+
)
1333
with_no_batch = click.option(
14-
"--no-batch", is_flag=True, help="Don't batch results, write them as they come in."
34+
"--no-batch",
35+
is_flag=True,
36+
help="Write results immediately as they come in (batch_size=0). Files are still split by size per --output-file-size.",
1537
)
1638
with_no_cleanup = click.option(
1739
"--no-cleanup",

0 commit comments

Comments
 (0)