Skip to content
Open
162 changes: 94 additions & 68 deletions concat-on-disk.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"\n",
"import anndata\n",
"from anndata.tests.helpers import gen_typed_df\n",
"from anndata.experimental import write_elem\n",
"from anndata.experimental import write_dispatched\n",
"from anndata.experimental import concat_on_disk\n",
"from dask.distributed.diagnostics.memray import memray_scheduler, memray_workers"
]
Expand Down Expand Up @@ -282,11 +282,37 @@
"metadata": {},
"outputs": [],
"source": [
"# TODO: Refer to Ilan's notebook for the code below.\n",
"# TODO: Add a short description of the function below.\n",
"# TODO: Explain why/when writing chunked data is better, even though it is not required.\n",
"\n",
"def write_chunked(func, store, k, elem, dataset_kwargs, iospec):\n",
"    \"\"\"Write callback that chunks X and layers.\n",
"\n",
"    Only elements whose ``iospec.encoding_type`` is ``\"array\"`` are touched:\n",
"    keys that contain ``'layers'`` or end in ``'X'`` are written with\n",
"    ``(1000, 1000)`` chunks, while every other array has any ``\"chunks\"``\n",
"    entry removed from its kwargs. Elements of any other encoding type are\n",
"    passed through with their kwargs untouched.\n",
"\n",
"    The incoming ``dataset_kwargs`` is copied before it is edited, and the\n",
"    actual write is always delegated to ``func``.\n",
"    \"\"\"\n",
"    if iospec.encoding_type == \"array\":\n",
"        # Copy first so the mapping handed in by the caller is never modified.\n",
"        dataset_kwargs = dict(dataset_kwargs)\n",
"        is_matrix_like = 'layers' in k or k.endswith('X')\n",
"        if is_matrix_like:\n",
"            dataset_kwargs[\"chunks\"] = (1000, 1000)\n",
"        else:\n",
"            dataset_kwargs.pop(\"chunks\", None)\n",
"\n",
"    func(store, k, elem, dataset_kwargs=dataset_kwargs)\n",
"\n",
"def write_data_to_zarr(X, shape_type, array_name, outdir, file_id):\n",
"    \"\"\"Write ``X`` to a zarr store as an AnnData object and report what was written.\n",
"\n",
"    The store is named ``<file_id:02d>_<shape_type>_<array_name>.zarr`` inside\n",
"    ``outdir`` and is opened with ``mode=\"w\"``. The object built by\n",
"    ``create_adata(X)`` is written once, through ``write_dispatched`` with the\n",
"    ``write_chunked`` callback, and the zarr metadata is consolidated afterwards.\n",
"\n",
"    Returns a one-line status string containing the shape of ``X``,\n",
"    ``array_name`` and the output path.\n",
"    \"\"\"\n",
"    outfile = outdir / f\"{file_id:02d}_{shape_type}_{array_name}.zarr\"\n",
"    adata = create_adata(X)\n",
"    z = zarr.open_group(outfile, mode=\"w\")\n",
"\n",
"    # Write every element exactly once, routed through the chunking callback.\n",
"    write_dispatched(z, \"/\", adata, callback=write_chunked)\n",
"    zarr.consolidate_metadata(z.store)\n",
"    return f\"wrote {X.shape[0]}x{X.shape[1]}_{array_name} -> {str(outfile)}\\n\"\n",
"\n",
Expand Down Expand Up @@ -333,33 +359,33 @@
"name": "stdout",
"output_type": "stream",
"text": [
"wrote 13747x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/01_fat_csc.zarr\n",
"wrote 13747x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/02_fat_csr.zarr\n",
"wrote 12361x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/03_fat_np.zarr\n",
"wrote 10000x13069_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/04_tall_csc.zarr\n",
"wrote 10000x13069_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/05_tall_csr.zarr\n",
"wrote 10000x12903_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/06_tall_np.zarr\n",
"wrote 10000x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/07_square_csc.zarr\n",
"wrote 10000x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/08_square_csr.zarr\n",
"wrote 10000x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/09_square_np.zarr\n",
"wrote 13588x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/10_fat_csc.zarr\n",
"wrote 13588x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/11_fat_csr.zarr\n",
"wrote 13086x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/12_fat_np.zarr\n",
"wrote 10000x13531_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/13_tall_csc.zarr\n",
"wrote 10000x13531_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/14_tall_csr.zarr\n",
"wrote 10000x13661_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/15_tall_np.zarr\n",
"wrote 10000x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/16_square_csc.zarr\n",
"wrote 10000x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/17_square_csr.zarr\n",
"wrote 10000x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/18_square_np.zarr\n",
"wrote 13539x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/19_fat_csc.zarr\n",
"wrote 13539x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/20_fat_csr.zarr\n",
"wrote 13777x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/21_fat_np.zarr\n",
"wrote 10000x12984_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/22_tall_csc.zarr\n",
"wrote 10000x12984_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/23_tall_csr.zarr\n",
"wrote 10000x12289_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/24_tall_np.zarr\n",
"wrote 10000x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/25_square_csc.zarr\n",
"wrote 10000x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/26_square_csr.zarr\n",
"wrote 10000x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp75esna2p/27_square_np.zarr\n"
"wrote 12454x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/01_fat_csc.zarr\n",
"wrote 12454x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/02_fat_csr.zarr\n",
"wrote 13757x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/03_fat_np.zarr\n",
"wrote 10000x12370_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/04_tall_csc.zarr\n",
"wrote 10000x12370_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/05_tall_csr.zarr\n",
"wrote 10000x13235_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/06_tall_np.zarr\n",
"wrote 10000x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/07_square_csc.zarr\n",
"wrote 10000x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/08_square_csr.zarr\n",
"wrote 10000x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/09_square_np.zarr\n",
"wrote 12558x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/10_fat_csc.zarr\n",
"wrote 12558x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/11_fat_csr.zarr\n",
"wrote 13957x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/12_fat_np.zarr\n",
"wrote 10000x12236_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/13_tall_csc.zarr\n",
"wrote 10000x12236_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/14_tall_csr.zarr\n",
"wrote 10000x12851_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/15_tall_np.zarr\n",
"wrote 10000x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/16_square_csc.zarr\n",
"wrote 10000x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/17_square_csr.zarr\n",
"wrote 10000x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/18_square_np.zarr\n",
"wrote 12888x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/19_fat_csc.zarr\n",
"wrote 12888x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/20_fat_csr.zarr\n",
"wrote 13955x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/21_fat_np.zarr\n",
"wrote 10000x12558_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/22_tall_csc.zarr\n",
"wrote 10000x12558_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/23_tall_csr.zarr\n",
"wrote 10000x13348_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/24_tall_np.zarr\n",
"wrote 10000x10000_csc -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/25_square_csc.zarr\n",
"wrote 10000x10000_csr -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/26_square_csr.zarr\n",
"wrote 10000x10000_np -> /var/folders/w4/rlbyb2md7y50tspf85v1lc440000gn/T/tmp6_ga523e/27_square_np.zarr\n"
]
}
],
Expand Down Expand Up @@ -597,15 +623,15 @@
"text": [
"Dataset: sparse 0\n",
"Concatenating 6 files with sizes:\n",
"['78MiB', '78MiB', '106MiB', '78MiB', '107MiB', '106MiB']\n",
"Total size: 556MiB\n",
"['78MiB', '78MiB', '101MiB', '98MiB', '97MiB', '78MiB']\n",
"Total size: 533MiB\n",
"Concatenation finished\n",
"Peak Memory: 17 MiB\n",
"Peak Memory: 16 MiB\n",
"--------------------------------------------------\n",
"Dataset: sparse 1\n",
"Concatenating 6 files with sizes:\n",
"['101MiB', '106MiB', '78MiB', '78MiB', '78MiB', '102MiB']\n",
"Total size: 546MiB\n",
"['97MiB', '98MiB', '78MiB', '78MiB', '96MiB', '78MiB']\n",
"Total size: 527MiB\n",
"Concatenation finished\n",
"Peak Memory: 16 MiB\n",
"--------------------------------------------------\n"
Expand All @@ -628,23 +654,23 @@
"text": [
"Dataset: dense 0\n",
"Concatenating 6 files with sizes:\n",
"['668MiB', '827MiB', '668MiB', '920MiB', '875MiB', '668MiB']\n",
"Total size: 4630MiB\n",
"['668MiB', '668MiB', '919MiB', '668MiB', '932MiB', '932MiB']\n",
"Total size: 4789MiB\n",
"Concatenation finished\n",
"Peak Memory: 2740 MiB\n",
"Peak Memory: 388 MiB\n",
"--------------------------------------------------\n",
"Dataset: dense 1\n",
"Concatenating 6 files with sizes:\n",
"['912MiB', '823MiB', '668MiB', '668MiB', '668MiB', '864MiB']\n",
"Total size: 4606MiB\n",
"['859MiB', '668MiB', '885MiB', '668MiB', '892MiB', '668MiB']\n",
"Total size: 4641MiB\n",
"Concatenation finished\n",
"Peak Memory: 3450 MiB\n",
"Peak Memory: 344 MiB\n",
"--------------------------------------------------\n"
Comment on lines 656 to 668
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These results were obtained after applying the changes in scverse/anndata#1169.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @ivirshup said, you also added `write_chunked`; are you sure the anndata changes are necessary?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I first tried with only `write_chunked`. If you run the notebook on the main branch of anndata, you will get the old results.

]
}
],
"source": [
"dataset_max_mem(max_arg=\"2000MiB\", datasets=datasets_aligned, array_type=\"dense\");"
"dataset_max_mem(max_arg=\"4000MiB\", datasets=datasets_aligned, array_type=\"dense\");"
]
},
{
Expand Down Expand Up @@ -673,17 +699,17 @@
"text": [
"Dataset: sparse 0\n",
"Concatenating 9 files with sizes:\n",
"['78MiB', '78MiB', '106MiB', '102MiB', '106MiB', '102MiB', '78MiB', '107MiB', '106MiB']\n",
"Total size: 867MiB\n",
"['78MiB', '78MiB', '101MiB', '98MiB', '98MiB', '97MiB', '96MiB', '97MiB', '78MiB']\n",
"Total size: 825MiB\n",
"Concatenation finished\n",
"Peak Memory: 286 MiB\n",
"Peak Memory: 273 MiB\n",
"--------------------------------------------------\n",
"Dataset: sparse 1\n",
"Concatenating 9 files with sizes:\n",
"['106MiB', '101MiB', '106MiB', '106MiB', '108MiB', '78MiB', '78MiB', '78MiB', '102MiB']\n",
"Total size: 867MiB\n",
"['97MiB', '101MiB', '97MiB', '98MiB', '98MiB', '78MiB', '78MiB', '96MiB', '78MiB']\n",
"Total size: 825MiB\n",
"Concatenation finished\n",
"Peak Memory: 445 MiB\n",
"Peak Memory: 425 MiB\n",
"--------------------------------------------------\n"
]
}
Expand All @@ -704,23 +730,23 @@
"text": [
"Dataset: dense 0\n",
"Concatenating 9 files with sizes:\n",
"['912MiB', '823MiB', '827MiB', '668MiB', '668MiB', '920MiB', '875MiB', '668MiB', '864MiB']\n",
"Total size: 7230MiB\n",
"['859MiB', '668MiB', '885MiB', '668MiB', '919MiB', '892MiB', '668MiB', '932MiB', '932MiB']\n",
"Total size: 7426MiB\n",
"Concatenation finished\n",
"Peak Memory: 2931 MiB\n",
"Peak Memory: 662 MiB\n",
"--------------------------------------------------\n",
"Dataset: dense 1\n",
"Concatenating 9 files with sizes:\n",
"['912MiB', '823MiB', '827MiB', '668MiB', '668MiB', '920MiB', '875MiB', '668MiB', '864MiB']\n",
"Total size: 7230MiB\n",
"['859MiB', '668MiB', '885MiB', '668MiB', '919MiB', '892MiB', '668MiB', '932MiB', '932MiB']\n",
"Total size: 7426MiB\n",
"Concatenation finished\n",
"Peak Memory: 3152 MiB\n",
"Peak Memory: 389 MiB\n",
"--------------------------------------------------\n"
]
}
],
"source": [
"dataset_max_mem(max_arg=\"2000MiB\", datasets=datasets_unaligned, array_type=\"dense\");"
"dataset_max_mem(max_arg=\"4000MiB\", datasets=datasets_unaligned, array_type=\"dense\");"
]
},
{
Expand All @@ -738,7 +764,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 16,
"id": "7ba971e1",
"metadata": {},
"outputs": [],
Expand All @@ -751,7 +777,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 17,
"id": "f2b1afa8",
"metadata": {},
"outputs": [
Expand All @@ -761,17 +787,17 @@
"text": [
"Dataset: sparse 0\n",
"Concatenating 3 files with sizes:\n",
"['78MiB', '78MiB', '106MiB']\n",
"Total size: 263MiB\n",
"['78MiB', '78MiB', '101MiB']\n",
"Total size: 258MiB\n",
"Concatenation finished\n",
"Peak Memory: 11 MiB\n",
"--------------------------------------------------\n",
"Dataset: sparse 1\n",
"Concatenating 3 files with sizes:\n",
"['106MiB', '101MiB', '106MiB']\n",
"Total size: 315MiB\n",
"['97MiB', '101MiB', '97MiB']\n",
"Total size: 296MiB\n",
"Concatenation finished\n",
"Peak Memory: 38 MiB\n",
"Peak Memory: 39 MiB\n",
"--------------------------------------------------\n"
]
}
Expand All @@ -782,7 +808,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 18,
"id": "57823bff",
"metadata": {},
"outputs": [
Expand All @@ -792,17 +818,17 @@
"text": [
"Dataset: sparse 0\n",
"Concatenating 3 files with sizes:\n",
"['78MiB', '78MiB', '106MiB']\n",
"Total size: 263MiB\n",
"['78MiB', '78MiB', '101MiB']\n",
"Total size: 258MiB\n",
"Concatenation finished\n",
"Peak Memory: 11 MiB\n",
"--------------------------------------------------\n",
"Dataset: sparse 1\n",
"Concatenating 3 files with sizes:\n",
"['106MiB', '101MiB', '106MiB']\n",
"Total size: 315MiB\n",
"['97MiB', '101MiB', '97MiB']\n",
"Total size: 296MiB\n",
"Concatenation finished\n",
"Peak Memory: 432 MiB\n",
"Peak Memory: 416 MiB\n",
"--------------------------------------------------\n"
]
}
Expand All @@ -822,7 +848,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"id": "05125e6e",
"metadata": {},
"outputs": [],
Expand Down