Skip to content

Commit c9c49ef

Browse files
danielsfmorriscb
authored and committed
specify dtype in DataFrame returned by get_gene_data
This can reduce the memory footprint several-fold (scanning through the full WHB 10X data, I was able to get a process that used to require >= 50 GB to use only 10 GB).
1 parent f9e1008 commit c9c49ef

File tree

1 file changed

+15
-5
lines changed

1 file changed

+15
-5
lines changed

src/abc_atlas_access/abc_atlas_cache/anndata_utils.py

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -50,9 +50,10 @@ def get_gene_data(
5050
# Create a mask for the requested genes.
5151
gene_mask = np.isin(all_genes.gene_symbol, selected_genes)
5252
gene_filtered = all_genes[gene_mask]
53-
# Initialize our output DataFrame.
54-
output_gene_data = pd.DataFrame(index=all_cells.index,
55-
columns=gene_filtered.index)
53+
54+
# wait to create output dataframe until we have read in the
55+
# first chunk and know the dtype we need
56+
output_gene_data = None
5657

5758
num_total_cells = len(all_cells)
5859

@@ -88,9 +89,18 @@ def get_gene_data(
8889
cell_mask = cell_indexes.isin(all_cells.index)
8990
subcell_indexes = cell_indexes[cell_mask]
9091
num_processed_cells += len(subcell_indexes)
92+
93+
chunk = chunk.toarray()[cell_mask, :][:, gene_mask]
94+
95+
if output_gene_data is None:
96+
output_gene_data = pd.DataFrame(
97+
index=all_cells.index,
98+
columns=gene_filtered.index,
99+
dtype=chunk.dtype
100+
)
101+
91102
output_gene_data.loc[
92-
subcell_indexes, gene_filtered.index] = \
93-
chunk.toarray()[cell_mask][:, gene_mask]
103+
subcell_indexes, gene_filtered.index] = chunk
94104

95105
expression_data.file.close()
96106
del expression_data # Clean up our loaded file.

0 commit comments

Comments
 (0)