Skip to content

Commit 6191918

Browse files
authored
Remove n_obs subsampling (#29)
* Update script.py to remove n_obs_limit subsampling
* Update CHANGELOG.md
1 parent 06d5439 commit 6191918

2 files changed

Lines changed: 12 additions & 8 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,8 @@
5656

5757
* Add `dataset_organism` to training input files (PR #24)
5858

59+
* Remove the `n_obs_limit` default setting (PR #29).
60+
5961
## BUG FIXES
6062

6163
* Update the nextflow workflow dependencies (PR #17).

src/data_processors/process_dataset/script.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
'output_test': "output/processed_datasets/test.h5ad",
1111
'train_frac': 0.9,
1212
'seed': 0,
13-
'n_obs_limit': 4000
13+
'n_obs_limit': None
1414
}
1515
meta = {
1616
"name": "process_dataset",
@@ -31,20 +31,22 @@
3131
# limit to max number of observations
3232
adata_output = adata.copy()
3333

34+
#subsample to largest batch, by default
3435
if "batch" in adata.obs:
3536
print(f">> Subsampling observations by largest batch", flush=True)
3637
batch_counts = adata.obs.groupby('batch').size()
3738
sorted_batches = batch_counts.sort_values(ascending=False)
3839
selected_batch = sorted_batches.index[0]
3940
adata_output = adata[adata.obs["batch"]==selected_batch,:].copy()
4041

41-
if adata_output.n_obs > par["n_obs_limit"]:
42-
print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True)
43-
print(f">> Setting seed to {par['seed']}", flush=True)
44-
random.seed(par["seed"])
45-
obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs)
46-
obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
47-
adata_output = adata_output[obs_index].copy()
42+
#downsample above chosen n_obs_limit
43+
#if adata_output.n_obs > par["n_obs_limit"]:
44+
#print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True)
45+
#print(f">> Setting seed to {par['seed']}", flush=True)
46+
#random.seed(par["seed"])
47+
#obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs)
48+
#obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
49+
#adata_output = adata_output[obs_index].copy()
4850

4951
# remove all layers except for counts
5052
print(">> Remove all layers except for counts", flush=True)

0 commit comments

Comments
 (0)