|
10 | 10 | 'output_test': "output/processed_datasets/test.h5ad", |
11 | 11 | 'train_frac': 0.9, |
12 | 12 | 'seed': 0, |
13 | | - 'n_obs_limit': 4000 |
| 13 | + 'n_obs_limit': None |
14 | 14 | } |
15 | 15 | meta = { |
16 | 16 | "name": "process_dataset", |
|
31 | 31 | # limit to max number of observations |
32 | 32 | adata_output = adata.copy() |
33 | 33 |
|
| 34 | +# By default, keep only the observations from the largest batch (when a "batch" column exists) |
34 | 35 | if "batch" in adata.obs: |
35 | 36 | print(f">> Subsampling observations by largest batch", flush=True) |
36 | 37 | batch_counts = adata.obs.groupby('batch').size() |
37 | 38 | sorted_batches = batch_counts.sort_values(ascending=False) |
38 | 39 | selected_batch = sorted_batches.index[0] |
39 | 40 | adata_output = adata[adata.obs["batch"]==selected_batch,:].copy() |
40 | 41 |
|
41 | | -if adata_output.n_obs > par["n_obs_limit"]: |
42 | | - print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True) |
43 | | - print(f">> Setting seed to {par['seed']}", flush=True) |
44 | | - random.seed(par["seed"]) |
45 | | - obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs) |
46 | | - obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False) |
47 | | - adata_output = adata_output[obs_index].copy() |
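| 42 | +# Random downsampling to n_obs_limit -- currently disabled (n_obs_limit defaults to None). NOTE: if re-enabled, guard against n_obs_limit being None, and seed NumPy (np.random.seed), since random.seed() does not affect np.random.choice. |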
| 43 | +#if adata_output.n_obs > par["n_obs_limit"]: |
| 44 | + #print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True) |
| 45 | + #print(f">> Setting seed to {par['seed']}", flush=True) |
| 46 | + #random.seed(par["seed"]) |
| 47 | + #obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs) |
| 48 | + #obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False) |
| 49 | + #adata_output = adata_output[obs_index].copy() |
48 | 50 |
|
49 | 51 | # remove all layers except for counts |
50 | 52 | print(">> Remove all layers except for counts", flush=True) |
|
0 commit comments