Remove n_obs subsampling (#29)

wes-lewis · web-flow · commit 61919187b12a · 2025-03-19T17:49:07.000-04:00
* Update script.py to remove n_obs_limit subsampling

* Update CHANGELOG.md
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -56,6 +56,8 @@
 
 * Add `dataset_organism` to training input files (PR #24)
 
+* Remove the `n_obs_limit` default and disable random subsampling in `process_dataset` (PR #29).
+
 ## BUG FIXES
 
 * Update the nextflow workflow dependencies (PR #17).
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
@@ -10,7 +10,7 @@
     'output_test': "output/processed_datasets/test.h5ad",
     'train_frac': 0.9,
     'seed': 0,
-    'n_obs_limit': 4000
+    'n_obs_limit': None
 }
 meta = {
     "name": "process_dataset",
@@ -31,20 +31,22 @@
 # limit to max number of observations
 adata_output = adata.copy()
 
+# by default, keep only the observations from the largest batch
 if "batch" in adata.obs:
     print(f">> Subsampling observations by largest batch", flush=True)
     batch_counts = adata.obs.groupby('batch').size()
     sorted_batches = batch_counts.sort_values(ascending=False)
     selected_batch = sorted_batches.index[0]
     adata_output = adata[adata.obs["batch"]==selected_batch,:].copy()
 
-if adata_output.n_obs > par["n_obs_limit"]:
-    print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True)
-    print(f">> Setting seed to {par['seed']}", flush=True)
-    random.seed(par["seed"])
-    obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs)
-    obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
-    adata_output = adata_output[obs_index].copy()
+# random downsampling to n_obs_limit is disabled; the previous logic is kept below, commented out, for reference
+#if adata_output.n_obs > par["n_obs_limit"]:
+    #print(f">> Randomly subsampling observations to {par['n_obs_limit']}", flush=True)
+    #print(f">> Setting seed to {par['seed']}", flush=True)
+    #random.seed(par["seed"])
+    #obs_filt = np.ones(dtype=np.bool_, shape=adata_output.n_obs)
+    #obs_index = np.random.choice(np.where(obs_filt)[0], par["n_obs_limit"], replace=False)
+    #adata_output = adata_output[obs_index].copy()
 
 # remove all layers except for counts
 print(">> Remove all layers except for counts", flush=True)