make_perturbations.py
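"""Enumerate the full grid of pipeline perturbations and pre-assign memory tiers.

Builds every combination of QC, normalization, smoothing, and graph parameters
from CONFIG, gives each combination a deterministic run_id, estimates its
memory footprint, and writes perturbations.parquet plus one run_id index file
per memory tier (8/16/32 GB).
"""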
import hashlib
import json
from itertools import product

import pandas as pd

from config import CONFIG
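# The script assumes config.py exposes a CONFIG dict shaped roughly like this
# (keys taken from the product() call below; the values are illustrative only):
# CONFIG = {
#     "qc": {"gene_quantiles": [0.02, 0.05], "umi_quantiles": [0.02, 0.05]},
#     "normalization": {"methods": ["log1p", "sctransform"]},
#     "spatial": {"smoothing_steps": [0, 1, 2]},
#     "graph": {"n_neighbors": [10, 15, 20], "leiden_res": [0.5, 1.0]},
# }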

def make_run_id(params):
    """Return a short, deterministic identifier for one parameter combination."""
    s = json.dumps(params, sort_keys=True)
    return hashlib.md5(s.encode()).hexdigest()[:8]
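# Key order does not affect the id: make_run_id({"a": 1, "b": 2}) equals
# make_run_id({"b": 2, "a": 1}), since json.dumps(..., sort_keys=True)
# canonicalizes the dict before hashing. md5 is fine here because the hash
# is only a bucketing identifier, not a security boundary.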

def estimate_memory_requirement(row):
    """Heuristic per-task memory estimate (GB) from the perturbation parameters."""
    base_mem = 4  # GB baseline
    # Factor 1: smoothing (biggest memory consumer due to dense matrix ops)
    smooth_mem = row['smooth'] * 2.0
    # Factor 2: neighborhood size (affects adjacency matrix size)
    if row['n_neighbors'] >= 20:
        neighbor_mem = 2.0
    elif row['n_neighbors'] >= 15:
        neighbor_mem = 1.0
    else:
        neighbor_mem = 0.5
    # Factor 3: QC stringency (lower quantiles = more spots retained = more memory)
    avg_qc = (row['gene_q'] + row['umi_q']) / 2
    if avg_qc <= 0.05:
        qc_mem = 1.5
    elif avg_qc <= 0.075:
        qc_mem = 1.0
    else:
        qc_mem = 0.5
    total_mem = base_mem + smooth_mem + neighbor_mem + qc_mem
    # Round up to the nearest standard allocation (8, 16, or 32 GB); each
    # threshold sits 1 GB below the tier boundary to leave a safety cushion.
    if total_mem <= 7:
        return 8
    elif total_mem <= 15:
        return 16
    else:
        return 32
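# Worked example (hypothetical row): smooth=2, n_neighbors=20, and
# gene_q=umi_q=0.05 gives 4 + 4.0 + 2.0 + 1.5 = 11.5 GB, which rounds
# up to the 16 GB tier.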

# Generate all perturbation combinations from the config grids.
rows = []
for gene_q, umi_q, norm, smooth, k, res in product(
    CONFIG["qc"]["gene_quantiles"],
    CONFIG["qc"]["umi_quantiles"],
    CONFIG["normalization"]["methods"],
    CONFIG["spatial"]["smoothing_steps"],
    CONFIG["graph"]["n_neighbors"],
    CONFIG["graph"]["leiden_res"],
):
    params = dict(
        gene_q=gene_q,
        umi_q=umi_q,
        norm=norm,
        smooth=smooth,
        n_neighbors=k,
        leiden_res=res,
    )
    # Hash the parameters first, then attach the id, so run_id never hashes itself.
    params["run_id"] = make_run_id(params)
    rows.append(params)

df = pd.DataFrame(rows)
df['estimated_mem_gb'] = df.apply(estimate_memory_requirement, axis=1)
df.to_parquet("perturbations.parquet", index=False)
print(f"Wrote {len(df)} perturbations")
print("\n" + "="*70)
print("MEMORY REQUIREMENT ANALYSIS")
print("="*70)
mem_counts = df['estimated_mem_gb'].value_counts().sort_index()
print(f"\nTotal perturbations: {len(df)}")
print("\nMemory tier breakdown:")
for mem_gb, count in mem_counts.items():
    pct = 100 * count / len(df)
    print(f"  {mem_gb:2d} GB: {count:4d} tasks ({pct:5.1f}%)")
print("\nCreating memory-stratified index files...")
for mem_gb in sorted(df['estimated_mem_gb'].unique()):
    tier_df = df[df['estimated_mem_gb'] == mem_gb].reset_index(drop=True)
    output_file = f"perturbations_{mem_gb}gb_indices.txt"
    tier_df[['run_id']].to_csv(output_file, index=False, header=False)
    print(f"  Created {output_file}: {len(tier_df)} tasks")
# 32 GB tasks should be rare; this report is a safety net for spotting them.
if 32 in mem_counts.index:
    print("\n" + "-"*70)
    print("HIGH MEMORY (32 GB) TASKS - Parameter patterns:")
    print("-"*70)
    high_mem = df[df['estimated_mem_gb'] == 32]
    print("\nBy smoothing steps:")
    for smooth, count in high_mem['smooth'].value_counts().sort_index().items():
        print(f"  smooth={smooth}: {count} tasks")
    print("\nBy n_neighbors:")
    for k, count in high_mem['n_neighbors'].value_counts().sort_index().items():
        print(f"  n_neighbors={k}: {count} tasks")
print("\n" + "="*70)
print("✓ Perturbations and memory allocation complete!")
print("="*70)