OpenClaw-Medical-Skills/skills/scrna-qc/qc_analysis.py at main · opencare-skillhub/OpenClaw-Medical-Skills · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# COPYRIGHT NOTICE
# This file is part of the "Universal Biomedical Skills" project.
# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>
# All Rights Reserved.
#
# This code is proprietary and confidential.
# Unauthorized copying of this file, via any medium is strictly prohibited.
#
# Provenance: Authenticated by MD BABU MIA

#!/usr/bin/env python3
"""
Quality Control Analysis for Single-Cell RNA-seq Data
Following scverse best practices from:
https://www.sc-best-practices.org/preprocessing_visualization/quality_control.html

This is a convenience script that runs a complete QC workflow using the
modular functions from qc_core.py and qc_plotting.py.
"""

import anndata as ad
import scanpy as sc
import sys
import os
import argparse
import json
from datetime import datetime

# Import our modular utilities
from qc_core import (
    calculate_qc_metrics,
    build_qc_masks,
    filter_cells,
    filter_genes,
    print_qc_summary
)
from qc_plotting import (
    plot_qc_distributions,
    plot_filtering_thresholds,
    plot_qc_after_filtering
)

print("=" * 80)
print("Single-Cell RNA-seq Quality Control Analysis")
print("=" * 80)

# Default parameters (single source of truth)
DEFAULT_MAD_COUNTS = 5
DEFAULT_MAD_GENES = 5
DEFAULT_MAD_MT = 3
DEFAULT_MT_THRESHOLD = 8
DEFAULT_MIN_CELLS = 20
DEFAULT_MT_PATTERN = 'mt-,MT-'
DEFAULT_RIBO_PATTERN = 'Rpl,Rps,RPL,RPS'
DEFAULT_HB_PATTERN = '^Hb[^(p)]|^HB[^(P)]'

# Parse command-line arguments
parser = argparse.ArgumentParser(
    description='Quality Control Analysis for Single-Cell RNA-seq Data',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="""
Examples:
  python3 qc_analysis.py data.h5ad
  python3 qc_analysis.py raw_feature_bc_matrix.h5
  python3 qc_analysis.py data.h5ad --mad-counts 4 --mad-genes 4 --mad-mt 2.5
  python3 qc_analysis.py data.h5ad --mt-threshold 10 --min-cells 10
  python3 qc_analysis.py data.h5ad --mt-pattern "^mt-" --ribo-pattern "^Rpl,^Rps"
    """
)

parser.add_argument('input_file', help='Input .h5ad or .h5 file (10X Genomics format)')
parser.add_argument('--output-dir', type=str, help='Output directory (default: <input_basename>_qc_results)')
parser.add_argument('--mad-counts', type=float, default=DEFAULT_MAD_COUNTS, help=f'MAD threshold for total counts (default: {DEFAULT_MAD_COUNTS})')
parser.add_argument('--mad-genes', type=float, default=DEFAULT_MAD_GENES, help=f'MAD threshold for gene counts (default: {DEFAULT_MAD_GENES})')
parser.add_argument('--mad-mt', type=float, default=DEFAULT_MAD_MT, help=f'MAD threshold for mitochondrial percentage (default: {DEFAULT_MAD_MT})')
parser.add_argument('--mt-threshold', type=float, default=DEFAULT_MT_THRESHOLD, help=f'Hard threshold for mitochondrial percentage (default: {DEFAULT_MT_THRESHOLD})')
parser.add_argument('--min-cells', type=int, default=DEFAULT_MIN_CELLS, help=f'Minimum cells for gene filtering (default: {DEFAULT_MIN_CELLS})')
parser.add_argument('--mt-pattern', type=str, default=DEFAULT_MT_PATTERN, help=f'Comma-separated mitochondrial gene prefixes (default: "{DEFAULT_MT_PATTERN}")')
parser.add_argument('--ribo-pattern', type=str, default=DEFAULT_RIBO_PATTERN, help=f'Comma-separated ribosomal gene prefixes (default: "{DEFAULT_RIBO_PATTERN}")')
parser.add_argument('--hb-pattern', type=str, default=DEFAULT_HB_PATTERN, help=f'Hemoglobin gene regex pattern (default: "{DEFAULT_HB_PATTERN}")')
parser.add_argument('--no-log1p', action='store_true', help='Disable log1p transform for MAD calculations on counts/genes')

args = parser.parse_args()

# Verify input file exists
if not os.path.exists(args.input_file):
    print(f"\nError: File '{args.input_file}' not found!")
    sys.exit(1)

input_file = args.input_file
input_is_dir = os.path.isdir(input_file)
if input_is_dir:
    base_name = os.path.basename(os.path.abspath(input_file))
else:
    base_name = os.path.splitext(os.path.basename(input_file))[0]

# Set up output directory
if args.output_dir:
    output_dir = args.output_dir
else:
    output_dir = f"{base_name}_qc_results"

os.makedirs(output_dir, exist_ok=True)
print(f"\nOutput directory: {output_dir}")

# Display parameters
print(f"\nParameters:")
print(f"  MAD thresholds: counts={args.mad_counts}, genes={args.mad_genes}, MT%={args.mad_mt}")
print(f"  MT hard threshold: {args.mt_threshold}%")
print(f"  Min cells for gene filtering: {args.min_cells}")
print(f"  Gene patterns: MT={args.mt_pattern}, Ribo={args.ribo_pattern}, HB={args.hb_pattern}")
print(f"  MAD transform (counts/genes): {'none' if args.no_log1p else 'log1p'}")

# Load the data
print("\n[1/5] Loading data...")
file_ext = os.path.splitext(input_file)[1].lower()

if input_is_dir:
    adata = sc.read_10x_mtx(input_file, var_names='gene_symbols', make_unique=True)
    print(f"Loaded 10X directory: {adata.n_obs} cells × {adata.n_vars} genes")
elif file_ext == '.h5ad':
    adata = ad.read_h5ad(input_file)
    print(f"Loaded .h5ad file: {adata.n_obs} cells × {adata.n_vars} genes")
elif file_ext == '.h5':
    adata = sc.read_10x_h5(input_file)
    print(f"Loaded 10X .h5 file: {adata.n_obs} cells × {adata.n_vars} genes")
    # Make variable names unique (10X data sometimes has duplicate gene names)
    adata.var_names_make_unique()
else:
    print(f"\nError: Unsupported file format '{file_ext}'. Expected .h5ad, .h5, or 10X directory")
    sys.exit(1)

# Store original counts for comparison
n_cells_original = adata.n_obs
n_genes_original = adata.n_vars

# Calculate QC metrics
print("\n[2/5] Calculating QC metrics...")
calculate_qc_metrics(adata, mt_pattern=args.mt_pattern,
                     ribo_pattern=args.ribo_pattern,
                     hb_pattern=args.hb_pattern,
                     inplace=True)

print(f"  Found {adata.var['mt'].sum()} mitochondrial genes (pattern: {args.mt_pattern})")
print(f"  Found {adata.var['ribo'].sum()} ribosomal genes (pattern: {args.ribo_pattern})")
print(f"  Found {adata.var['hb'].sum()} hemoglobin genes (pattern: {args.hb_pattern})")

print_qc_summary(adata, label='QC Metrics Summary (before filtering)')

# Create before-filtering visualizations
print("\n[3/5] Creating QC visualizations...")
before_plot = os.path.join(output_dir, 'qc_metrics_before_filtering.png')
plot_qc_distributions(adata, before_plot, title='Quality Control Metrics - Before Filtering')
print(f"  Saved: {before_plot}")

# Apply MAD-based filtering
print("\n[4/5] Applying MAD-based filtering thresholds...")

counts_transform = None if args.no_log1p else 'log1p'
genes_transform = None if args.no_log1p else 'log1p'
qc_masks = build_qc_masks(
    adata,
    mad_counts=args.mad_counts,
    mad_genes=args.mad_genes,
    mad_mt=args.mad_mt,
    mt_threshold=args.mt_threshold,
    counts_transform=counts_transform,
    genes_transform=genes_transform,
    mt_transform=None,
    verbose=True
)

adata.obs['outlier_counts'] = qc_masks['outlier_counts']
adata.obs['outlier_genes'] = qc_masks['outlier_genes']
adata.obs['outlier_mt'] = qc_masks['outlier_mt']
adata.obs['pass_qc'] = qc_masks['pass_qc']

print(f"\n  Total cells failing QC: {(~adata.obs['pass_qc']).sum()} ({(~adata.obs['pass_qc']).sum()/adata.n_obs*100:.2f}%)")
print(f"  Cells passing QC: {adata.obs['pass_qc'].sum()} ({adata.obs['pass_qc'].sum()/adata.n_obs*100:.2f}%)")

# Visualize filtering thresholds
outlier_masks = {
    'total_counts': adata.obs['outlier_counts'].values,
    'n_genes_by_counts': adata.obs['outlier_genes'].values,
    'pct_counts_mt': adata.obs['outlier_mt'].values
}

thresholds = {
    'total_counts': {'n_mads': args.mad_counts, 'transform': counts_transform, 'tail': 'both'},
    'n_genes_by_counts': {'n_mads': args.mad_genes, 'transform': genes_transform, 'tail': 'both'},
    'pct_counts_mt': {'n_mads': args.mad_mt, 'transform': None, 'tail': 'high', 'hard': args.mt_threshold}
}

threshold_plot = os.path.join(output_dir, 'qc_filtering_thresholds.png')
plot_filtering_thresholds(adata, outlier_masks, thresholds, threshold_plot)
print(f"\n  Saved: {threshold_plot}")

# Apply filtering
print("\n[5/5] Applying filters...")
adata_filtered = filter_cells(adata, adata.obs['pass_qc'].values, inplace=False)
print(f"  Cells after filtering: {adata_filtered.n_obs} (removed {n_cells_original - adata_filtered.n_obs})")

# Filter genes
print(f"\n  Filtering genes detected in <{args.min_cells} cells...")
n_genes_before_gene_filtering = adata_filtered.n_vars
filter_genes(adata_filtered, min_cells=args.min_cells, inplace=True)
print(f"  Genes after filtering: {adata_filtered.n_vars} (removed {n_genes_before_gene_filtering - adata_filtered.n_vars})")

# Generate summary statistics
print("\n" + "=" * 80)
print("QC Summary")
print("=" * 80)

print("\nBefore filtering:")
print(f"  Cells: {n_cells_original}")
print(f"  Genes: {n_genes_original}")

print("\nAfter filtering:")
print(f"  Cells: {adata_filtered.n_obs} ({adata_filtered.n_obs/n_cells_original*100:.1f}% retained)")
print(f"  Genes: {adata_filtered.n_vars} ({adata_filtered.n_vars/n_genes_original*100:.1f}% retained)")

print_qc_summary(adata_filtered, label='\nFiltered data QC metrics')

# Create after-filtering visualizations
after_plot = os.path.join(output_dir, 'qc_metrics_after_filtering.png')
plot_qc_after_filtering(adata_filtered, after_plot)
print(f"\n  Saved: {after_plot}")

# Save filtered data
print("\nSaving filtered data...")
output_filtered = os.path.join(output_dir, f'{base_name}_filtered.h5ad')
output_with_qc = os.path.join(output_dir, f'{base_name}_with_qc.h5ad')
adata_filtered.write(output_filtered)
print(f"  Saved: {output_filtered}")

# Also save the unfiltered data with QC annotations
adata.write(output_with_qc)
print(f"  Saved: {output_with_qc} (original data with QC annotations)")

# Write summary JSON
summary_path = os.path.join(output_dir, 'qc_summary.json')
summary = {
    'run_timestamp_utc': datetime.utcnow().isoformat() + 'Z',
    'input': {
        'path': os.path.abspath(input_file),
        'type': '10x_directory' if input_is_dir else file_ext.lstrip('.')
    },
    'parameters': {
        'mad_counts': args.mad_counts,
        'mad_genes': args.mad_genes,
        'mad_mt': args.mad_mt,
        'mt_threshold': args.mt_threshold,
        'min_cells': args.min_cells,
        'mt_pattern': args.mt_pattern,
        'ribo_pattern': args.ribo_pattern,
        'hb_pattern': args.hb_pattern,
        'counts_transform': counts_transform,
        'genes_transform': genes_transform
    },
    'counts': {
        'cells_before': int(n_cells_original),
        'cells_after': int(adata_filtered.n_obs),
        'genes_before': int(n_genes_original),
        'genes_after': int(adata_filtered.n_vars)
    },
    'filtering': {
        'outlier_counts': int(adata.obs['outlier_counts'].sum()),
        'outlier_genes': int(adata.obs['outlier_genes'].sum()),
        'outlier_mt': int(adata.obs['outlier_mt'].sum()),
        'cells_removed': int(n_cells_original - adata_filtered.n_obs),
        'retention_rate': round(adata_filtered.n_obs / n_cells_original * 100, 2)
    },
    'outputs': {
        'qc_metrics_before_filtering': os.path.basename(before_plot),
        'qc_filtering_thresholds': os.path.basename(threshold_plot),
        'qc_metrics_after_filtering': os.path.basename(after_plot),
        'filtered_h5ad': os.path.basename(output_filtered),
        'with_qc_h5ad': os.path.basename(output_with_qc),
        'summary_json': os.path.basename(summary_path)
    }
}
with open(summary_path, 'w') as handle:
    json.dump(summary, handle, indent=2, sort_keys=True)
print(f"  Saved: {summary_path}")

print("\n" + "=" * 80)
print("Quality Control Analysis Complete!")
print("=" * 80)
print(f"\nAll results saved to: {output_dir}/")
print("\nGenerated files:")
print("  1. qc_metrics_before_filtering.png - Initial QC visualizations")
print("  2. qc_filtering_thresholds.png - MAD-based threshold visualization")
print("  3. qc_metrics_after_filtering.png - Post-filtering QC visualizations")
print(f"  4. {base_name}_filtered.h5ad - Filtered dataset")
print(f"  5. {base_name}_with_qc.h5ad - Original dataset with QC annotations")
print("  6. qc_summary.json - Machine-readable QC summary")
print("\nNext steps:")
print("  - Consider ambient RNA correction (SoupX)")
print("  - Consider doublet detection (scDblFinder)")
print("  - Proceed with normalization and downstream analysis")

__AUTHOR_SIGNATURE__ = "9a7f3c2e-MD-BABU-MIA-2026-MSSM-SECURE"