-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.py
142 lines (114 loc) · 4.78 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import argparse
import scanpy as sc
import numpy as np
import subprocess
print('*****************************************')
print(' Single-Cell RNA Sequencing Analysis Tool')
print('*****************************************\n')
# Set up the argument parser
parser = argparse.ArgumentParser(description="scRNA-seq Analysis Tool")
parser.add_argument("fastq", nargs=2, help="Path to the paired-end FASTQ files")
parser.add_argument("-s", "--sample_id", required=True, help="Sample ID for Cell Ranger")
parser.add_argument("-r", "--ref_genome", required=True, help="Path to the reference genome for Cell Ranger")
parser.add_argument("-n", "--normalization", type=str, default='Yes',
choices=['Yes', 'None'],
help="Use the normalization or not")
parser.add_argument("-c", "--cluster", action='store_true',
help="Perform clustering analysis")
parser.add_argument("-d", "--dim_reduction", type=str, default='PCA',
choices=['PCA', 'tSNE', 'UMAP'],
help="Dimensionality reduction technique (PCA, tSNE, UMAP)")
parser.add_argument("-g", "--gene_list", nargs='*',
help="List of genes to include in the analysis (optional)")
parser.add_argument("-o", "--output", default='output.h5ad',
help="Output file name (e.g., a .h5ad file)")
parser.add_argument("-t", "--threads", type=int, default=1,
help="Number of threads to use for parallel processing (default: 1)")
# Parse arguments
args = parser.parse_args()
# Use args to access the arguments
fastq_files=args.fastq
sample_id=args.sample_id
ref_genome=args.ref_genome
input_file=args.input
normalization_method=args.normalization
do_clustering=args.cluster
dim_reduction_technique=args.dim_reduction
gene_list=args.gene_list
output_file=args.output
num_threads=args.threads
# Process the scRNA-seq data using Scanpy workflow
def run_fastqc(fastq_files):
print("Running FastQC...")
subprocess.run(["fastqc", *fastq_files])
def run_trim_galore(fastq_files):
print("Running Trim Galore...")
trimmed_fastq_files = []
for fastq in fastq_files:
output_fastq = fastq.replace('.fastq', '_trimmed.fastq')
subprocess.run(["trim_galore", "--illumina", "--paired", fastq, output_fastq]) # TD: --basename
trimmed_fastq_files.append(output_fastq)
return trimmed_fastq_files
def run_cellranger(trimmed_fastq_files, sample_id, ref_genome):
print("Running Cell Ranger...")
subprocess.run([
"cellranger", "count",
"--id", sample_id,
"--transcriptome", ref_genome,
"--fastqs", ",".join(trimmed_fastq_files),
"--sample", sample_id
])
def process_count_data(input_file, normalization, clustering, dim_reduction, gene_list, output_file, num_threads):
# Load the data
print("Loading data")
adata = sc.read(input_file)
# Basic Preprocessing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Normalization
if normalization != 'None':
print("Normalizing data")
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Selecting genes if provided
if gene_list:
adata = adata[:, gene_list]
# Dimensionality Reduction
print("Running PCA")
sc.tl.pca(adata, svd_solver='arpack')
if dim_reduction == 'tSNE':
print("Running tSNE")
sc.tl.tsne(adata, n_jobs=num_threads)
elif dim_reduction == 'UMAP':
print("Running UMAP")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
# Clustering
if clustering:
print("Performing clustering...")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.louvain(adata)
# Saving the anndata result
print("Saving results")
adata.write(output_file)
# Optionally, we can output plots
if dim_reduction == 'tSNE':
sc.pl.tsne(adata, save='tsne_plot.png')
elif dim_reduction == 'UMAP':
sc.pl.umap(adata, save='umap_plot.png')
# Call the functions with parsed arguments
run_fastqc(fastq_files=args.fastq)
trimmed_fastq = run_trim_galore(fastq_files=args.fastq)
run_cellranger(trimmed_fastq,
sample_id=args.sample_id,
ref_genome=args.ref_genome)
input_file_path=args.sample_id + "/outs"
process_count_data(input_file=input_file_path,
normalization=args.normalization,
clustering=args.cluster,
dim_reduction=args.dim_reduction,
gene_list=args.gene_list,
output_file=args.output,
num_threads=args.threads)
# In the shell
# python pipeline.py PBMC_R1.fastq PBMC_R2.fastq -s PBMC_count -r ../data/refdata-gex-GRCh38-2020-A -n Yes -d UMAP -o output.h5ad -t 30