Skip to content

Commit 32ed47b

Browse files
Merge pull request #6 from Multiomics-Analytics-Group/new_data
Inference mode for scripts, add argument parsing to scripts
2 parents 8b79a96 + a4736a5 commit 32ed47b

File tree

6 files changed

+173
-111
lines changed

6 files changed

+173
-111
lines changed

notebooks/heatmaps_gridsearch.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
}
1919
],
2020
"source": [
21-
"r\"\"\" Heatmaps notebook.\n",
22-
" _____ _______ _ _ \n",
21+
"r\"\"\"Heatmaps notebook.\n",
22+
" _____ _______ _ _\n",
2323
"| __ \\|__ __|| | | |\n",
2424
"| | | | | | | | | |\n",
2525
"| | | | | | | | | |\n",

notebooks/prot_optimization_dbg.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
"metadata": {},
88
"outputs": [],
99
"source": [
10-
"r\"\"\" Protease optimization\n",
11-
" _____ _______ _ _ \n",
10+
"r\"\"\"Protease optimization\n",
11+
" _____ _______ _ _\n",
1212
"| __ \\|__ __|| | | |\n",
1313
"| | | | | | | | | |\n",
1414
"| | | | | | | | | |\n",

notebooks/prot_optimization_greedy.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
"metadata": {},
88
"outputs": [],
99
"source": [
10-
"r\"\"\" Protease optimization\n",
11-
" _____ _______ _ _ \n",
10+
"r\"\"\"Protease optimization\n",
11+
" _____ _______ _ _\n",
1212
"| __ \\|__ __|| | | |\n",
1313
"| | | | | | | | | |\n",
1414
"| | | | | | | | | |\n",

src/preprocessing.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def normalize_sequence(sequence):
8484
def remove_modifications(psm_column):
8585
"""
8686
Remove any content within parentheses, including the parentheses, from a given string.
87+
Remove UNIMOD modifications and normalize I to L.
8788
8889
Parameters:
8990
- psm_column (str): The string containing modifications in parentheses (e.g., "A(ox)BC(mod)D"). If the value is null, it returns None.
@@ -93,12 +94,31 @@ def remove_modifications(psm_column):
9394
"""
9495

9596
if pd.notnull(psm_column):
96-
return re.sub(
97+
ret = re.sub(
9798
r"\(.*?\)", "", psm_column
9899
) # Replace any content in parentheses with an empty string
100+
ret = re.sub(
101+
r"\[.*?\]", "", ret
102+
) # replace UNIMOD modifications in square brackets
103+
ret = normalize_sequence(ret)
104+
return ret
99105
return None
100106

101107

108+
# ! needs to move once it is a package
109+
def test_remove_modifications():
110+
assert remove_modifications("A(ox)BC(mod)D") == "ABCD"
111+
assert remove_modifications("A[UNIMOD:21]BC[UNIMOD:35]D") == "ABCD"
112+
assert remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD"
113+
assert remove_modifications(None) is None
114+
assert remove_modifications("ACD") == "ACD"
115+
assert remove_modifications("A(I)BCD") == "ABCD"
116+
assert remove_modifications("A(ox)B(I)C(mod)D") == "ABCD"
117+
assert remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD"
118+
assert remove_modifications("AI BCD") == "AL BCD"
119+
assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"
120+
121+
102122
def clean_dataframe(df):
103123
"""
104124
Clean and preprocess a DataFrame for analysis by removing '(ox)' substrings from sequences in the 'seq' column.

src/script_dbg.py

Lines changed: 73 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# !pip install kaleido # to export plotly figures as png
2121
# !pip install --upgrade nbformat # to avoid plotly error
2222

23+
import argparse
2324
import json
2425
import logging
2526
import os
@@ -29,7 +30,6 @@
2930
import pandas as pd
3031

3132
# import libraries
32-
3333
import alignment as align
3434
import clustering as clus
3535
import compute_statistics as comp_stat
@@ -40,16 +40,25 @@
4040
import mapping as map
4141
import preprocessing as prep
4242

43+
repo_folder = Path(__file__).resolve().parents[1]
44+
45+
parser = argparse.ArgumentParser(description="Protein Assembly Script")
46+
parser.add_argument("--input_csv", type=str, help="Input file")
47+
parser.add_argument(
48+
"--folder_outputs", default="outputs", type=str, help="Outputs folder"
49+
)
50+
parser.add_argument("--training", action="store_true", help="Training mode")
51+
4352
logging.basicConfig(
4453
level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
4554
)
4655
logger = logging.getLogger(__name__)
4756

4857
BASE_DIR = Path(__file__).resolve().parents[1]
4958
JSON_DIR = BASE_DIR / "json"
50-
INPUT_DIR = BASE_DIR / "inputs"
51-
FASTA_DIR = BASE_DIR / "fasta"
52-
OUTPUTS_DIR = BASE_DIR / "outputs"
59+
# INPUT_DIR = BASE_DIR / "inputs"
60+
# FASTA_DIR = BASE_DIR / "fasta"
61+
# OUTPUTS_DIR = BASE_DIR / "outputs"
5362

5463

5564
def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
@@ -68,17 +77,19 @@ def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.jso
6877
raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")
6978

7079

71-
def main():
80+
def main(input_csv: str, folder_outputs: str = "outputs", training: bool = False):
7281
"""Main function to run the assembly script."""
7382

74-
logger.info("Starting protein assembly pipeline.")
83+
input_csv = Path(input_csv)
7584

76-
run = "ma3"
85+
logger.info("Starting protein assembly pipeline.")
7786

78-
meta = get_sample_metadata(run, chain="light")
79-
protein = meta["protein"]
80-
chain = meta["chain"]
81-
proteases = meta["proteases"]
87+
run = input_csv.stem
88+
if training:
89+
meta = get_sample_metadata(run, chain="light")
90+
protein = meta["protein"]
91+
# chain = meta["chain"]
92+
proteases = meta["proteases"]
8293

8394
ass_method = "dbg"
8495

@@ -91,34 +102,37 @@ def main():
91102

92103
logger.info("Parameters loaded.")
93104

94-
folder_outputs = f"../outputs/{run}{chain}"
95-
prep.create_directory(folder_outputs)
96-
combination_folder_out = os.path.join(
97-
folder_outputs,
98-
f"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}_mi{min_identity}_mm{max_mismatches}",
105+
folder_outputs = Path(folder_outputs) / run
106+
folder_outputs.mkdir(parents=True, exist_ok=True)
107+
108+
combination_folder_out = (
109+
folder_outputs
110+
/ f"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}_mi{min_identity}_mm{max_mismatches}"
99111
)
100112
prep.create_subdirectories_outputs(combination_folder_out)
101113

102114
logger.info(f"Output folders created at: {combination_folder_out}")
103115

104116
# Data cleaning
105117
logger.info("Starting data cleaning...")
106-
107-
protein_norm = prep.normalize_sequence(protein)
108-
df = pd.read_csv(f"../inputs/{run}.csv")
109-
df["protease"] = df["experiment_name"].apply(
110-
lambda name: prep.extract_protease(name, proteases)
111-
)
118+
if training:
119+
protein_norm = prep.normalize_sequence(protein)
120+
df = pd.read_csv(input_csv)
121+
if training:
122+
df["protease"] = df["experiment_name"].apply(
123+
lambda name: prep.extract_protease(name, proteases)
124+
)
112125
df = prep.clean_dataframe(df)
113126
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)
114127
cleaned_psms = df["cleaned_preds"].tolist()
115128
filtered_psms = prep.filter_contaminants(
116-
cleaned_psms, run, "../fasta/contaminants.fasta"
129+
cleaned_psms, run, repo_folder / "fasta/contaminants.fasta"
117130
)
118131
df = df[df["cleaned_preds"].isin(filtered_psms)]
119-
df["mapped"] = df["cleaned_preds"].apply(
120-
lambda x: "True" if x in protein_norm else "False"
121-
)
132+
if training:
133+
df["mapped"] = df["cleaned_preds"].apply(
134+
lambda x: "True" if x in protein_norm else "False"
135+
)
122136
df = df[df["conf"] > conf]
123137
df.reset_index(drop=True, inplace=True)
124138
final_psms = df["cleaned_preds"].tolist()
@@ -145,16 +159,17 @@ def main():
145159
f"{combination_folder_out}/contigs/{ass_method}_contig_{conf}_{run}.fasta",
146160
"fasta",
147161
)
148-
mapped_contigs = map.process_protein_contigs_scaffold(
149-
assembled_contigs, protein_norm, max_mismatches, min_identity
150-
)
151-
df_contigs = map.create_dataframe_from_mapped_sequences(data=mapped_contigs)
152-
comp_stat.compute_assembly_statistics(
153-
df=df_contigs,
154-
sequence_type="contigs",
155-
output_folder=f"{combination_folder_out}/statistics",
156-
reference=protein_norm,
157-
)
162+
if training:
163+
mapped_contigs = map.process_protein_contigs_scaffold(
164+
assembled_contigs, protein_norm, max_mismatches, min_identity
165+
)
166+
df_contigs = map.create_dataframe_from_mapped_sequences(data=mapped_contigs)
167+
comp_stat.compute_assembly_statistics(
168+
df=df_contigs,
169+
sequence_type="contigs",
170+
output_folder=f"{combination_folder_out}/statistics",
171+
reference=protein_norm,
172+
)
158173
assembled_scaffolds = dbg.create_scaffolds(assembled_contigs, min_overlap)
159174
assembled_scaffolds = list(set(assembled_scaffolds))
160175
assembled_scaffolds = sorted(assembled_scaffolds, key=len, reverse=True)
@@ -180,22 +195,23 @@ def main():
180195
f"{combination_folder_out}/scaffolds/{ass_method}_scaffold_{conf}_{run}.fasta",
181196
"fasta",
182197
)
183-
mapped_scaffolds = map.process_protein_contigs_scaffold(
184-
assembled_contigs=assembled_scaffolds,
185-
target_protein=protein_norm,
186-
max_mismatches=max_mismatches,
187-
min_identity=min_identity,
188-
)
198+
if training:
199+
mapped_scaffolds = map.process_protein_contigs_scaffold(
200+
assembled_contigs=assembled_scaffolds,
201+
target_protein=protein_norm,
202+
max_mismatches=max_mismatches,
203+
min_identity=min_identity,
204+
)
189205

190-
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(
191-
data=mapped_scaffolds
192-
)
193-
comp_stat.compute_assembly_statistics(
194-
df=df_scaffolds_mapped,
195-
sequence_type="scaffolds",
196-
output_folder=f"{combination_folder_out}/statistics",
197-
reference=protein_norm,
198-
)
206+
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(
207+
data=mapped_scaffolds
208+
)
209+
comp_stat.compute_assembly_statistics(
210+
df=df_scaffolds_mapped,
211+
sequence_type="scaffolds",
212+
output_folder=f"{combination_folder_out}/statistics",
213+
reference=protein_norm,
214+
)
199215

200216
# Clustering
201217
scaffolds_folder_out = f"{combination_folder_out}/scaffolds"
@@ -241,4 +257,9 @@ def main():
241257

242258

243259
if __name__ == "__main__":
244-
main()
260+
args = parser.parse_args()
261+
main(
262+
input_csv=args.input_csv,
263+
folder_outputs=args.folder_outputs,
264+
training=args.training,
265+
)

0 commit comments

Comments
 (0)