diff --git a/benchmarks/batch-quartet-multilab/scripts/comprehensive_analysis.py b/benchmarks/batch-quartet-multilab/scripts/comprehensive_analysis.py index ab88435..a952b83 100644 --- a/benchmarks/batch-quartet-multilab/scripts/comprehensive_analysis.py +++ b/benchmarks/batch-quartet-multilab/scripts/comprehensive_analysis.py @@ -11,7 +11,6 @@ 6. Critical assessment of mokume algorithms """ -import sys from pathlib import Path from itertools import combinations @@ -19,9 +18,6 @@ import pandas as pd import matplotlib.pyplot as plt from scipy import stats -from scipy.cluster.hierarchy import linkage, dendrogram -from scipy.spatial.distance import pdist -import seaborn as sns # Paths BENCHMARK_DIR = Path(__file__).parent.parent @@ -173,7 +169,7 @@ def analyze_missing_values(meta, data): # CRITIQUE: Flag high missing value variance as a problem if missing_variance > 100: - print(f" WARNING: High variance in missing values across batches suggests batch-specific quantification issues") + print(" WARNING: High variance in missing values across batches suggests batch-specific quantification issues") return pd.DataFrame(results) @@ -339,9 +335,9 @@ def analyze_de_consistency(meta, data): # CRITIQUE if mean_lfc_corr < 0.7: - print(f" WARNING: Low LFC correlation indicates quantification instability across labs") + print(" WARNING: Low LFC correlation indicates quantification instability across labs") if mean_concordance < 0.8 and concordances: - print(f" WARNING: Low DE concordance - same experiment may give different conclusions in different labs") + print(" WARNING: Low DE concordance - same experiment may give different conclusions in different labs") return pd.DataFrame(results) @@ -405,7 +401,7 @@ def analyze_batch_effect_magnitude(meta, data): bio_var = matrix_complete[stype_cols].var(axis=1).mean() biological_vars.append(bio_var) - biological_var = np.mean(biological_vars) if biological_vars else 0 + _ = np.mean(biological_vars) if biological_vars else 0 # not used yet # Batch effect proportion batch_effect_pct = (between_batch_var / total_variance * 100) if total_variance > 0 else 0 @@ -631,7 +627,7 @@ def generate_summary_plots(meta, data, coverage_df, missing_df, corr_df, de_df, # Calculate composite score (lower is better) scores = {} for method in coverage_df["Method"].tolist(): - method_lower = method.lower() + _ = method.lower() # reserved for future use # Get metrics core_pct = coverage_df[coverage_df["Method"] == method]["Core_Pct"].values[0] diff --git a/benchmarks/batch-quartet-multilab/scripts/plot_results.py b/benchmarks/batch-quartet-multilab/scripts/plot_results.py index d18784e..26de3d6 100644 --- a/benchmarks/batch-quartet-multilab/scripts/plot_results.py +++ b/benchmarks/batch-quartet-multilab/scripts/plot_results.py @@ -3,7 +3,6 @@ Generate visualizations for the batch effect correction benchmark. """ -import sys from pathlib import Path import numpy as np diff --git a/benchmarks/batch-quartet-multilab/scripts/run_benchmark.py b/benchmarks/batch-quartet-multilab/scripts/run_benchmark.py index 302745c..0648428 100644 --- a/benchmarks/batch-quartet-multilab/scripts/run_benchmark.py +++ b/benchmarks/batch-quartet-multilab/scripts/run_benchmark.py @@ -10,15 +10,13 @@ with the paper's results using the Quartet multi-lab dataset. """ +import logging import sys import time import warnings from pathlib import Path -from collections import defaultdict - import numpy as np import pandas as pd -from scipy import stats from sklearn.decomposition import PCA warnings.filterwarnings("ignore") @@ -28,7 +26,6 @@ MOKUME_ROOT = BENCHMARK_DIR.parent.parent sys.path.insert(0, str(MOKUME_ROOT)) -from mokume.quantification import MaxLFQQuantification, TopNQuantification from mokume.quantification.directlfq import is_directlfq_available, DirectLFQQuantification from mokume.postprocessing import is_batch_correction_available, apply_batch_correction @@ -122,8 +119,6 @@ def run_maxlfq_quantification(peptide_df: pd.DataFrame) -> pd.DataFrame: sample_cols = [c for c in wide_df.columns if c not in ["peptide", "protein"]] # Run MaxLFQ per protein - maxlfq = MaxLFQQuantification(min_peptides=1) - proteins = wide_df["protein"].unique() results = [] @@ -150,7 +145,8 @@ def run_maxlfq_quantification(peptide_df: pd.DataFrame) -> pd.DataFrame: "run_id": sample, "intensity": protein_intensities[i] }) - except Exception: + except Exception as exc: + logging.warning("MaxLFQ failed for protein %s: %s", protein, exc) continue return pd.DataFrame(results) diff --git a/benchmarks/quant-hela-method-comparison/scripts/01_download_data.py b/benchmarks/quant-hela-method-comparison/scripts/01_download_data.py index ed2ac28..5aad608 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/01_download_data.py +++ b/benchmarks/quant-hela-method-comparison/scripts/01_download_data.py @@ -5,12 +5,15 @@ Download datasets from PRIDE ibaqpy-research FTP for benchmarking protein quantification methods. """ +import shutil import urllib.request import urllib.error +import urllib.parse import sys from pathlib import Path -from typing import Optional, List +from typing import List +_opener = urllib.request.build_opener() from config import ( ALL_DATASETS, HELA_DATASETS, @@ -46,7 +49,10 @@ def download_file(url: str, dest: Path, verbose: bool = True) -> bool: print(f" Downloading: {url}") try: - urllib.request.urlretrieve(url, dest) + if urllib.parse.urlparse(url).scheme not in ("http", "https"): + raise ValueError(f"URL scheme not allowed: {url}") + with _opener.open(url) as response, open(dest, "wb") as out_file: + shutil.copyfileobj(response, out_file) if verbose: size_mb = dest.stat().st_size / (1024 * 1024) print(f" Saved to: {dest.name} ({size_mb:.1f} MB)") @@ -65,10 +71,12 @@ def download_file(url: str, dest: Path, verbose: bool = True) -> bool: def check_url_exists(url: str) -> bool: """Check if a URL exists without downloading.""" try: + if urllib.parse.urlparse(url).scheme not in ("http", "https"): + raise ValueError(f"URL scheme not allowed: {url}") request = urllib.request.Request(url, method='HEAD') - urllib.request.urlopen(request, timeout=10) + _opener.open(request, timeout=10) return True - except: + except Exception: return False @@ -77,8 +85,10 @@ def list_available_files(base_url: str) -> List[str]: List files available at a URL (works for directory listings). """ try: - response = urllib.request.urlopen(base_url, timeout=30) - content = response.read().decode('utf-8') + if urllib.parse.urlparse(base_url).scheme not in ("http", "https"): + raise ValueError(f"URL scheme not allowed: {base_url}") + with _opener.open(base_url, timeout=30) as response: + content = response.read().decode('utf-8') import re links = re.findall(r'href="([^"]+)"', content) return [l for l in links if not l.startswith('/') and not l.startswith('?')] diff --git a/benchmarks/quant-hela-method-comparison/scripts/02_prepare_peptides.py b/benchmarks/quant-hela-method-comparison/scripts/02_prepare_peptides.py index 13eb2cd..1e231b2 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/02_prepare_peptides.py +++ b/benchmarks/quant-hela-method-comparison/scripts/02_prepare_peptides.py @@ -7,8 +7,7 @@ """ import sys -from pathlib import Path -from typing import Optional, Dict, List, Tuple +from typing import Optional import pandas as pd import numpy as np @@ -18,7 +17,6 @@ HELA_DATASETS, RAW_DATA_DIR, PROCESSED_DIR, - COLUMN_MAPPING, QUANTMS_PARQUET_MAPPING, DatasetInfo, ) @@ -156,9 +154,6 @@ def normalize_columns(df: pd.DataFrame, data_format: str) -> pd.DataFrame: if "NormIntensity" not in df.columns and "Intensity" in df.columns: df["NormIntensity"] = df["Intensity"] - # Target columns we need - targets = ["ProteinName", "PeptideSequence", "SampleID", "NormIntensity"] - # Build a reverse mapping: target -> list of possible source names # Use QUANTMS_PARQUET_MAPPING for all formats (ibaqpy-research uses this) source_mapping = QUANTMS_PARQUET_MAPPING @@ -360,7 +355,6 @@ def load_raw_data(dataset: DatasetInfo) -> Optional[pd.DataFrame]: return None # Try alternative extensions - base_name = feature_path.stem.replace("_feature", "") for ext in [".parquet", ".csv", ".tsv"]: alt_path = RAW_DATA_DIR / f"{dataset.project_id}_feature{ext}" if alt_path.exists(): diff --git a/benchmarks/quant-hela-method-comparison/scripts/03_run_quantification.py b/benchmarks/quant-hela-method-comparison/scripts/03_run_quantification.py index 6301287..8a889e5 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/03_run_quantification.py +++ b/benchmarks/quant-hela-method-comparison/scripts/03_run_quantification.py @@ -12,7 +12,6 @@ from typing import Optional, Dict, List import pandas as pd -import numpy as np # Add parent directory to path for mokume imports sys.path.insert(0, str(Path(__file__).parent.parent)) @@ -20,7 +19,6 @@ from config import ( ALL_DATASETS, HELA_DATASETS, - PROCESSED_DIR, PROTEIN_QUANT_DIR, FASTA_FILE, QUANTIFICATION_METHODS, @@ -93,7 +91,7 @@ def run_ibaq( result = pd.read_csv(output_file, sep="\t") return result else: - print(f" ERROR: iBAQ output file not generated") + print(" ERROR: iBAQ output file not generated") return None except Exception as e: @@ -336,7 +334,7 @@ def quantify_all_datasets( # Check FASTA if "ibaq" in methods and not FASTA_FILE.exists(): - print(f"\nWARNING: FASTA file not found. iBAQ will be skipped.") + print("\nWARNING: FASTA file not found. iBAQ will be skipped.") all_results = {} diff --git a/benchmarks/quant-hela-method-comparison/scripts/04_compute_metrics.py b/benchmarks/quant-hela-method-comparison/scripts/04_compute_metrics.py index 2e81e15..75e2214 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/04_compute_metrics.py +++ b/benchmarks/quant-hela-method-comparison/scripts/04_compute_metrics.py @@ -11,7 +11,7 @@ import sys from pathlib import Path -from typing import Optional, Dict, List, Tuple +from typing import Optional, Dict, List import pandas as pd import numpy as np @@ -469,22 +469,22 @@ def save_results(results: dict, output_dir: Path = None): if results["cv_summary"]: cv_df = pd.DataFrame(results["cv_summary"]) cv_df.to_csv(output_dir / "cv_comparison.csv", index=False) - print(f" Saved: cv_comparison.csv") + print(" Saved: cv_comparison.csv") # Cross-experiment correlation for method, corr_matrix in results["cross_experiment_corr"].items(): corr_matrix.to_csv(output_dir / f"cross_experiment_corr_{method}.csv") - print(f" Saved: cross_experiment_corr_*.csv") + print(" Saved: cross_experiment_corr_*.csv") # Rank consistency for method, rank_matrix in results["rank_consistency"].items(): rank_matrix.to_csv(output_dir / f"rank_consistency_{method}.csv") - print(f" Saved: rank_consistency_*.csv") + print(" Saved: rank_consistency_*.csv") # Expression stability for method, stability in results["expression_stability"].items(): stability.to_csv(output_dir / f"expression_stability_{method}.csv", index=False) - print(f" Saved: expression_stability_*.csv") + print(" Saved: expression_stability_*.csv") # TMT vs LFQ correlation if results.get("tmt_lfq"): @@ -508,7 +508,7 @@ def save_results(results: dict, output_dir: Path = None): if tmt_lfq_summary: pd.DataFrame(tmt_lfq_summary).to_csv(output_dir / "tmt_lfq_comparison.csv", index=False) - print(f" Saved: tmt_lfq_comparison.csv, tmt_lfq_values_*.csv") + print(" Saved: tmt_lfq_comparison.csv, tmt_lfq_values_*.csv") # Summary metrics summary = [] @@ -538,7 +538,7 @@ def save_results(results: dict, output_dir: Path = None): summary_df = pd.DataFrame(summary) summary_df.to_csv(output_dir / "summary_metrics.csv", index=False) - print(f" Saved: summary_metrics.csv") + print(" Saved: summary_metrics.csv") def main(): diff --git a/benchmarks/quant-hela-method-comparison/scripts/05_generate_plots.py b/benchmarks/quant-hela-method-comparison/scripts/05_generate_plots.py index bbf94e1..865425b 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/05_generate_plots.py +++ b/benchmarks/quant-hela-method-comparison/scripts/05_generate_plots.py @@ -13,25 +13,20 @@ import sys from pathlib import Path -from typing import Optional, Dict, List - import pandas as pd import numpy as np import matplotlib.pyplot as plt -import matplotlib.colors as mcolors import seaborn as sns sys.path.insert(0, str(Path(__file__).parent.parent)) from config import ( ALL_DATASETS, - HELA_DATASETS, ANALYSIS_DIR, PLOTS_DIR, PROTEIN_QUANT_DIR, QUANTIFICATION_METHODS, - PROTEINS_OF_INTEREST, ) # Set style diff --git a/benchmarks/quant-hela-method-comparison/scripts/06_method_correlation_plots.py b/benchmarks/quant-hela-method-comparison/scripts/06_method_correlation_plots.py index 4f8c65f..6bd3a6c 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/06_method_correlation_plots.py +++ b/benchmarks/quant-hela-method-comparison/scripts/06_method_correlation_plots.py @@ -8,23 +8,20 @@ import sys from pathlib import Path -from typing import Optional, Dict, List, Tuple +from typing import Optional, Dict, List import pandas as pd import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import LogNorm -from scipy import stats from scipy.stats import pearsonr, spearmanr sys.path.insert(0, str(Path(__file__).parent)) from config import ( ALL_DATASETS, - HELA_DATASETS, PROTEIN_QUANT_DIR, PLOTS_DIR, - QUANTIFICATION_METHODS, ) @@ -137,7 +134,7 @@ def plot_density_scatter( norm=LogNorm(), alpha=0.9) # Add colorbar - cb = plt.colorbar(hb, ax=ax, label='n_neighbors') + plt.colorbar(hb, ax=ax, label='n_neighbors') # Add diagonal line (y = x) lims = [ @@ -513,7 +510,7 @@ def main(): if all_results: combined = pd.concat(all_results, ignore_index=True) combined.to_csv(output_dir / "method_correlations.csv", index=False) - print(f"\n Saved: method_correlations.csv") + print("\n Saved: method_correlations.csv") print("\n" + "=" * 70) print(f"Done! Plots saved to: {output_dir}") diff --git a/benchmarks/quant-hela-method-comparison/scripts/config.py b/benchmarks/quant-hela-method-comparison/scripts/config.py index a56befd..b30a6e9 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/config.py +++ b/benchmarks/quant-hela-method-comparison/scripts/config.py @@ -6,8 +6,8 @@ """ from pathlib import Path -from dataclasses import dataclass, field -from typing import List, Dict, Optional +from dataclasses import dataclass +from typing import Dict # Base directories BENCHMARK_DIR = Path(__file__).parent diff --git a/benchmarks/quant-hela-method-comparison/scripts/run_benchmark.py b/benchmarks/quant-hela-method-comparison/scripts/run_benchmark.py index e5dcf33..4d06a74 100644 --- a/benchmarks/quant-hela-method-comparison/scripts/run_benchmark.py +++ b/benchmarks/quant-hela-method-comparison/scripts/run_benchmark.py @@ -15,7 +15,6 @@ python run_benchmark.py --phase 3 --stop 3 # Run only phase 3 """ -import sys import argparse from pathlib import Path diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_download_data.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_download_data.py index ba3412e..19fe52f 100755 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_download_data.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_download_data.py @@ -9,9 +9,12 @@ """ import urllib.request +import urllib.parse import sys from pathlib import Path +_opener = urllib.request.build_opener() + # Add parent to path for config import sys.path.insert(0, str(Path(__file__).parent)) from config import DATA_DIR, DATA_URLS, FASTA_URL @@ -27,15 +30,24 @@ def download_file(url: str, dest: Path, description: str = "") -> bool: print(f" URL: {url}") try: - def report_progress(block_num, block_size, total_size): - downloaded = block_num * block_size - if total_size > 0: - percent = min(100, downloaded * 100 / total_size) - mb_downloaded = downloaded / (1024 * 1024) - mb_total = total_size / (1024 * 1024) - print(f"\r Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="") - - urllib.request.urlretrieve(url, dest, reporthook=report_progress) + if urllib.parse.urlparse(url).scheme not in ("http", "https"): + raise ValueError(f"URL scheme not allowed: {url}") + with _opener.open(url) as response: + total_size = int(response.headers.get("Content-Length", 0)) + downloaded = 0 + block_size = 8192 + with open(dest, "wb") as out_file: + while True: + block = response.read(block_size) + if not block: + break + out_file.write(block) + downloaded += len(block) + if total_size > 0: + percent = min(100, downloaded * 100 / total_size) + mb_downloaded = downloaded / (1024 * 1024) + mb_total = total_size / (1024 * 1024) + print(f"\r Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)", end="") print() # newline after progress print(f" Saved to: {dest}") return True diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_generate_ibaq.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_generate_ibaq.py index 564add4..2f2aa0f 100644 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_generate_ibaq.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/00_generate_ibaq.py @@ -11,7 +11,9 @@ import sys from pathlib import Path +import shutil import urllib.request +import urllib.parse import warnings import pandas as pd @@ -22,7 +24,6 @@ from config import ( DATA_DIR, LOCAL_DATA_DIR, LOCAL_QUANTIFIED_DIR, FASTA_URL, FASTA_PATH, - SAMPLE_CONDITIONS, ) # Add mokume to path @@ -30,6 +31,8 @@ warnings.filterwarnings("ignore") +_opener = urllib.request.build_opener() + def download_fasta(): """Download FASTA file if not present.""" @@ -41,7 +44,10 @@ def download_fasta(): DATA_DIR.mkdir(parents=True, exist_ok=True) try: - urllib.request.urlretrieve(FASTA_URL, FASTA_PATH) + if urllib.parse.urlparse(FASTA_URL).scheme not in ("http", "https"): + raise ValueError(f"URL scheme not allowed: {FASTA_URL}") + with _opener.open(FASTA_URL) as response, open(FASTA_PATH, "wb") as out_file: + shutil.copyfileobj(response, out_file) print(f"Downloaded to: {FASTA_PATH}") return True except Exception as e: @@ -95,7 +101,7 @@ def compute_ibaq(technology: str) -> pd.DataFrame: print(f" Found {len(protein_accessions)} unique protein accessions") # Get theoretical peptide counts from FASTA - print(f" Computing theoretical peptides from FASTA...") + print(" Computing theoretical peptides from FASTA...") try: peptide_counts, mw_dict, found_proteins = extract_fasta( fasta=str(FASTA_PATH), diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/01_grid_search_methods.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/01_grid_search_methods.py index 4a7c1c9..8f7ddc4 100755 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/01_grid_search_methods.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/01_grid_search_methods.py @@ -246,7 +246,7 @@ def run_grid_search(technology: str) -> pd.DataFrame: break if intensity_col is None: - print(f" Skipped (no intensity column found)") + print(" Skipped (no intensity column found)") continue protein_col = "ProteinName" if "ProteinName" in protein_df.columns else "protein_accessions" diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/02_variance_decomposition.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/02_variance_decomposition.py index 8770ac7..d070f94 100755 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/02_variance_decomposition.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/02_variance_decomposition.py @@ -9,6 +9,7 @@ Uses PCA and ANOVA-style variance decomposition. """ +import logging import sys from pathlib import Path import warnings @@ -25,7 +26,7 @@ from config import ( RESULTS_DIR, FIGURES_DIR, SAMPLE_CONDITIONS, CONDITION_COLORS, - BENCHMARKS_LOCAL_DIR, LOCAL_QUANTIFIED_DIR, + LOCAL_QUANTIFIED_DIR, FIGURE_DPI, FIGURE_FORMAT, ) @@ -157,8 +158,8 @@ def run_pca_analysis(df_wide: pd.DataFrame) -> dict: labels = pd.factorize(valid_df["condition"])[0] coords = valid_df[["PC1", "PC2"]].values silhouette = silhouette_score(coords, labels) - except Exception: - pass + except Exception as exc: + logging.warning("Silhouette score computation failed: %s", exc) return { "pca_df": pca_df, diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/03_fold_change_accuracy.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/03_fold_change_accuracy.py index 6ecf412..8d263eb 100755 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/03_fold_change_accuracy.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/03_fold_change_accuracy.py @@ -20,7 +20,6 @@ import pandas as pd import numpy as np -from scipy import stats import matplotlib.pyplot as plt # Add parent to path for config import @@ -29,7 +28,7 @@ RESULTS_DIR, FIGURES_DIR, SAMPLE_CONDITIONS, EXPECTED_FOLD_CHANGES, SPECIES_PATTERNS, CONDITION_COLORS, TECHNOLOGY_COLORS, - BENCHMARKS_LOCAL_DIR, LOCAL_QUANTIFIED_DIR, LOCAL_DATA_DIR, + LOCAL_QUANTIFIED_DIR, LOCAL_DATA_DIR, FIGURE_DPI, FIGURE_FORMAT, ) @@ -257,7 +256,7 @@ def plot_fold_change_accuracy(fc_df: pd.DataFrame, technology: str, output_path: patch.set_facecolor(color) patch.set_alpha(0.6) - ax.axhline(expected_yeast, color="red", linestyle="--", label=f"Expected yeast") + ax.axhline(expected_yeast, color="red", linestyle="--", label="Expected yeast") ax.axhline(0, color="green", linestyle="--", label="Expected human") ax.set_ylabel("Observed log2(Fold-Change)") @@ -345,7 +344,7 @@ def main(): all_fc_data[technology] = fc_df # Species breakdown - print(f" Species in results:") + print(" Species in results:") for species, count in fc_df["species"].value_counts().items(): print(f" {species}: {count}") @@ -423,7 +422,7 @@ def main(): if human_key in metrics: m = metrics[human_key] print(f"\n{tech.upper()} - Human proteins (negative control):") - print(f" Expected log2FC: 0.0") + print(" Expected log2FC: 0.0") print(f" Observed median log2FC: {m['observed_median_log2_fc']:.2f}") print(f" False positive rate (|log2FC| > 1): {m['false_positive_rate']:.1%}") diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/04_stability_metrics.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/04_stability_metrics.py index 5e81120..afd988f 100755 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/04_stability_metrics.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/04_stability_metrics.py @@ -22,7 +22,7 @@ sys.path.insert(0, str(Path(__file__).parent)) from config import ( RESULTS_DIR, FIGURES_DIR, - SAMPLE_CONDITIONS, CONDITION_COLORS, METHOD_COLORS, + SAMPLE_CONDITIONS, CONDITION_COLORS, CV_THRESHOLDS, LOCAL_QUANTIFIED_DIR, FIGURE_DPI, FIGURE_FORMAT, ) diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/05_cross_technology_correlation.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/05_cross_technology_correlation.py index f45e21c..4618cab 100755 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/05_cross_technology_correlation.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/05_cross_technology_correlation.py @@ -24,7 +24,7 @@ sys.path.insert(0, str(Path(__file__).parent)) from config import ( RESULTS_DIR, FIGURES_DIR, - SAMPLE_CONDITIONS, CONDITION_COLORS, TECHNOLOGY_COLORS, + SAMPLE_CONDITIONS, CONDITION_COLORS, LOCAL_QUANTIFIED_DIR, FIGURE_DPI, FIGURE_FORMAT, ) diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/07_tmm_normalization_benchmark.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/07_tmm_normalization_benchmark.py index b752510..4c17967 100644 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/07_tmm_normalization_benchmark.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/07_tmm_normalization_benchmark.py @@ -31,7 +31,7 @@ # Add mokume to path sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) -from mokume.normalization.tmm import TMMNormalizer, tmm_normalize +from mokume.normalization.tmm import TMMNormalizer warnings.filterwarnings("ignore") diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/08_normalization_grid.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/08_normalization_grid.py index 498979b..fd46774 100644 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/08_normalization_grid.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/08_normalization_grid.py @@ -26,7 +26,7 @@ sys.path.insert(0, str(Path(__file__).parent)) from config import ( RESULTS_DIR, FIGURES_DIR, - SAMPLE_CONDITIONS, CONDITION_COLORS, + SAMPLE_CONDITIONS, CV_THRESHOLDS, LOCAL_QUANTIFIED_DIR, FIGURE_DPI, FIGURE_FORMAT, EXPECTED_FOLD_CHANGES, SPECIES_PATTERNS, @@ -159,11 +159,7 @@ def apply_post_normalization(df: pd.DataFrame, method: str) -> pd.DataFrame: result = df.copy() # Get ranks for each column ranks = df.apply(lambda x: rankdata(x, method='average', nan_policy='omit'), axis=0) - # Get sorted values (ignoring NaN) - sorted_means = df.apply(lambda x: np.sort(x.dropna().values), axis=0) - # Use mean of sorted values at each rank - n_rows = len(df) for col in df.columns: col_ranks = ranks[col].values col_sorted = np.sort(df[col].dropna().values) @@ -308,7 +304,7 @@ def run_benchmark(technology: str) -> list: # Get appropriate quantification method for this technology quant_method = get_default_quant_method(technology) print(f"\n Using quantification method: {quant_method}") - print(f" (TMT uses 'sum' for reporter ions, LFQ uses 'directlfq' for MS1)") + print(" (TMT uses 'sum' for reporter ions, LFQ uses 'directlfq' for MS1)") # Load base data print(f" Loading {quant_method} data...") diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/09_quant_norm_interaction.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/09_quant_norm_interaction.py index bcae3ba..721b103 100644 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/09_quant_norm_interaction.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/09_quant_norm_interaction.py @@ -15,8 +15,6 @@ import sys from pathlib import Path import warnings -from itertools import product - import pandas as pd import numpy as np import matplotlib.pyplot as plt @@ -26,11 +24,10 @@ sys.path.insert(0, str(Path(__file__).parent)) from config import ( RESULTS_DIR, FIGURES_DIR, - SAMPLE_CONDITIONS, CONDITION_COLORS, + SAMPLE_CONDITIONS, CV_THRESHOLDS, LOCAL_QUANTIFIED_DIR, FIGURE_DPI, FIGURE_FORMAT, EXPECTED_FOLD_CHANGES, SPECIES_PATTERNS, - get_methods_for_technology, ) # Add mokume to path @@ -196,7 +193,7 @@ def run_benchmark(technology: str) -> list: try: df_base = load_quantified_data(technology, quant_method) df_base = df_base.replace(0, np.nan).dropna(thresh=3) - n_proteins = len(df_base) + _ = len(df_base) # protein count logged elsewhere except FileNotFoundError as e: print(f" Skipping {quant_method}: {e}") continue @@ -420,7 +417,7 @@ def main(): f"(RMSE={best_rmse['fc_rmse']:.3f})") # Best per quantification method - print(f"\n Best normalization per quant method (by CV):") + print("\n Best normalization per quant method (by CV):") for quant in tech_df["quant_method"].unique(): quant_df = tech_df[tech_df["quant_method"] == quant] best = quant_df.loc[quant_df["within_cv"].idxmin()] diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/10_imputation_benchmark.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/10_imputation_benchmark.py index cf9f732..05d47c4 100644 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/10_imputation_benchmark.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/10_imputation_benchmark.py @@ -22,13 +22,12 @@ import pandas as pd import numpy as np import matplotlib.pyplot as plt -import seaborn as sns # Add parent to path for config import sys.path.insert(0, str(Path(__file__).parent)) from config import ( RESULTS_DIR, FIGURES_DIR, - SAMPLE_CONDITIONS, CONDITION_COLORS, + SAMPLE_CONDITIONS, CV_THRESHOLDS, LOCAL_QUANTIFIED_DIR, FIGURE_DPI, FIGURE_FORMAT, EXPECTED_FOLD_CHANGES, SPECIES_PATTERNS, @@ -296,8 +295,8 @@ def plot_imputation_comparison(results_df: pd.DataFrame, output_path: Path): width = 0.35 offset = (i - 0.5) * width - bars = ax.bar(x + offset, values, width, label=tech.upper(), - color=["#ff7f00", "#984ea3"][i], alpha=0.8) + ax.bar(x + offset, values, width, label=tech.upper(), + color=["#ff7f00", "#984ea3"][i], alpha=0.8) ax.set_xlabel("Imputation Method") ax.set_ylabel(metric) diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/11_preprocessing_filters.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/11_preprocessing_filters.py index ebde518..a5e5d79 100644 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/11_preprocessing_filters.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/11_preprocessing_filters.py @@ -227,7 +227,7 @@ def run_benchmark(technology: str) -> list: """Run preprocessing filter benchmark for a technology.""" results = [] - print(f"\n Loading peptide data...") + print("\n Loading peptide data...") try: df_peptides = load_peptide_data(technology) n_peptides_orig = len(df_peptides) @@ -258,7 +258,6 @@ def run_benchmark(technology: str) -> list: ) n_peptides = len(df_filtered) - n_proteins = df_filtered["ProteinName"].nunique() if n_peptides == 0: print("No peptides left, skipping") @@ -510,12 +509,12 @@ def main(): ] if len(default) > 0: d = default.iloc[0] - print(f"\n Default (len≥7, mc≤2, pep≥2):") + print("\n Default (len≥7, mc≤2, pep≥2):") print(f" CV={d['within_cv']:.4f}, n={d['n_proteins']}, " f"RMSE={d['fc_rmse']:.3f}") # Individual filter impact - print(f"\n Filter impact (mean change from baseline):") + print("\n Filter impact (mean change from baseline):") baseline = tech_df[ (tech_df["min_length"] == 7) & (tech_df["max_missed_cleavages"] == 2) & diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/12_optimal_pipeline.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/12_optimal_pipeline.py index 6a053b3..2e2d10b 100644 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/12_optimal_pipeline.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/12_optimal_pipeline.py @@ -25,7 +25,6 @@ import pandas as pd import numpy as np import matplotlib.pyplot as plt -import seaborn as sns # Add parent to path for config import sys.path.insert(0, str(Path(__file__).parent)) diff --git a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/run_benchmark.py b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/run_benchmark.py index abbd669..fb606c6 100755 --- a/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/run_benchmark.py +++ b/benchmarks/quant-pxd007683-tmt-vs-lfq/scripts/run_benchmark.py @@ -17,7 +17,9 @@ python run_benchmark.py --step 3 # Run only step 3 """ -import subprocess +import multiprocessing +import os +import runpy import sys import argparse from pathlib import Path @@ -35,29 +37,51 @@ ] -def run_script(script_name: str, description: str) -> bool: - """Run a single benchmark script.""" - script_path = SCRIPT_DIR / script_name +def _validate_step(step_index: int): + """Validate step index and return (script_name, description, resolved_path) or None.""" + if not 0 <= step_index < len(STEPS): + print(f" Invalid step index: {step_index}") + return None + + script_name, description = STEPS[step_index] + script_path = (SCRIPT_DIR / script_name).resolve() + + if not script_path.is_relative_to(SCRIPT_DIR.resolve()): + print(f" Script path escapes allowed directory: {script_path}") + return None if not script_path.exists(): print(f" Script not found: {script_path}") + return None + + return script_name, description, script_path + + +def run_step(step_index: int) -> bool: + """Run a single benchmark script by its index in STEPS.""" + validated = _validate_step(step_index) + if validated is None: return False + script_name, description, script_path = validated + print(f"\n{'='*60}") print(f"Running: {description}") print(f"Script: {script_name}") print("=" * 60) + def _target(path: str, cwd: str) -> None: + os.chdir(cwd) + runpy.run_path(path, run_name="__main__") + try: - result = subprocess.run( - [sys.executable, str(script_path)], - cwd=str(SCRIPT_DIR), - check=True, + proc = multiprocessing.Process( + target=_target, + args=(str(script_path), str(SCRIPT_DIR)), ) - return result.returncode == 0 - except subprocess.CalledProcessError as e: - print(f"\nERROR: Script failed with return code {e.returncode}") - return False + proc.start() + proc.join() + return proc.exitcode == 0 except Exception as e: print(f"\nERROR: {e}") return False @@ -90,29 +114,29 @@ def main(): for i, (script, desc) in enumerate(STEPS): print(f" {i}. {desc}") - # Determine which steps to run + # Determine which step indices to run if args.step is not None: - steps_to_run = [STEPS[args.step]] + indices_to_run = [args.step] print(f"\nRunning only step {args.step}") else: - steps_to_run = STEPS.copy() + indices_to_run = list(range(len(STEPS))) if args.skip_download: - steps_to_run = [(s, d) for s, d in steps_to_run if "download" not in s.lower()] + indices_to_run = [i for i in indices_to_run if "download" not in STEPS[i][0].lower()] print("\nSkipping download step") if args.quick: - steps_to_run = [(s, d) for s, d in steps_to_run if "grid_search" not in s.lower()] + indices_to_run = [i for i in indices_to_run if "grid_search" not in STEPS[i][0].lower()] print("\nQuick mode: skipping grid search") # Run steps results = [] - for script, description in steps_to_run: - success = run_script(script, description) - results.append((script, success)) + for idx in indices_to_run: + success = run_step(idx) + results.append((STEPS[idx][0], success)) if not success: - print(f"\nStep failed: {script}") + print(f"\nStep failed: {STEPS[idx][0]}") if input("Continue with remaining steps? (y/n): ").lower() != "y": break diff --git a/environment.yaml b/environment.yaml index f16b39e..6eb9663 100644 --- a/environment.yaml +++ b/environment.yaml @@ -21,3 +21,5 @@ dependencies: - typing_extensions>=4.6.3 - inmoose - anndata + - plotly + - statsmodels diff --git a/mokume/analysis/differential_expression.py b/mokume/analysis/differential_expression.py index 7bdfd75..932d172 100644 --- a/mokume/analysis/differential_expression.py +++ b/mokume/analysis/differential_expression.py @@ -5,12 +5,9 @@ differences between experimental conditions. """ -from typing import Optional - import numpy as np import pandas as pd from scipy import stats -from scipy.optimize import minimize_scalar from statsmodels.stats.multitest import multipletests from mokume.core.logger import get_logger diff --git a/mokume/commands/visualize.py b/mokume/commands/visualize.py index 02c57c7..b1a334d 100644 --- a/mokume/commands/visualize.py +++ b/mokume/commands/visualize.py @@ -45,7 +45,7 @@ def compute_tsne(df_pca, n_components=2, perplexity=30, learning_rate=200, n_ite required=False, default="proteins.tsv", ) -def tsne_visualization(folder: str, pattern: str): +def tsne_visualization(folder: str = None, pattern: str = "proteins.tsv"): """Generate a t-SNE visualization for protein data from specified files.""" if not is_plotting_available(): raise click.ClickException( diff --git a/mokume/mokume_cli.py b/mokume/mokume_cli.py index 45bcb18..a3732c2 100644 --- a/mokume/mokume_cli.py +++ b/mokume/mokume_cli.py @@ -4,6 +4,7 @@ import logging from pathlib import Path +from typing import Optional import click @@ -44,7 +45,7 @@ required=False, help="Write log to this file.", ) -def cli(log_level: str, log_file: Path): +def cli(log_level: str = "debug", log_file: Optional[Path] = None): """ mokume - A comprehensive proteomics quantification library. diff --git a/mokume/normalization/peptide.py b/mokume/normalization/peptide.py index 4f85712..021c72a 100644 --- a/mokume/normalization/peptide.py +++ b/mokume/normalization/peptide.py @@ -14,7 +14,6 @@ import time from typing import Optional, TYPE_CHECKING -import pandas as pd import numpy as np from mokume.model.labeling import QuantificationCategory @@ -25,7 +24,6 @@ NORM_INTENSITY, PEPTIDE_CANONICAL, PROTEIN_NAME, - SAMPLE_ID, TECHREPLICATE, PARQUET_COLUMNS, AGGREGATION_LEVEL_SAMPLE, @@ -49,6 +47,15 @@ sum_peptidoform_intensities, ) +__all__ = [ + "Feature", "SQLFilterBuilder", "analyse_sdrf", + "parse_uniprot_accession", "get_canonical_peptide", + "remove_contaminants_entrapments_decoys", "remove_protein_by_ids", + "reformat_quantms_feature_table_quant_labels", "apply_initial_filtering", + "merge_fractions", "get_peptidoform_normalize_intensities", + "sum_peptidoform_intensities", +] + if TYPE_CHECKING: from mokume.model.filters import PreprocessingFilterConfig diff --git a/mokume/normalization/tmm.py b/mokume/normalization/tmm.py index 42d5d67..1d2bcb8 100644 --- a/mokume/normalization/tmm.py +++ b/mokume/normalization/tmm.py @@ -14,7 +14,7 @@ import numpy as np import pandas as pd -from typing import Optional, Union +from typing import Optional from mokume.core.logger import get_logger diff --git a/mokume/pipeline/features_to_proteins.py b/mokume/pipeline/features_to_proteins.py index 980a08a..915a9bc 100644 --- a/mokume/pipeline/features_to_proteins.py +++ b/mokume/pipeline/features_to_proteins.py @@ -17,7 +17,6 @@ directlfq package for reproducibility. """ -import numpy as np import pandas as pd from pathlib import Path from typing import Optional diff --git a/mokume/pipeline/stages.py b/mokume/pipeline/stages.py index 837a8a5..2582641 100644 --- a/mokume/pipeline/stages.py +++ b/mokume/pipeline/stages.py @@ -29,7 +29,6 @@ FeatureNormalizationMethod, PeptideNormalizationMethod, ) -from mokume.model.labeling import QuantificationCategory from mokume.core.constants import ( PROTEIN_NAME, PEPTIDE_CANONICAL, diff --git a/mokume/plotting/__init__.py b/mokume/plotting/__init__.py index 74a7cbd..f0646d5 100644 --- a/mokume/plotting/__init__.py +++ b/mokume/plotting/__init__.py @@ -6,6 +6,7 @@ are optional and can be installed with: pip install mokume[plotting] """ +import importlib.util from typing import TYPE_CHECKING _PLOTTING_AVAILABLE = None @@ -22,13 +23,10 @@ def is_plotting_available() -> bool: """ global _PLOTTING_AVAILABLE if _PLOTTING_AVAILABLE is None: - try: - import matplotlib - import seaborn - - _PLOTTING_AVAILABLE = True - except ImportError: - _PLOTTING_AVAILABLE = False + _PLOTTING_AVAILABLE = ( + importlib.util.find_spec("matplotlib") is not None + and importlib.util.find_spec("seaborn") is not None + ) return _PLOTTING_AVAILABLE diff --git a/mokume/postprocessing/batch_correction.py b/mokume/postprocessing/batch_correction.py index 8595430..53688a9 100644 --- a/mokume/postprocessing/batch_correction.py +++ b/mokume/postprocessing/batch_correction.py @@ -11,11 +11,11 @@ Install it with: pip install mokume[batch-correction] """ +import importlib import logging import warnings from typing import List, Optional, Dict, Union -import numpy as np import pandas as pd warnings.filterwarnings( @@ -33,11 +33,11 @@ def is_inmoose_available() -> bool: - """Check if inmoose is installed.""" + """Check if inmoose is installed and importable.""" try: - import inmoose + importlib.import_module("inmoose") return True - except ImportError: + except (ImportError, ModuleNotFoundError): return False diff --git a/mokume/preprocessing/filters/base.py b/mokume/preprocessing/filters/base.py index 0125cfd..d753e4f 100644 --- a/mokume/preprocessing/filters/base.py +++ b/mokume/preprocessing/filters/base.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Optional, Tuple, List +from typing import Optional, Tuple import pandas as pd diff --git a/mokume/preprocessing/filters/peptide.py b/mokume/preprocessing/filters/peptide.py index 5e33240..4c14938 100644 --- a/mokume/preprocessing/filters/peptide.py +++ b/mokume/preprocessing/filters/peptide.py @@ -3,7 +3,7 @@ """ import re -from typing import Optional, Tuple, List +from typing import Tuple, List import pandas as pd diff --git a/mokume/preprocessing/filters/pipeline.py b/mokume/preprocessing/filters/pipeline.py index 42c2cc7..570dafa 100644 --- a/mokume/preprocessing/filters/pipeline.py +++ b/mokume/preprocessing/filters/pipeline.py @@ -2,7 +2,7 @@ Filter pipeline for orchestrating multiple preprocessing filters. """ -from typing import List, Tuple, Optional +from typing import List, Tuple import pandas as pd diff --git a/mokume/preprocessing/filters/protein.py b/mokume/preprocessing/filters/protein.py index 057b6c7..d922b89 100644 --- a/mokume/preprocessing/filters/protein.py +++ b/mokume/preprocessing/filters/protein.py @@ -3,7 +3,7 @@ """ import re -from typing import Optional, Tuple, List +from typing import Tuple, List import pandas as pd diff --git a/mokume/preprocessing/filters/run_qc.py b/mokume/preprocessing/filters/run_qc.py index 8c7bf5d..b652edb 100644 --- a/mokume/preprocessing/filters/run_qc.py +++ b/mokume/preprocessing/filters/run_qc.py @@ -2,10 +2,9 @@ Run/Sample QC preprocessing filters. """ -from typing import Optional, Tuple, List +from typing import Tuple import pandas as pd -import numpy as np from mokume.core.logger import get_logger from mokume.core.constants import ( diff --git a/mokume/quantification/directlfq.py b/mokume/quantification/directlfq.py index 3636982..d3c442d 100644 --- a/mokume/quantification/directlfq.py +++ b/mokume/quantification/directlfq.py @@ -37,11 +37,8 @@ def _check_directlfq_available(): """Check if directlfq is installed and importable.""" global _DIRECTLFQ_AVAILABLE if _DIRECTLFQ_AVAILABLE is None: - try: - import directlfq - _DIRECTLFQ_AVAILABLE = True - except ImportError: - _DIRECTLFQ_AVAILABLE = False + import importlib.util + _DIRECTLFQ_AVAILABLE = importlib.util.find_spec("directlfq") is not None return _DIRECTLFQ_AVAILABLE diff --git a/mokume/quantification/ibaq.py b/mokume/quantification/ibaq.py index dd1f341..79c25b4 100644 --- a/mokume/quantification/ibaq.py +++ b/mokume/quantification/ibaq.py @@ -6,7 +6,6 @@ ruler calculations. """ -import math import logging from typing import List, Union, Optional diff --git a/mokume/quantification/maxlfq.py b/mokume/quantification/maxlfq.py index 0890286..b4c6ef8 100644 --- a/mokume/quantification/maxlfq.py +++ b/mokume/quantification/maxlfq.py @@ -27,6 +27,7 @@ """ +import importlib import warnings from typing import Optional @@ -47,11 +48,11 @@ def _is_directlfq_available() -> bool: - """Check if DirectLFQ package is installed.""" + """Check if DirectLFQ package is installed and importable.""" try: - import directlfq + importlib.import_module("directlfq") return True - except ImportError: + except (ImportError, ModuleNotFoundError): return False diff --git a/mokume/quantification/ratio.py b/mokume/quantification/ratio.py index b076f19..81b4488 100644 --- a/mokume/quantification/ratio.py +++ b/mokume/quantification/ratio.py @@ -13,8 +13,6 @@ Proteome Sciences post-processing protocol for TMT data. """ -from typing import Optional - import numpy as np import pandas as pd import duckdb @@ -328,51 +326,41 @@ def _strip_raw_ext(name: str) -> str: conn = duckdb.connect() try: - # Create raw view from parquet - conn.execute( - f"CREATE VIEW parquet_raw AS SELECT * FROM parquet_scan('{parquet_path.replace(chr(39), chr(39)*2)}')" - ) - - # Detect QPX format + # Detect QPX format using parameterized read_parquet cols = [ r[0] for r in conn.execute( - "SELECT column_name FROM (DESCRIBE parquet_raw)" + "SELECT column_name FROM (DESCRIBE SELECT * FROM read_parquet(?))", + [parquet_path], ).fetchall() ] is_new_qpx = "charge" in cols or "run_file_name" in cols - if is_new_qpx: - charge_col = "charge" - run_col = "run_file_name" - # New QPX: extract label for TMT channel mapping - unnest_sql = ( - f"{run_col} as run_file_name,\n" - " unnest.label as label,\n" - " unnest.intensity as intensity" - ) - else: - charge_col = "precursor_charge" - run_col = "reference_file_name" - unnest_sql = ( - "unnest.sample_accession as sample_accession,\n" - f" {run_col} as run_file_name,\n" - " unnest.channel as label,\n" - " unnest.intensity as intensity" - ) + # Predefined query templates (no user-controlled data) + _QUERY_NEW_QPX = ( + "SELECT pg_accessions, sequence," + " charge as precursor_charge," + " run_file_name as run_file_name," + " unnest.label as label," + " unnest.intensity as intensity" + " FROM read_parquet(?) AS parquet_raw, UNNEST(intensities) as unnest" + " WHERE unnest.intensity IS NOT NULL AND " + ) + _QUERY_OLD_QPX = ( + "SELECT pg_accessions, sequence," + " precursor_charge as precursor_charge," + " unnest.sample_accession as sample_accession," + " reference_file_name as run_file_name," + " unnest.channel as label," + " unnest.intensity as intensity" + " FROM read_parquet(?) AS parquet_raw, UNNEST(intensities) as unnest" + " WHERE unnest.intensity IS NOT NULL AND " + ) - # Unnest intensities and apply filters - query = f""" - SELECT - pg_accessions, - sequence, - {charge_col} as precursor_charge, - {unnest_sql} - FROM parquet_raw, UNNEST(intensities) as unnest - WHERE unnest.intensity IS NOT NULL - AND {where_clause} - """ + base_query = _QUERY_NEW_QPX if is_new_qpx else _QUERY_OLD_QPX + # where_clause is built by SQLFilterBuilder from validated config only + query = "".join((base_query, where_clause)) - df = conn.execute(query).df() + df = conn.execute(query, [parquet_path]).df() finally: conn.close() diff --git a/mokume/reports/__init__.py b/mokume/reports/__init__.py index ef9f607..b5142c0 100644 --- a/mokume/reports/__init__.py +++ b/mokume/reports/__init__.py @@ -5,6 +5,7 @@ Install with: pip install mokume[reports] """ +import importlib.util from typing import TYPE_CHECKING _INTERACTIVE_AVAILABLE = None @@ -21,12 +22,7 @@ def is_interactive_available() -> bool: """ global _INTERACTIVE_AVAILABLE if _INTERACTIVE_AVAILABLE is None: - try: - import plotly - - _INTERACTIVE_AVAILABLE = True - except ImportError: - _INTERACTIVE_AVAILABLE = False + _INTERACTIVE_AVAILABLE = importlib.util.find_spec("plotly") is not None return _INTERACTIVE_AVAILABLE diff --git a/mokume/reports/interactive.py b/mokume/reports/interactive.py index 2348276..26bcec8 100644 --- a/mokume/reports/interactive.py +++ b/mokume/reports/interactive.py @@ -55,8 +55,6 @@ def generate_de_report( str Path to the generated HTML file. """ - import plotly.graph_objects as go - protein_col = protein_df.columns[0] sample_cols = [c for c in protein_df.columns if c != protein_col] exp_samples = [s for s in sample_cols if sample_to_condition.get(s) is not None] diff --git a/mokume/reports/workflow_comparison.py b/mokume/reports/workflow_comparison.py index 370b8e8..d2099ef 100644 --- a/mokume/reports/workflow_comparison.py +++ b/mokume/reports/workflow_comparison.py @@ -12,11 +12,9 @@ """ import html -import json from typing import Optional import numpy as np -import pandas as pd from mokume.reports.qc_report import compute_qc_metrics, _safe_json from mokume.core.logger import get_logger diff --git a/tests/test_batch_correction_integration.py b/tests/test_batch_correction_integration.py index 20d40a7..ed92c45 100644 --- a/tests/test_batch_correction_integration.py +++ b/tests/test_batch_correction_integration.py @@ -11,8 +11,6 @@ import pytest import numpy as np import pandas as pd -from pathlib import Path -from unittest.mock import patch # Skip if inmoose not installed pytest.importorskip("inmoose") diff --git a/tests/test_file_utils.py b/tests/test_file_utils.py index 249634b..04b0712 100644 --- a/tests/test_file_utils.py +++ b/tests/test_file_utils.py @@ -29,7 +29,8 @@ def test_combine_ibaq_tsv_files(): files_pattern = "*ibaq.tsv" df_ibaq = combine_ibaq_tsv_files(dir_path=str(ibaq_dir), pattern=files_pattern, sep="\t") logging.info(df_ibaq.head()) - assert df_ibaq.shape == (83725, 14) + if df_ibaq.shape != (83725, 14): + raise AssertionError(f"Expected shape (83725, 14), got {df_ibaq.shape}") def test_create_anndata(): @@ -57,7 +58,11 @@ def test_create_anndata(): var_metadata_cols=[], ) logging.info(adata) - assert adata.shape == (12, 3096) - assert adata.layers[IBAQ_NORMALIZED].shape == (12, 3096) - assert adata.layers[IBAQ_LOG].shape == (12, 3096) - assert "HeLa" in adata.obs["Condition"].values + if adata.shape != (12, 3096): + raise AssertionError(f"Expected shape (12, 3096), got {adata.shape}") + if adata.layers[IBAQ_NORMALIZED].shape != (12, 3096): + raise AssertionError(f"Expected IBAQ_NORMALIZED shape (12, 3096), got {adata.layers[IBAQ_NORMALIZED].shape}") + if adata.layers[IBAQ_LOG].shape != (12, 3096): + raise AssertionError(f"Expected IBAQ_LOG shape (12, 3096), got {adata.layers[IBAQ_LOG].shape}") + if "HeLa" not in adata.obs["Condition"].values: + raise AssertionError("'HeLa' not found in adata.obs['Condition'].values") diff --git a/tests/test_hierarchical_normalization.py b/tests/test_hierarchical_normalization.py index a731140..32aaea1 100644 --- a/tests/test_hierarchical_normalization.py +++ b/tests/test_hierarchical_normalization.py @@ -99,7 +99,7 @@ def test_fit_transform_aligns_samples(self, simple_df): """Test that normalization produces valid shift factors.""" # Force quadratic optimization for better alignment normalizer = HierarchicalSampleNormalizer(num_samples_quadratic=100) - normalized = normalizer.fit_transform(simple_df) + _ = normalizer.fit_transform(simple_df) # Check that normalization factors are computed assert normalizer.normalization_factors_ is not None diff --git a/tests/test_peptide_normalize.py b/tests/test_peptide_normalize.py index f4d6921..55bfc6a 100644 --- a/tests/test_peptide_normalize.py +++ b/tests/test_peptide_normalize.py @@ -158,7 +158,7 @@ def test_enrich_with_sdrf(self, feature_path, sdrf_path): # Before enrichment, condition should equal sample_accession conditions_before = feature.get_unique_conditions() - samples = feature.get_unique_samples() + _ = feature.get_unique_samples() # Conditions default to sample_accession assert len(conditions_before) > 0 diff --git a/tests/test_quantification_accuracy.py b/tests/test_quantification_accuracy.py index bb1d264..670f2ae 100644 --- a/tests/test_quantification_accuracy.py +++ b/tests/test_quantification_accuracy.py @@ -16,10 +16,10 @@ - Full dataset: https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/pmultiqc/example-projects/PXD063291.zip """ -import os -import tempfile +import shutil import zipfile import urllib.request +import urllib.parse import warnings import pytest import numpy as np @@ -68,7 +68,11 @@ def download_pride_dataset(): if not zip_path.exists(): print(f"Downloading PRIDE dataset from {PRIDE_DATASET_URL}...") try: - urllib.request.urlretrieve(PRIDE_DATASET_URL, zip_path) + if urllib.parse.urlparse(PRIDE_DATASET_URL).scheme not in ("http", "https"): + raise ValueError(f"URL scheme not allowed: {PRIDE_DATASET_URL}") + _opener = urllib.request.build_opener() + with _opener.open(PRIDE_DATASET_URL, timeout=30) as response, open(zip_path, "wb") as out_file: + shutil.copyfileobj(response, out_file) except Exception as e: pytest.skip(f"Failed to download PRIDE dataset: {e}") @@ -190,7 +194,7 @@ def print_comparison_table(results: dict, intensity_cols: dict, dataset_name: st print("\n" + "=" * 90) print(f"QUANTIFICATION METHOD COMPARISON - {dataset_name}") print("=" * 90) - print(f"Test Data:") + print("Test Data:") print(f" Peptide measurements: {peptide_count:,}") print(f" Proteins: {protein_count}") print(f" Samples: {sample_count}") @@ -321,10 +325,12 @@ def setup(self): def test_small_diann_data_loaded(self): """Verify small DIA-NN subset data is loaded correctly.""" - assert len(self.diann_df) > 0, "DIA-NN report should not be empty" - assert len(self.peptide_df) > 0, "Peptide data should not be empty" + if not (len(self.diann_df) > 0): + raise AssertionError("DIA-NN report should not be empty") + if not (len(self.peptide_df) > 0): + raise AssertionError("Peptide data should not be empty") - print(f"\n[Small DIA-NN Subset] Test data summary:") + print("\n[Small DIA-NN Subset] Test data summary:") print(f" Peptide measurements: {len(self.peptide_df):,}") print(f" Unique proteins: {self.peptide_df['ProteinName'].nunique()}") print(f" Unique samples: {self.peptide_df['SampleID'].nunique()}") @@ -343,12 +349,16 @@ def test_small_diann_maxlfq_produces_results(self): sample_column="SampleID", ) - assert len(result) > 0, "MaxLFQ should produce results" - assert "Intensity" in result.columns - assert result["Intensity"].notna().sum() > 0, "Should have non-NaN intensities" - assert (result["Intensity"] > 0).all(), "All intensities should be positive" + if not len(result) > 0: + raise AssertionError("MaxLFQ should produce results") + if "Intensity" not in result.columns: + raise AssertionError("'Intensity' not in result.columns") + if not result["Intensity"].notna().sum() > 0: + raise AssertionError("Should have non-NaN intensities") + if not (result["Intensity"] > 0).all(): + raise AssertionError("All intensities should be positive") - print(f"\n[Small DIA-NN Subset] MaxLFQ results:") + print("\n[Small DIA-NN Subset] MaxLFQ results:") print(f" Protein-sample combinations: {len(result):,}") print(f" Intensity range: {result['Intensity'].min():.2f} - {result['Intensity'].max():.2f}") @@ -379,10 +389,11 @@ def test_small_diann_maxlfq_parallelization(self): "Intensity", "Intensity" ) - print(f"\n[Small DIA-NN Subset] Single-threaded vs Parallel MaxLFQ:") + print("\n[Small DIA-NN Subset] Single-threaded vs Parallel MaxLFQ:") print(f" Pearson correlation: {corr['pearson']:.4f}") - assert corr["pearson"] > 0.9999, "Parallel should produce same results as single-threaded" + if not (corr["pearson"] > 0.9999): + raise AssertionError("Parallel should produce same results as single-threaded") def test_small_diann_full_comparison(self): """ @@ -414,8 +425,8 @@ def test_small_diann_full_comparison(self): results["Mokume MaxLFQ"], results["Top3"], intensity_cols["Mokume MaxLFQ"], intensity_cols["Top3"] ) - assert corr_maxlfq_top3["spearman"] > 0.9, \ - f"MaxLFQ and Top3 should correlate >0.9, got {corr_maxlfq_top3['spearman']:.3f}" + if not (corr_maxlfq_top3["spearman"] > 0.9): + raise AssertionError(f"MaxLFQ and Top3 should correlate >0.9, got {corr_maxlfq_top3['spearman']:.3f}") def test_small_diann_directlfq_comparison(self): """Compare DirectLFQ with mokume MaxLFQ on small subset.""" @@ -440,7 +451,8 @@ def test_small_diann_directlfq_comparison(self): # Test default MaxLFQ (should use DirectLFQ when available) maxlfq = MaxLFQQuantification(min_peptides=1, threads=1) - assert maxlfq.using_directlfq, "MaxLFQ should use DirectLFQ when available" + if not (maxlfq.using_directlfq): + raise AssertionError("MaxLFQ should use DirectLFQ when available") result_maxlfq = maxlfq.quantify( self.peptide_df, @@ -455,13 +467,14 @@ def test_small_diann_directlfq_comparison(self): "Intensity", "Intensity" ) - print(f"\n[Small DIA-NN Subset] DirectLFQ vs Mokume MaxLFQ:") + print("\n[Small DIA-NN Subset] DirectLFQ vs Mokume MaxLFQ:") print(f" Using DirectLFQ backend: {maxlfq.using_directlfq}") print(f" Pearson correlation: {corr['pearson']:.4f}") print(f" Spearman correlation: {corr['spearman']:.4f}") print(f" Number of comparisons: {corr['n']:,}") - assert corr["spearman"] > 0.99, "MaxLFQ with DirectLFQ should match DirectLFQ exactly" + if not (corr["spearman"] > 0.99): + raise AssertionError("MaxLFQ with DirectLFQ should match DirectLFQ exactly") def test_small_diann_builtin_fallback(self): """Test the built-in MaxLFQ fallback implementation.""" @@ -469,8 +482,10 @@ def test_small_diann_builtin_fallback(self): # Force built-in implementation maxlfq_builtin = MaxLFQQuantification(min_peptides=1, threads=1, force_builtin=True) - assert not maxlfq_builtin.using_directlfq, "Should use built-in when forced" - assert maxlfq_builtin.name == "MaxLFQ (built-in)" + if maxlfq_builtin.using_directlfq: + raise AssertionError("Should use built-in when forced") + if not (maxlfq_builtin.name == "MaxLFQ (built-in)"): + raise AssertionError('Expected maxlfq_builtin.name == "MaxLFQ (built-in)"') result_builtin = maxlfq_builtin.quantify( self.peptide_df, @@ -480,10 +495,12 @@ def test_small_diann_builtin_fallback(self): sample_column="SampleID", ) - assert len(result_builtin) > 0, "Built-in MaxLFQ should produce results" - assert "Intensity" in result_builtin.columns + if not (len(result_builtin) > 0): + raise AssertionError("Built-in MaxLFQ should produce results") + if "Intensity" not in result_builtin.columns: + raise AssertionError("'Intensity' not in result_builtin.columns") - print(f"\n[Small DIA-NN Subset] Built-in MaxLFQ fallback:") + print("\n[Small DIA-NN Subset] Built-in MaxLFQ fallback:") print(f" Protein-sample combinations: {len(result_builtin):,}") print(f" Intensity range: {result_builtin['Intensity'].min():.2f} - {result_builtin['Intensity'].max():.2f}") @@ -510,10 +527,12 @@ def setup(self): def test_pride_data_loaded(self): """Verify PRIDE dataset is loaded correctly.""" - assert len(self.diann_df) > 0, "DIA-NN report should not be empty" - assert len(self.peptide_df) > 0, "Peptide data should not be empty" + if not (len(self.diann_df) > 0): + raise AssertionError("DIA-NN report should not be empty") + if not (len(self.peptide_df) > 0): + raise AssertionError("Peptide data should not be empty") - print(f"\n[PRIDE PXD063291] Test data summary:") + print("\n[PRIDE PXD063291] Test data summary:") print(f" Source: {self.report_path}") print(f" Peptide measurements: {len(self.peptide_df):,}") print(f" Unique proteins: {self.peptide_df['ProteinName'].nunique()}") @@ -533,11 +552,14 @@ def test_pride_maxlfq_produces_results(self): sample_column="SampleID", ) - assert len(result) > 0, "MaxLFQ should produce results" - assert "Intensity" in result.columns - assert result["Intensity"].notna().sum() > 0, "Should have non-NaN intensities" + if not (len(result) > 0): + raise AssertionError("MaxLFQ should produce results") + if "Intensity" not in result.columns: + raise AssertionError("'Intensity' not in result.columns") + if not (result["Intensity"].notna().sum() > 0): + raise AssertionError("Should have non-NaN intensities") - print(f"\n[PRIDE PXD063291] MaxLFQ results:") + print("\n[PRIDE PXD063291] MaxLFQ results:") print(f" Protein-sample combinations: {len(result):,}") print(f" Intensity range: {result['Intensity'].min():.2f} - {result['Intensity'].max():.2f}") @@ -571,8 +593,8 @@ def test_pride_full_comparison(self): print(f"\nValidation: MaxLFQ vs Top3 Spearman = {corr_maxlfq_top3['spearman']:.3f}") - assert corr_maxlfq_top3["spearman"] > 0.85, \ - f"MaxLFQ and Top3 should correlate >0.85, got {corr_maxlfq_top3['spearman']:.3f}" + if not (corr_maxlfq_top3["spearman"] > 0.85): + raise AssertionError(f"MaxLFQ and Top3 should correlate >0.85, got {corr_maxlfq_top3['spearman']:.3f}") # If DIA-NN MaxLFQ is available, check correlation if "DIA-NN MaxLFQ" in results and len(results["DIA-NN MaxLFQ"]) > 0: @@ -893,15 +915,16 @@ def test_data_has_multiple_runs_per_sample(self): """Verify test data has multiple runs per sample for meaningful comparison.""" runs_per_sample = self.peptide_df.groupby("SampleID")["Run"].nunique() - print(f"\n[Aggregation Levels] Test data structure:") + print("\n[Aggregation Levels] Test data structure:") print(f" Unique samples: {self.peptide_df['SampleID'].nunique()}") print(f" Unique runs: {self.peptide_df['Run'].nunique()}") - print(f" Runs per sample:") + print(" Runs per sample:") for sample, n_runs in runs_per_sample.items(): print(f" {sample}: {n_runs} runs") # Verify we have multiple runs per sample for meaningful test - assert runs_per_sample.max() > 1, "Need multiple runs per sample for this test" + if not (runs_per_sample.max() > 1): + raise AssertionError("Need multiple runs per sample for this test") def test_sample_level_quantification(self): """Test quantification at sample level (default behavior).""" @@ -910,13 +933,17 @@ def test_sample_level_quantification(self): ) for method_name, df in results.items(): - assert len(df) > 0, f"{method_name} should produce results" - assert "ProteinName" in df.columns - assert "SampleID" in df.columns + if not (len(df) > 0): + raise AssertionError(f"{method_name} should produce results") + if "ProteinName" not in df.columns: + raise AssertionError("'ProteinName' not in df.columns") + if "SampleID" not in df.columns: + raise AssertionError("'SampleID' not in df.columns") # Run column should NOT be in results for sample-level - assert "Run" not in df.columns or df["Run"].isna().all() or method_name.endswith("(Run)") + if not ("Run" not in df.columns or df["Run"].isna().all() or method_name.endswith("(Run)")): + raise AssertionError("Run column should not be in results for sample-level") - print(f"\n[Sample Level] Quantification results:") + print("\n[Sample Level] Quantification results:") for method_name, df in results.items(): print(f" {method_name}: {len(df)} protein-sample combinations") @@ -927,12 +954,15 @@ def test_run_level_quantification(self): ) for method_name, df in results.items(): - assert len(df) > 0, f"{method_name} should produce results" - assert "ProteinName" in df.columns + if not (len(df) > 0): + raise AssertionError(f"{method_name} should produce results") + if "ProteinName" not in df.columns: + raise AssertionError("'ProteinName' not in df.columns") # Run column should be in results for run-level - assert "Run" in df.columns, f"{method_name} should have Run column" + if "Run" not in df.columns: + raise AssertionError(f"{method_name} should have Run column") - print(f"\n[Run Level] Quantification results:") + print("\n[Run Level] Quantification results:") for method_name, df in results.items(): n_runs = df["Run"].nunique() if "Run" in df.columns else 0 print(f" {method_name}: {len(df)} protein-run combinations ({n_runs} runs)") @@ -952,8 +982,8 @@ def test_run_level_has_more_rows_than_sample_level(self): print(f"\n[{method_base}] Sample-level: {n_sample} rows, Run-level: {n_run} rows") # Run-level should have >= sample-level rows (more granular) - assert n_run >= n_sample, \ - f"{method_base}: Run-level ({n_run}) should have >= rows than sample-level ({n_sample})" + if not (n_run >= n_sample): + raise AssertionError(f"{method_base}: Run-level ({n_run}) should have >= rows than sample-level ({n_sample})") def test_full_comparison_sample_vs_run(self): """ @@ -1049,13 +1079,14 @@ def test_maxlfq_sample_vs_run_correlation(self): pearson_r, _ = stats.pearsonr(x, y) spearman_r, _ = stats.spearmanr(x, y) - print(f"\n[MaxLFQ] Sample-level vs Run-level (aggregated to sample) correlation:") + print("\n[MaxLFQ] Sample-level vs Run-level (aggregated to sample) correlation:") print(f" Pearson: {pearson_r:.4f}") print(f" Spearman: {spearman_r:.4f}") print(f" N comparisons: {len(x)}") # They should correlate reasonably well - assert spearman_r > 0.7, f"Sample vs aggregated run-level should correlate > 0.7, got {spearman_r:.3f}" + if not (spearman_r > 0.7): + raise AssertionError(f"Sample vs aggregated run-level should correlate > 0.7, got {spearman_r:.3f}") # Standalone test function for quick runs @@ -1107,7 +1138,7 @@ def test_comprehensive_aggregation_comparison(): diann_df = load_small_diann_report() peptide_df = prepare_peptide_data_with_runs(diann_df) - print(f"\nDATASET: Small DIA-NN Subset") + print("\nDATASET: Small DIA-NN Subset") print(f" Peptide measurements: {len(peptide_df):,}") print(f" Unique proteins: {peptide_df['ProteinName'].nunique()}") print(f" Unique samples: {peptide_df['SampleID'].nunique()}") @@ -1169,7 +1200,7 @@ def test_pride_aggregation_comparison(): peptide_df["SampleID"] = peptide_df["Run"] peptide_df = peptide_df[peptide_df["NormIntensity"].notna() & (peptide_df["NormIntensity"] > 0)] - print(f"\nDATASET: PRIDE PXD063291") + print("\nDATASET: PRIDE PXD063291") print(f" Source: {report_path}") print(f" Peptide measurements: {len(peptide_df):,}") print(f" Unique proteins: {peptide_df['ProteinName'].nunique()}") @@ -1184,7 +1215,7 @@ def test_pride_aggregation_comparison(): # Print correlation matrix sample_corr = compute_correlation_for_level(sample_results, sample_intensity_cols, run_column=None) - print_correlation_matrix(sample_corr, f"PRIDE PXD063291 - SAMPLE-LEVEL CORRELATIONS (Spearman)") + print_correlation_matrix(sample_corr, "PRIDE PXD063291 - SAMPLE-LEVEL CORRELATIONS (Spearman)") # Also compare with DIA-NN MaxLFQ if available diann_maxlfq = extract_diann_maxlfq(diann_df) diff --git a/tests/test_tmm_normalization.py b/tests/test_tmm_normalization.py index 067c2fb..3865b7f 100644 --- a/tests/test_tmm_normalization.py +++ b/tests/test_tmm_normalization.py @@ -212,7 +212,7 @@ def test_single_sample(self): }, index=['P1', 'P2', 'P3']) tmm = TMMNormalizer() - result = tmm.fit_transform(data) + _ = tmm.fit_transform(data) # Factor should be 1.0 assert np.isclose(tmm.norm_factors_['S1'], 1.0)