Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,13 @@
6. Critical assessment of mokume algorithms
"""

import sys
from pathlib import Path
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist
import seaborn as sns

# Paths
BENCHMARK_DIR = Path(__file__).parent.parent
Expand Down Expand Up @@ -173,7 +169,7 @@ def analyze_missing_values(meta, data):

# CRITIQUE: Flag high missing value variance as a problem
if missing_variance > 100:
print(f" WARNING: High variance in missing values across batches suggests batch-specific quantification issues")
print(" WARNING: High variance in missing values across batches suggests batch-specific quantification issues")

return pd.DataFrame(results)

Expand Down Expand Up @@ -339,9 +335,9 @@ def analyze_de_consistency(meta, data):

# CRITIQUE
if mean_lfc_corr < 0.7:
print(f" WARNING: Low LFC correlation indicates quantification instability across labs")
print(" WARNING: Low LFC correlation indicates quantification instability across labs")
if mean_concordance < 0.8 and concordances:
print(f" WARNING: Low DE concordance - same experiment may give different conclusions in different labs")
print(" WARNING: Low DE concordance - same experiment may give different conclusions in different labs")

return pd.DataFrame(results)

Expand Down Expand Up @@ -405,7 +401,7 @@ def analyze_batch_effect_magnitude(meta, data):
bio_var = matrix_complete[stype_cols].var(axis=1).mean()
biological_vars.append(bio_var)

biological_var = np.mean(biological_vars) if biological_vars else 0
_ = np.mean(biological_vars) if biological_vars else 0 # not used yet

# Batch effect proportion
batch_effect_pct = (between_batch_var / total_variance * 100) if total_variance > 0 else 0
Expand Down Expand Up @@ -631,7 +627,7 @@ def generate_summary_plots(meta, data, coverage_df, missing_df, corr_df, de_df,
# Calculate composite score (lower is better)
scores = {}
for method in coverage_df["Method"].tolist():
method_lower = method.lower()
_ = method.lower() # reserved for future use

# Get metrics
core_pct = coverage_df[coverage_df["Method"] == method]["Core_Pct"].values[0]
Expand Down
1 change: 0 additions & 1 deletion benchmarks/batch-quartet-multilab/scripts/plot_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
Generate visualizations for the batch effect correction benchmark.
"""

import sys
from pathlib import Path

import numpy as np
Expand Down
10 changes: 3 additions & 7 deletions benchmarks/batch-quartet-multilab/scripts/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,13 @@
with the paper's results using the Quartet multi-lab dataset.
"""

import logging
import sys
import time
import warnings
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.decomposition import PCA

warnings.filterwarnings("ignore")
Expand All @@ -28,7 +26,6 @@
MOKUME_ROOT = BENCHMARK_DIR.parent.parent
sys.path.insert(0, str(MOKUME_ROOT))

from mokume.quantification import MaxLFQQuantification, TopNQuantification
from mokume.quantification.directlfq import is_directlfq_available, DirectLFQQuantification
from mokume.postprocessing import is_batch_correction_available, apply_batch_correction

Expand Down Expand Up @@ -122,8 +119,6 @@ def run_maxlfq_quantification(peptide_df: pd.DataFrame) -> pd.DataFrame:
sample_cols = [c for c in wide_df.columns if c not in ["peptide", "protein"]]

# Run MaxLFQ per protein
maxlfq = MaxLFQQuantification(min_peptides=1)

proteins = wide_df["protein"].unique()
results = []

Expand All @@ -150,7 +145,8 @@ def run_maxlfq_quantification(peptide_df: pd.DataFrame) -> pd.DataFrame:
"run_id": sample,
"intensity": protein_intensities[i]
})
except Exception:
except Exception as exc:
logging.warning("MaxLFQ failed for protein %s: %s", protein, exc)
continue

return pd.DataFrame(results)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
Download datasets from PRIDE ibaqpy-research FTP for benchmarking protein quantification methods.
"""

import shutil
import urllib.request
import urllib.error
import urllib.parse
import sys
from pathlib import Path
from typing import Optional, List
from typing import List

_opener = urllib.request.build_opener()
from config import (
ALL_DATASETS,
HELA_DATASETS,
Expand Down Expand Up @@ -46,7 +49,10 @@ def download_file(url: str, dest: Path, verbose: bool = True) -> bool:
print(f" Downloading: {url}")

try:
urllib.request.urlretrieve(url, dest)
if urllib.parse.urlparse(url).scheme not in ("http", "https"):
raise ValueError(f"URL scheme not allowed: {url}")
with _opener.open(url) as response, open(dest, "wb") as out_file:
shutil.copyfileobj(response, out_file)
if verbose:
size_mb = dest.stat().st_size / (1024 * 1024)
print(f" Saved to: {dest.name} ({size_mb:.1f} MB)")
Expand All @@ -65,10 +71,12 @@ def download_file(url: str, dest: Path, verbose: bool = True) -> bool:
def check_url_exists(url: str) -> bool:
"""Check if a URL exists without downloading."""
try:
if urllib.parse.urlparse(url).scheme not in ("http", "https"):
raise ValueError(f"URL scheme not allowed: {url}")
request = urllib.request.Request(url, method='HEAD')
urllib.request.urlopen(request, timeout=10)
_opener.open(request, timeout=10)
return True
except:
except Exception:
return False


Expand All @@ -77,8 +85,10 @@ def list_available_files(base_url: str) -> List[str]:
List files available at a URL (works for directory listings).
"""
try:
response = urllib.request.urlopen(base_url, timeout=30)
content = response.read().decode('utf-8')
if urllib.parse.urlparse(base_url).scheme not in ("http", "https"):
raise ValueError(f"URL scheme not allowed: {base_url}")
with _opener.open(base_url, timeout=30) as response:
content = response.read().decode('utf-8')
import re
links = re.findall(r'href="([^"]+)"', content)
return [l for l in links if not l.startswith('/') and not l.startswith('?')]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
"""

import sys
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from typing import Optional

import pandas as pd
import numpy as np
Expand All @@ -18,7 +17,6 @@
HELA_DATASETS,
RAW_DATA_DIR,
PROCESSED_DIR,
COLUMN_MAPPING,
QUANTMS_PARQUET_MAPPING,
DatasetInfo,
)
Expand Down Expand Up @@ -156,9 +154,6 @@ def normalize_columns(df: pd.DataFrame, data_format: str) -> pd.DataFrame:
if "NormIntensity" not in df.columns and "Intensity" in df.columns:
df["NormIntensity"] = df["Intensity"]

# Target columns we need
targets = ["ProteinName", "PeptideSequence", "SampleID", "NormIntensity"]

# Build a reverse mapping: target -> list of possible source names
# Use QUANTMS_PARQUET_MAPPING for all formats (ibaqpy-research uses this)
source_mapping = QUANTMS_PARQUET_MAPPING
Expand Down Expand Up @@ -360,7 +355,6 @@ def load_raw_data(dataset: DatasetInfo) -> Optional[pd.DataFrame]:
return None

# Try alternative extensions
base_name = feature_path.stem.replace("_feature", "")
for ext in [".parquet", ".csv", ".tsv"]:
alt_path = RAW_DATA_DIR / f"{dataset.project_id}_feature{ext}"
if alt_path.exists():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,13 @@
from typing import Optional, Dict, List

import pandas as pd
import numpy as np

# Add parent directory to path for mokume imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from config import (
ALL_DATASETS,
HELA_DATASETS,
PROCESSED_DIR,
PROTEIN_QUANT_DIR,
FASTA_FILE,
QUANTIFICATION_METHODS,
Expand Down Expand Up @@ -93,7 +91,7 @@ def run_ibaq(
result = pd.read_csv(output_file, sep="\t")
return result
else:
print(f" ERROR: iBAQ output file not generated")
print(" ERROR: iBAQ output file not generated")
return None

except Exception as e:
Expand Down Expand Up @@ -336,7 +334,7 @@ def quantify_all_datasets(

# Check FASTA
if "ibaq" in methods and not FASTA_FILE.exists():
print(f"\nWARNING: FASTA file not found. iBAQ will be skipped.")
print("\nWARNING: FASTA file not found. iBAQ will be skipped.")

all_results = {}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import sys
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from typing import Optional, Dict, List

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -469,22 +469,22 @@ def save_results(results: dict, output_dir: Path = None):
if results["cv_summary"]:
cv_df = pd.DataFrame(results["cv_summary"])
cv_df.to_csv(output_dir / "cv_comparison.csv", index=False)
print(f" Saved: cv_comparison.csv")
print(" Saved: cv_comparison.csv")

# Cross-experiment correlation
for method, corr_matrix in results["cross_experiment_corr"].items():
corr_matrix.to_csv(output_dir / f"cross_experiment_corr_{method}.csv")
print(f" Saved: cross_experiment_corr_*.csv")
print(" Saved: cross_experiment_corr_*.csv")

# Rank consistency
for method, rank_matrix in results["rank_consistency"].items():
rank_matrix.to_csv(output_dir / f"rank_consistency_{method}.csv")
print(f" Saved: rank_consistency_*.csv")
print(" Saved: rank_consistency_*.csv")

# Expression stability
for method, stability in results["expression_stability"].items():
stability.to_csv(output_dir / f"expression_stability_{method}.csv", index=False)
print(f" Saved: expression_stability_*.csv")
print(" Saved: expression_stability_*.csv")

# TMT vs LFQ correlation
if results.get("tmt_lfq"):
Expand All @@ -508,7 +508,7 @@ def save_results(results: dict, output_dir: Path = None):

if tmt_lfq_summary:
pd.DataFrame(tmt_lfq_summary).to_csv(output_dir / "tmt_lfq_comparison.csv", index=False)
print(f" Saved: tmt_lfq_comparison.csv, tmt_lfq_values_*.csv")
print(" Saved: tmt_lfq_comparison.csv, tmt_lfq_values_*.csv")

# Summary metrics
summary = []
Expand Down Expand Up @@ -538,7 +538,7 @@ def save_results(results: dict, output_dir: Path = None):

summary_df = pd.DataFrame(summary)
summary_df.to_csv(output_dir / "summary_metrics.csv", index=False)
print(f" Saved: summary_metrics.csv")
print(" Saved: summary_metrics.csv")


def main():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,20 @@

import sys
from pathlib import Path
from typing import Optional, Dict, List

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

sys.path.insert(0, str(Path(__file__).parent.parent))

from config import (
ALL_DATASETS,
HELA_DATASETS,
ANALYSIS_DIR,
PLOTS_DIR,
PROTEIN_QUANT_DIR,
QUANTIFICATION_METHODS,
PROTEINS_OF_INTEREST,
)

# Set style
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,20 @@

import sys
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from typing import Optional, Dict, List

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from scipy import stats
from scipy.stats import pearsonr, spearmanr

sys.path.insert(0, str(Path(__file__).parent))

from config import (
ALL_DATASETS,
HELA_DATASETS,
PROTEIN_QUANT_DIR,
PLOTS_DIR,
QUANTIFICATION_METHODS,
)


Expand Down Expand Up @@ -137,7 +134,7 @@ def plot_density_scatter(
norm=LogNorm(), alpha=0.9)

# Add colorbar
cb = plt.colorbar(hb, ax=ax, label='n_neighbors')
plt.colorbar(hb, ax=ax, label='n_neighbors')

# Add diagonal line (y = x)
lims = [
Expand Down Expand Up @@ -513,7 +510,7 @@ def main():
if all_results:
combined = pd.concat(all_results, ignore_index=True)
combined.to_csv(output_dir / "method_correlations.csv", index=False)
print(f"\n Saved: method_correlations.csv")
print("\n Saved: method_correlations.csv")

print("\n" + "=" * 70)
print(f"Done! Plots saved to: {output_dir}")
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/quant-hela-method-comparison/scripts/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
"""

from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from dataclasses import dataclass
from typing import Dict

# Base directories
BENCHMARK_DIR = Path(__file__).parent
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
python run_benchmark.py --phase 3 --stop 3 # Run only phase 3
"""

import sys
import argparse
from pathlib import Path

Expand Down
Loading
Loading