bigbio · ypriverol · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/benchmarks/batch-quartet-multilab/scripts/comprehensive_analysis.py b/benchmarks/batch-quartet-multilab/scripts/comprehensive_analysis.py
@@ -11,17 +11,13 @@
 6. Critical assessment of mokume algorithms
 """
 
-import sys
 from pathlib import Path
 from itertools import combinations
 
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 from scipy import stats
-from scipy.cluster.hierarchy import linkage, dendrogram
-from scipy.spatial.distance import pdist
-import seaborn as sns
 
 # Paths
 BENCHMARK_DIR = Path(__file__).parent.parent
@@ -173,7 +169,7 @@ def analyze_missing_values(meta, data):
 
         # CRITIQUE: Flag high missing value variance as a problem
         if missing_variance > 100:
-            print(f"  WARNING: High variance in missing values across batches suggests batch-specific quantification issues")
+            print("  WARNING: High variance in missing values across batches suggests batch-specific quantification issues")
 
     return pd.DataFrame(results)
 
@@ -339,9 +335,9 @@ def analyze_de_consistency(meta, data):
 
             # CRITIQUE
             if mean_lfc_corr < 0.7:
-                print(f"  WARNING: Low LFC correlation indicates quantification instability across labs")
+                print("  WARNING: Low LFC correlation indicates quantification instability across labs")
             if mean_concordance < 0.8 and concordances:
-                print(f"  WARNING: Low DE concordance - same experiment may give different conclusions in different labs")
+                print("  WARNING: Low DE concordance - same experiment may give different conclusions in different labs")
 
     return pd.DataFrame(results)
 
@@ -405,7 +401,7 @@ def analyze_batch_effect_magnitude(meta, data):
                     bio_var = matrix_complete[stype_cols].var(axis=1).mean()
                     biological_vars.append(bio_var)
 
-            biological_var = np.mean(biological_vars) if biological_vars else 0
+            _ = np.mean(biological_vars) if biological_vars else 0  # not used yet
 
             # Batch effect proportion
             batch_effect_pct = (between_batch_var / total_variance * 100) if total_variance > 0 else 0
@@ -631,7 +627,7 @@ def generate_summary_plots(meta, data, coverage_df, missing_df, corr_df, de_df,
     # Calculate composite score (lower is better)
     scores = {}
     for method in coverage_df["Method"].tolist():
-        method_lower = method.lower()
+        _ = method.lower()  # reserved for future use
 
         # Get metrics
         core_pct = coverage_df[coverage_df["Method"] == method]["Core_Pct"].values[0]

diff --git a/benchmarks/batch-quartet-multilab/scripts/plot_results.py b/benchmarks/batch-quartet-multilab/scripts/plot_results.py
@@ -3,7 +3,6 @@
 Generate visualizations for the batch effect correction benchmark.
 """
 
-import sys
 from pathlib import Path
 
 import numpy as np

diff --git a/benchmarks/batch-quartet-multilab/scripts/run_benchmark.py b/benchmarks/batch-quartet-multilab/scripts/run_benchmark.py
@@ -10,15 +10,13 @@
 with the paper's results using the Quartet multi-lab dataset.
 """
 
+import logging
 import sys
 import time
 import warnings
 from pathlib import Path
-from collections import defaultdict
-
 import numpy as np
 import pandas as pd
-from scipy import stats
 from sklearn.decomposition import PCA
 
 warnings.filterwarnings("ignore")
@@ -28,7 +26,6 @@
 MOKUME_ROOT = BENCHMARK_DIR.parent.parent
 sys.path.insert(0, str(MOKUME_ROOT))
 
-from mokume.quantification import MaxLFQQuantification, TopNQuantification
 from mokume.quantification.directlfq import is_directlfq_available, DirectLFQQuantification
 from mokume.postprocessing import is_batch_correction_available, apply_batch_correction
 
@@ -122,8 +119,6 @@ def run_maxlfq_quantification(peptide_df: pd.DataFrame) -> pd.DataFrame:
     sample_cols = [c for c in wide_df.columns if c not in ["peptide", "protein"]]
 
     # Run MaxLFQ per protein
-    maxlfq = MaxLFQQuantification(min_peptides=1)
-
     proteins = wide_df["protein"].unique()
     results = []
 
@@ -150,7 +145,8 @@ def run_maxlfq_quantification(peptide_df: pd.DataFrame) -> pd.DataFrame:
                         "run_id": sample,
                         "intensity": protein_intensities[i]
                     })
-        except Exception:
+        except Exception as exc:
+            logging.warning("MaxLFQ failed for protein %s: %s", protein, exc)
             continue
 
     return pd.DataFrame(results)

diff --git a/benchmarks/quant-hela-method-comparison/scripts/01_download_data.py b/benchmarks/quant-hela-method-comparison/scripts/01_download_data.py
@@ -5,12 +5,15 @@
 Download datasets from PRIDE ibaqpy-research FTP for benchmarking protein quantification methods.
 """
 
+import shutil
 import urllib.request
 import urllib.error
+import urllib.parse
 import sys
 from pathlib import Path
-from typing import Optional, List
+from typing import List
 
+_opener = urllib.request.build_opener()
 from config import (
     ALL_DATASETS,
     HELA_DATASETS,
@@ -46,7 +49,10 @@ def download_file(url: str, dest: Path, verbose: bool = True) -> bool:
         print(f"  Downloading: {url}")
 
     try:
-        urllib.request.urlretrieve(url, dest)
+        if urllib.parse.urlparse(url).scheme not in ("http", "https"):
+            raise ValueError(f"URL scheme not allowed: {url}")
+        with _opener.open(url) as response, open(dest, "wb") as out_file:
+            shutil.copyfileobj(response, out_file)
         if verbose:
             size_mb = dest.stat().st_size / (1024 * 1024)
             print(f"  Saved to: {dest.name} ({size_mb:.1f} MB)")
@@ -65,10 +71,12 @@ def download_file(url: str, dest: Path, verbose: bool = True) -> bool:
 def check_url_exists(url: str) -> bool:
-    """Check if a URL exists without downloading."""
+    """Return True if a HEAD request to an http(s) URL succeeds, else False."""
     try:
+        if urllib.parse.urlparse(url).scheme not in ("http", "https"):
+            raise ValueError(f"URL scheme not allowed: {url}")
         request = urllib.request.Request(url, method='HEAD')
-        urllib.request.urlopen(request, timeout=10)
+        _opener.open(request, timeout=10).close()  # only the status matters; release the connection
         return True
-    except:
+    except Exception:
         return False
 
 
@@ -77,8 +85,10 @@ def list_available_files(base_url: str) -> List[str]:
     List files available at a URL (works for directory listings).
     """
     try:
-        response = urllib.request.urlopen(base_url, timeout=30)
-        content = response.read().decode('utf-8')
+        if urllib.parse.urlparse(base_url).scheme not in ("http", "https"):
+            raise ValueError(f"URL scheme not allowed: {base_url}")
+        with _opener.open(base_url, timeout=30) as response:
+            content = response.read().decode('utf-8')
         import re
         links = re.findall(r'href="([^"]+)"', content)
         return [l for l in links if not l.startswith('/') and not l.startswith('?')]

diff --git a/benchmarks/quant-hela-method-comparison/scripts/02_prepare_peptides.py b/benchmarks/quant-hela-method-comparison/scripts/02_prepare_peptides.py
@@ -7,8 +7,7 @@
 """
 
 import sys
-from pathlib import Path
-from typing import Optional, Dict, List, Tuple
+from typing import Optional
 
 import pandas as pd
 import numpy as np
@@ -18,7 +17,6 @@
     HELA_DATASETS,
     RAW_DATA_DIR,
     PROCESSED_DIR,
-    COLUMN_MAPPING,
     QUANTMS_PARQUET_MAPPING,
     DatasetInfo,
 )
@@ -156,9 +154,6 @@ def normalize_columns(df: pd.DataFrame, data_format: str) -> pd.DataFrame:
         if "NormIntensity" not in df.columns and "Intensity" in df.columns:
             df["NormIntensity"] = df["Intensity"]
 
-    # Target columns we need
-    targets = ["ProteinName", "PeptideSequence", "SampleID", "NormIntensity"]
-
     # Build a reverse mapping: target -> list of possible source names
     # Use QUANTMS_PARQUET_MAPPING for all formats (ibaqpy-research uses this)
     source_mapping = QUANTMS_PARQUET_MAPPING
@@ -360,7 +355,6 @@ def load_raw_data(dataset: DatasetInfo) -> Optional[pd.DataFrame]:
             return None
 
     # Try alternative extensions
-    base_name = feature_path.stem.replace("_feature", "")
     for ext in [".parquet", ".csv", ".tsv"]:
         alt_path = RAW_DATA_DIR / f"{dataset.project_id}_feature{ext}"
         if alt_path.exists():

diff --git a/benchmarks/quant-hela-method-comparison/scripts/03_run_quantification.py b/benchmarks/quant-hela-method-comparison/scripts/03_run_quantification.py
@@ -12,15 +12,13 @@
 from typing import Optional, Dict, List
 
 import pandas as pd
-import numpy as np
 
 # Add parent directory to path for mokume imports
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
 from config import (
     ALL_DATASETS,
     HELA_DATASETS,
-    PROCESSED_DIR,
     PROTEIN_QUANT_DIR,
     FASTA_FILE,
     QUANTIFICATION_METHODS,
@@ -93,7 +91,7 @@ def run_ibaq(
                 result = pd.read_csv(output_file, sep="\t")
                 return result
             else:
-                print(f"  ERROR: iBAQ output file not generated")
+                print("  ERROR: iBAQ output file not generated")
                 return None
 
     except Exception as e:
@@ -336,7 +334,7 @@ def quantify_all_datasets(
 
     # Check FASTA
     if "ibaq" in methods and not FASTA_FILE.exists():
-        print(f"\nWARNING: FASTA file not found. iBAQ will be skipped.")
+        print("\nWARNING: FASTA file not found. iBAQ will be skipped.")
 
     all_results = {}
 

diff --git a/benchmarks/quant-hela-method-comparison/scripts/04_compute_metrics.py b/benchmarks/quant-hela-method-comparison/scripts/04_compute_metrics.py
@@ -11,7 +11,7 @@
 
 import sys
 from pathlib import Path
-from typing import Optional, Dict, List, Tuple
+from typing import Optional, Dict, List
 
 import pandas as pd
 import numpy as np
@@ -469,22 +469,22 @@ def save_results(results: dict, output_dir: Path = None):
     if results["cv_summary"]:
         cv_df = pd.DataFrame(results["cv_summary"])
         cv_df.to_csv(output_dir / "cv_comparison.csv", index=False)
-        print(f"  Saved: cv_comparison.csv")
+        print("  Saved: cv_comparison.csv")
 
     # Cross-experiment correlation
     for method, corr_matrix in results["cross_experiment_corr"].items():
         corr_matrix.to_csv(output_dir / f"cross_experiment_corr_{method}.csv")
-    print(f"  Saved: cross_experiment_corr_*.csv")
+    print("  Saved: cross_experiment_corr_*.csv")
 
     # Rank consistency
     for method, rank_matrix in results["rank_consistency"].items():
         rank_matrix.to_csv(output_dir / f"rank_consistency_{method}.csv")
-    print(f"  Saved: rank_consistency_*.csv")
+    print("  Saved: rank_consistency_*.csv")
 
     # Expression stability
     for method, stability in results["expression_stability"].items():
         stability.to_csv(output_dir / f"expression_stability_{method}.csv", index=False)
-    print(f"  Saved: expression_stability_*.csv")
+    print("  Saved: expression_stability_*.csv")
 
     # TMT vs LFQ correlation
     if results.get("tmt_lfq"):
@@ -508,7 +508,7 @@ def save_results(results: dict, output_dir: Path = None):
 
         if tmt_lfq_summary:
             pd.DataFrame(tmt_lfq_summary).to_csv(output_dir / "tmt_lfq_comparison.csv", index=False)
-            print(f"  Saved: tmt_lfq_comparison.csv, tmt_lfq_values_*.csv")
+            print("  Saved: tmt_lfq_comparison.csv, tmt_lfq_values_*.csv")
 
     # Summary metrics
     summary = []
@@ -538,7 +538,7 @@ def save_results(results: dict, output_dir: Path = None):
 
     summary_df = pd.DataFrame(summary)
     summary_df.to_csv(output_dir / "summary_metrics.csv", index=False)
-    print(f"  Saved: summary_metrics.csv")
+    print("  Saved: summary_metrics.csv")
 
 
 def main():

diff --git a/benchmarks/quant-hela-method-comparison/scripts/05_generate_plots.py b/benchmarks/quant-hela-method-comparison/scripts/05_generate_plots.py
@@ -13,25 +13,20 @@
 
 import sys
 from pathlib import Path
-from typing import Optional, Dict, List
-
 import pandas as pd
 import numpy as np
 
 import matplotlib.pyplot as plt
-import matplotlib.colors as mcolors
 import seaborn as sns
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
 from config import (
     ALL_DATASETS,
-    HELA_DATASETS,
     ANALYSIS_DIR,
     PLOTS_DIR,
     PROTEIN_QUANT_DIR,
     QUANTIFICATION_METHODS,
-    PROTEINS_OF_INTEREST,
 )
 
 # Set style

diff --git a/benchmarks/quant-hela-method-comparison/scripts/06_method_correlation_plots.py b/benchmarks/quant-hela-method-comparison/scripts/06_method_correlation_plots.py
@@ -8,23 +8,20 @@
 
 import sys
 from pathlib import Path
-from typing import Optional, Dict, List, Tuple
+from typing import Optional, Dict, List
 
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib.colors import LogNorm
-from scipy import stats
 from scipy.stats import pearsonr, spearmanr
 
 sys.path.insert(0, str(Path(__file__).parent))
 
 from config import (
     ALL_DATASETS,
-    HELA_DATASETS,
     PROTEIN_QUANT_DIR,
     PLOTS_DIR,
-    QUANTIFICATION_METHODS,
 )
 
 
@@ -137,7 +134,7 @@ def plot_density_scatter(
                    norm=LogNorm(), alpha=0.9)
 
     # Add colorbar
-    cb = plt.colorbar(hb, ax=ax, label='n_neighbors')
+    plt.colorbar(hb, ax=ax, label='n_neighbors')
 
     # Add diagonal line (y = x)
     lims = [
@@ -513,7 +510,7 @@ def main():
         if all_results:
             combined = pd.concat(all_results, ignore_index=True)
             combined.to_csv(output_dir / "method_correlations.csv", index=False)
-            print(f"\n  Saved: method_correlations.csv")
+            print("\n  Saved: method_correlations.csv")
 
     print("\n" + "=" * 70)
     print(f"Done! Plots saved to: {output_dir}")

diff --git a/benchmarks/quant-hela-method-comparison/scripts/config.py b/benchmarks/quant-hela-method-comparison/scripts/config.py
@@ -6,8 +6,8 @@
 """
 
 from pathlib import Path
-from dataclasses import dataclass, field
-from typing import List, Dict, Optional
+from dataclasses import dataclass
+from typing import Dict
 
 # Base directories
 BENCHMARK_DIR = Path(__file__).parent

diff --git a/benchmarks/quant-hela-method-comparison/scripts/run_benchmark.py b/benchmarks/quant-hela-method-comparison/scripts/run_benchmark.py
@@ -15,7 +15,6 @@
     python run_benchmark.py --phase 3 --stop 3  # Run only phase 3
 """
 
-import sys
 import argparse
 from pathlib import Path