Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
358 changes: 225 additions & 133 deletions mokume/io/feature.py

Large diffs are not rendered by default.

17 changes: 6 additions & 11 deletions mokume/pipeline/stages.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,18 +161,13 @@ def load_for_directlfq(self) -> pd.DataFrame:
feature.enrich_with_sdrf(self.config.input.sdrf)

# Build query with filters
where_clause = filter_builder.build_where_clause()
query = f"""
SELECT
pg_accessions,
sequence,
sample_accession,
intensity
FROM parquet_db
WHERE {where_clause}
"""
where_clause, where_params = filter_builder.build_where_clause()
query = "".join([
"SELECT pg_accessions, sequence, sample_accession, intensity",
" FROM parquet_db WHERE ", where_clause,
])

df = feature.parquet_db.sql(query).df()
df = feature.parquet_db.execute(query, where_params).df()

# Parse protein accessions
# Extract first element from pg_accessions list, then parse UniProt ID
Expand Down
9 changes: 4 additions & 5 deletions mokume/quantification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __getattr__(name):
"""Lazy import for optional dependencies."""
if name == "DirectLFQQuantification":
from mokume.quantification.directlfq import DirectLFQQuantification

return DirectLFQQuantification
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

Expand All @@ -70,7 +71,6 @@ def get_quantification_method(method: str, **kwargs) -> ProteinQuantificationMet
For MaxLFQ:
- min_peptides: int (default 2)
- n_jobs: int (default -1, all cores)
- use_variance_guided: bool (default True)

For TopN:
- n: int (default 3, can also be parsed from method name)
Expand Down Expand Up @@ -120,13 +120,13 @@ def get_quantification_method(method: str, **kwargs) -> ProteinQuantificationMet
elif method_lower == "maxlfq":
return MaxLFQQuantification(
min_peptides=kwargs.get("min_peptides", 2),
n_jobs=kwargs.get("n_jobs", -1),
use_variance_guided=kwargs.get("use_variance_guided", True),
threads=kwargs.get("n_jobs", -1),
verbose=kwargs.get("verbose", 0),
)
Comment on lines 71 to 125
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For method="maxlfq", get_quantification_method maps kwargs["n_jobs"] onto the constructor's threads argument but ignores a caller-provided threads kwarg. This is inconsistent with the public docs in this repo (e.g., the README uses threads=...) and will lead to surprising behavior: a caller who passes threads=4 without n_jobs silently gets the default of -1. Consider accepting both (prefer threads when present, fall back to n_jobs) and updating the docstring bullets/examples to match.

Copilot uses AI. Check for mistakes.

elif method_lower == "directlfq":
from mokume.quantification.directlfq import DirectLFQQuantification

return DirectLFQQuantification(
min_nonan=kwargs.get("min_nonan", 1),
num_cores=kwargs.get("num_cores", None),
Expand All @@ -139,8 +139,7 @@ def get_quantification_method(method: str, **kwargs) -> ProteinQuantificationMet
else:
available = "topN (e.g., top3, top5, top10), maxlfq, directlfq, all/sum"
raise ValueError(
f"Unknown quantification method: {method}. "
f"Available methods: {available}"
f"Unknown quantification method: {method}. Available methods: {available}"
)


Expand Down
66 changes: 43 additions & 23 deletions mokume/quantification/ratio.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,13 +301,12 @@ def load_psm_data(
Long-format PSM data with columns: ProteinName, PeptideCanonical,
PrecursorCharge, SampleID, Fraction, Intensity.
"""
# Build SQL filters
# Build SQL filters (where_clause built after is_decoy detection below)
filter_builder = SQLFilterBuilder(
remove_contaminants=remove_contaminants,
min_peptide_length=min_aa,
require_unique=True,
)
where_clause = filter_builder.build_where_clause()

# Load SDRF for fraction info
sdrf_df = pd.read_csv(sdrf_path, sep="\t")
Expand Down Expand Up @@ -335,32 +334,53 @@ def _strip_raw_ext(name: str) -> str:
]
is_new_qpx = "charge" in cols or "run_file_name" in cols

# Predefined query templates (no user-controlled data)
_QUERY_NEW_QPX = (
"SELECT pg_accessions, sequence,"
" charge as precursor_charge,"
" run_file_name as run_file_name,"
" unnest.label as label,"
" unnest.intensity as intensity"
" FROM read_parquet(?) AS parquet_raw, UNNEST(intensities) as unnest"
" WHERE unnest.intensity IS NOT NULL AND "
)
_QUERY_OLD_QPX = (
"SELECT pg_accessions, sequence,"
" precursor_charge as precursor_charge,"
" unnest.sample_accession as sample_accession,"
" reference_file_name as run_file_name,"
" unnest.channel as label,"
" unnest.intensity as intensity"
" FROM read_parquet(?) AS parquet_raw, UNNEST(intensities) as unnest"
" WHERE unnest.intensity IS NOT NULL AND "
# Set has_is_decoy before building WHERE clause so DECOY filter is optimal
if "is_decoy" in cols:
filter_builder.has_is_decoy = True
where_clause, where_params = filter_builder.build_where_clause()

# Detect if pg_accessions is list<struct{accession,...}> (new QPX)
pg_is_struct = False
if "pg_accessions" in cols:
try:
type_str = conn.execute(
"SELECT typeof(pg_accessions) FROM read_parquet(?) LIMIT 1",
[parquet_path],
).fetchone()[0].lower()
pg_is_struct = "struct" in type_str
except Exception as exc:
logger.debug("Could not detect pg_accessions type: %s", exc)
pg_col = (
"list_transform(pg_accessions, x -> x.accession) as pg_accessions"
if pg_is_struct
else "pg_accessions"
)

# Predefined query templates (no user-controlled data)
_QUERY_NEW_QPX = "".join([
"SELECT ", pg_col, ", sequence,",
" charge as precursor_charge,",
" run_file_name as run_file_name,",
" unnest.label as label,",
" unnest.intensity as intensity",
" FROM read_parquet(?) AS parquet_raw, UNNEST(intensities) as unnest",
" WHERE unnest.intensity IS NOT NULL AND ",
])
_QUERY_OLD_QPX = "".join([
"SELECT ", pg_col, ", sequence,",
" precursor_charge as precursor_charge,",
" unnest.sample_accession as sample_accession,",
" reference_file_name as run_file_name,",
" unnest.channel as label,",
" unnest.intensity as intensity",
" FROM read_parquet(?) AS parquet_raw, UNNEST(intensities) as unnest",
" WHERE unnest.intensity IS NOT NULL AND ",
])

base_query = _QUERY_NEW_QPX if is_new_qpx else _QUERY_OLD_QPX
# where_clause is built by SQLFilterBuilder from validated config only
query = "".join((base_query, where_clause))

df = conn.execute(query, [parquet_path]).df()
df = conn.execute(query, [parquet_path] + where_params).df()
finally:
conn.close()

Expand Down
Loading
Loading