Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions onsite/onsitec.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _get_metas_dict(df, row_idx: int) -> Dict[str, str]:
return {str(m["name"]): str(m["value"]) for m in items if isinstance(m, dict)}


def merge_algorithm_results(ascore_file, phosphors_file, lucxor_file, output_file):
def merge_algorithm_results(ascore_file, phosphors_file, lucxor_file, output_file, input_idparquet):
"""Merge results from all three algorithms into a single idparquet directory."""
ascore_df, _, _, _ = load_dataframes(ascore_file)
phosphors_df, _, _, _ = load_dataframes(phosphors_file)
Expand All @@ -291,7 +291,6 @@ def merge_algorithm_results(ascore_file, phosphors_file, lucxor_file, output_fil
click.echo(f" Warning: {stats['seq_mismatch']} PSM(s) skipped (seq mismatch)")

merged_rows = []
merged_pep_idx = 0

for ai, pi, li in triples:
a_metas = _get_metas_dict(ascore_df, ai)
Expand Down Expand Up @@ -369,17 +368,37 @@ def merge_algorithm_results(ascore_file, phosphors_file, lucxor_file, output_fil
"spectrum_reference": spec_ref,
"reference_file_name": ref_file,
"hit_index": hit_idx,
"peptide_identification_index": merged_pep_idx,
"peptide_identification_index": li,
"psm_metavalues": np.array(merged_metas, dtype=object),
"modifications": np.array([], dtype=object),
"protein_accessions": np.array([], dtype=object),
"additional_scores": np.array([], dtype=object),
"run_identifier": str(l_hit.get("run_identifier", "")),
})
merged_pep_idx += 1

out_df = pd.DataFrame(merged_rows)
save_dataframes(output_file, out_df, proteins_df, template_df=lucxor_df)

full_df = lucxor_df.copy()

out_df = out_df.set_index(["peptide_identification_index", "hit_index"])
full_df = full_df.set_index(["peptide_identification_index", "hit_index"])
Comment on lines 379 to +384

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Handle the zero-merge path before calling `set_index`.

When no triples survive the join, `pd.DataFrame(merged_rows)` has no columns, so the `set_index` call on Line 383 raises `KeyError` instead of writing an output in which every score is marked missing. This is reachable whenever the three tools share no common `spectrum_reference`, or when every common PSM is skipped because of a sequence mismatch.

Suggested fix
-    out_df = pd.DataFrame(merged_rows)
-
     full_df = lucxor_df.copy()
+    out_df = (
+        pd.DataFrame(merged_rows)
+        if merged_rows
+        else full_df.iloc[0:0].copy()
+    )

     out_df = out_df.set_index(["peptide_identification_index", "hit_index"])
     full_df = full_df.set_index(["peptide_identification_index", "hit_index"])
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@onsite/onsitec.py` around lines 379 - 384, The code fails when merged_rows is
empty because out_df has no columns and
set_index(["peptide_identification_index","hit_index"]) raises KeyError; fix by
detecting this zero-merge case right after out_df = pd.DataFrame(merged_rows)
and constructing an empty out_df that still has the expected columns (at minimum
the peptide_identification_index and hit_index columns, e.g. a zero-row slice of
lucxor_df) before calling set_index. Keep this fallback empty rather than
populating it from lucxor_df's index pairs, so that every row of full_df is
still selected by missing_mask; ensure you reference merged_rows, out_df,
full_df, lucxor_df and use
set_index(["peptide_identification_index","hit_index"]) only after this fallback
is in place so the downstream merge/writes produce rows with missing scores
instead of crashing.


for col in out_df.columns:
if col in full_df.columns and out_df[col].dtype != full_df[col].dtype:
try:
out_df[col] = out_df[col].astype(full_df[col].dtype)
except Exception:
pass

full_df.update(out_df)

missing_mask = ~full_df.index.isin(out_df.index)
full_df.loc[missing_mask, "score"] = np.nan
full_df.loc[missing_mask, "score_type"] = "onsite_combined_score"

out_df = full_df.reset_index()

save_dataframes(output_file, out_df, proteins_df, template_df=lucxor_df, source_idparquet=input_idparquet)
click.echo(f"Successfully merged {stats['merged']} peptide identifications")
click.echo("Each peptide contains scores from all three algorithms")

Expand Down Expand Up @@ -447,7 +466,7 @@ def run_all_algorithms_from_single_cli(
if exit_code != 0:
raise RuntimeError(f"LucXor failed with exit code {exit_code}")

merge_algorithm_results(ascore_out, phosphors_out, lucxor_out, out_file)
merge_algorithm_results(ascore_out, phosphors_out, lucxor_out, out_file, id_file)

elapsed = time.time() - start_time
click.echo(f"All algorithms completed in {elapsed:.2f}s")
Expand Down