Rename marker ID column from `uniprot_id` to `id` and document per-species ID types

mffrank · mffrank · commit a175d08fdbcd · 2026-01-30T11:02:28.000-08:00
diff --git a/grassp/datasets/external/atha_markers.tsv b/grassp/datasets/external/atha_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	lilley
+id	lilley
 AT1G01620	PM
 AT1G01790	Envelope
 AT1G02280	Envelope
diff --git a/grassp/datasets/external/dmel_markers.tsv b/grassp/datasets/external/dmel_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	lilley
+id	lilley
 A1Z6P3	Cytoskeleton
 A1Z8U0	Nucleus
 A1Z920	PM
diff --git a/grassp/datasets/external/ggal_markers.tsv b/grassp/datasets/external/ggal_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	lilley
+id	lilley
 IPI00570752.1	ER
 IPI00571172.2	ER
 IPI00571288.3	Mitochondrion
diff --git a/grassp/datasets/external/hsap_markers.tsv b/grassp/datasets/external/hsap_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	lilley	christopher	geladaki	itzhak	villaneuva	hein2024_component	hein2024_gt_component
+id	lilley	christopher	geladaki	itzhak	villaneuva	hein2024_component	hein2024_gt_component
 A0A096LP01						Mitochondrion
 A0A096LP55						Mitochondrion
 A0A0U1RRE5						Cytosol
diff --git a/grassp/datasets/external/mmus_markers.tsv b/grassp/datasets/external/mmus_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	lilley	christoforou
+id	lilley	christoforou
 A2AJ15	ER	ERGIC
 A2ATU0	Mitochondrion	Mitochondrion
 A2TJV2	PM
diff --git a/grassp/datasets/external/scer_markers.tsv b/grassp/datasets/external/scer_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	lilley
+id	lilley
 D6VTK4	PM
 O13563	Proteasome
 P00128	Mitochondrion - IM
diff --git a/grassp/datasets/external/toxo_markers.tsv b/grassp/datasets/external/toxo_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	barylyuk
+id	barylyuk
 TGME49_200250	Micronemes
 TGME49_201780	Micronemes
 TGME49_201840	Endomembrane Vesicles
diff --git a/grassp/datasets/external/tryp_markers.tsv b/grassp/datasets/external/tryp_markers.tsv
@@ -1,4 +1,4 @@
-uniprot_id	moloney
+id	moloney
 Tb11.v5.0162	Proteasome
 Tb11.v5.0196	Proteasome
 Tb11.v5.0200	Proteasome
diff --git a/grassp/datasets/marker_curation/fetch_proloc_markers.R b/grassp/datasets/marker_curation/fetch_proloc_markers.R
@@ -107,8 +107,8 @@ read_markers <- function(info) {
   obj <- readRDS(info$path)
   df <- as.data.frame(obj, stringsAsFactors = FALSE)
 
-  # The Uniprot IDs are in the row names
-  uniprot_ids <- rownames(df)
+  # The protein IDs are in the row names
+  protein_ids <- rownames(df)
 
   # The compartments are in the 'markers' column
   compartments <- df$markers
@@ -118,17 +118,17 @@ read_markers <- function(info) {
 
   # Create a clean data frame
   data <- data.frame(
-    uniprot_id = uniprot_ids,
+    id = protein_ids,
     compartment = compartments,
     stringsAsFactors = FALSE
   )
 
-  # Remove rows with missing or empty Uniprot IDs
-  data <- data[!is.na(data$uniprot_id) & nzchar(data$uniprot_id), ]
+  # Remove rows with missing or empty protein IDs
+  data <- data[!is.na(data$id) & nzchar(data$id), ]
 
-  # Aggregate by Uniprot ID in case there are duplicates
-  data <- aggregate(data$compartment, by = list(uniprot_id = data$uniprot_id), FUN = collapse_values)
-  names(data) <- c("uniprot_id", info$author)
+  # Aggregate by protein ID in case there are duplicates
+  data <- aggregate(data$compartment, by = list(id = data$id), FUN = collapse_values)
+  names(data) <- c("id", info$author)
 
   data
 }
@@ -152,15 +152,15 @@ for (species in names(species_groups)) {
   data_list <- lapply(infos, read_markers)
 
   # Merge all data frames for this species
-  merged <- Reduce(function(x, y) merge(x, y, by = "uniprot_id", all = TRUE), data_list)
+  merged <- Reduce(function(x, y) merge(x, y, by = "id", all = TRUE), data_list)
 
-  # Order columns: uniprot_id first, then authors in order
-  ordered_cols <- c("uniprot_id", authors)
+  # Order columns: id first, then authors in order
+  ordered_cols <- c("id", authors)
   ordered_cols <- ordered_cols[ordered_cols %in% names(merged)]
   merged <- merged[, ordered_cols, drop = FALSE]
 
-  # Sort by uniprot_id
-  merged <- merged[order(merged$uniprot_id), ]
+  # Sort by id
+  merged <- merged[order(merged$id), ]
 
   out_file <- file.path(out_dir, paste0("marker2_", species, "_merged.csv"))
   write.csv(merged, out_file, row.names = FALSE, na = "")
diff --git a/grassp/preprocessing/annotation.py b/grassp/preprocessing/annotation.py
@@ -503,8 +503,9 @@ def add_markers(
 ) -> None:
     """Annotate proteins with marker annotations from literature.
 
-    Matches UniProt IDs in ``.obs`` against a collection of marker annotations
-    from different authors.
+    Matches protein IDs in ``.obs`` against a collection of marker annotations
+    from different authors. Note that marker IDs are species-specific and may
+    not be UniProt accessions (see table below).
 
     Marker annotations are sourced from:
 
@@ -521,6 +522,47 @@ def add_markers(
           - Obtained from pRoloc. See: https://bioconductor.org/packages/pRoloc/
             and https://lgatto.github.io/pRoloc/reference/pRolocmarkers.html
 
+    **Protein ID types by species:**
+
+    .. list-table::
+        :header-rows: 1
+
+        * - Species Code
+          - Common Name
+          - ID Type
+          - Example ID
+        * - atha
+          - *Arabidopsis thaliana*
+          - TAIR/Araport
+          - AT1G01620
+        * - dmel
+          - *Drosophila melanogaster*
+          - UniProt
+          - A1Z6P3
+        * - ggal
+          - *Gallus gallus* (Chicken)
+          - IPI
+          - IPI00570752.1
+        * - hsap
+          - *Homo sapiens* (Human)
+          - UniProt
+          - A0AVT1
+        * - mmus
+          - *Mus musculus* (Mouse)
+          - UniProt
+          - A2AJ15
+        * - scer
+          - *Saccharomyces cerevisiae* (Yeast)
+          - UniProt
+          - D6VTK4
+        * - toxo
+          - *Toxoplasma gondii*
+          - ToxoDB Gene IDs
+          - TGME49_200250
+        * - tryp
+          - *Trypanosoma brucei*
+          - TriTrypDB Gene IDs
+          - Tb11.v5.0162
 
     This function modifies the AnnData object in-place by adding marker
     annotation columns to ``.obs``.
@@ -538,7 +580,7 @@ def add_markers(
         includes all available author columns. Can be a single author name
         (string) or a list of author names.
     uniprot_id_column
-        Column in ``.obs`` containing UniProt IDs. If None, uses ``.obs_names``.
+        Column in ``.obs`` containing protein IDs of the type listed for the species in the ID type table above (not necessarily UniProt accessions, despite the parameter name). If None, uses ``.obs_names``.
     add_colors
         If True, automatically add color mappings to ``.uns`` for each marker
         column, following scanpy plotting conventions. Colors are stored as
@@ -584,9 +626,9 @@ def add_markers(
         )
 
     # Read marker file
-    markers_df = pd.read_csv(marker_file, sep="\t", dtype=str, index_col="uniprot_id")
+    markers_df = pd.read_csv(marker_file, sep="\t", dtype=str, index_col="id")
 
-    # Get all author columns (everything except uniprot_id)
+    # Get all author columns (everything except id, which is the index)
     all_author_columns = [col for col in markers_df.columns]
 
     # Determine which columns to include

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-uniprot_id lilley`
	`1`	`+id lilley`
`2`	`2`	`AT1G01620 PM`
`3`	`3`	`AT1G01790 Envelope`
`4`	`4`	`AT1G02280 Envelope`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-uniprot_id lilley christopher geladaki itzhak villaneuva hein2024_component hein2024_gt_component`
	`1`	`+id lilley christopher geladaki itzhak villaneuva hein2024_component hein2024_gt_component`
`2`	`2`	`A0A096LP01 Mitochondrion`
`3`	`3`	`A0A096LP55 Mitochondrion`
`4`	`4`	`A0A0U1RRE5 Cytosol`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-uniprot_id lilley christoforou`
	`1`	`+id lilley christoforou`
`2`	`2`	`A2AJ15 ER ERGIC`
`3`	`3`	`A2ATU0 Mitochondrion Mitochondrion`
`4`	`4`	`A2TJV2 PM`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-uniprot_id barylyuk`
	`1`	`+id barylyuk`
`2`	`2`	`TGME49_200250 Micronemes`
`3`	`3`	`TGME49_201780 Micronemes`
`4`	`4`	`TGME49_201840 Endomembrane Vesicles`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-uniprot_id moloney`
	`1`	`+id moloney`
`2`	`2`	`Tb11.v5.0162 Proteasome`
`3`	`3`	`Tb11.v5.0196 Proteasome`
`4`	`4`	`Tb11.v5.0200 Proteasome`