Skip to content

Commit a175d08

Browse files
committed
update markers to clarify id
1 parent 19e37c7 commit a175d08

File tree

10 files changed

+68
-26
lines changed

10 files changed

+68
-26
lines changed

grassp/datasets/external/atha_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id lilley
1+
id lilley
22
AT1G01620 PM
33
AT1G01790 Envelope
44
AT1G02280 Envelope

grassp/datasets/external/dmel_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id lilley
1+
id lilley
22
A1Z6P3 Cytoskeleton
33
A1Z8U0 Nucleus
44
A1Z920 PM

grassp/datasets/external/ggal_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id lilley
1+
id lilley
22
IPI00570752.1 ER
33
IPI00571172.2 ER
44
IPI00571288.3 Mitochondrion

grassp/datasets/external/hsap_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id lilley christopher geladaki itzhak villaneuva hein2024_component hein2024_gt_component
1+
id lilley christopher geladaki itzhak villaneuva hein2024_component hein2024_gt_component
22
A0A096LP01 Mitochondrion
33
A0A096LP55 Mitochondrion
44
A0A0U1RRE5 Cytosol

grassp/datasets/external/mmus_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id lilley christoforou
1+
id lilley christoforou
22
A2AJ15 ER ERGIC
33
A2ATU0 Mitochondrion Mitochondrion
44
A2TJV2 PM

grassp/datasets/external/scer_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id lilley
1+
id lilley
22
D6VTK4 PM
33
O13563 Proteasome
44
P00128 Mitochondrion - IM

grassp/datasets/external/toxo_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id barylyuk
1+
id barylyuk
22
TGME49_200250 Micronemes
33
TGME49_201780 Micronemes
44
TGME49_201840 Endomembrane Vesicles

grassp/datasets/external/tryp_markers.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
uniprot_id moloney
1+
id moloney
22
Tb11.v5.0162 Proteasome
33
Tb11.v5.0196 Proteasome
44
Tb11.v5.0200 Proteasome

grassp/datasets/marker_curation/fetch_proloc_markers.R

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,8 @@ read_markers <- function(info) {
107107
obj <- readRDS(info$path)
108108
df <- as.data.frame(obj, stringsAsFactors = FALSE)
109109

110-
# The Uniprot IDs are in the row names
111-
uniprot_ids <- rownames(df)
110+
# The protein IDs are in the row names
111+
protein_ids <- rownames(df)
112112

113113
# The compartments are in the 'markers' column
114114
compartments <- df$markers
@@ -118,17 +118,17 @@ read_markers <- function(info) {
118118

119119
# Create a clean data frame
120120
data <- data.frame(
121-
uniprot_id = uniprot_ids,
121+
id = protein_ids,
122122
compartment = compartments,
123123
stringsAsFactors = FALSE
124124
)
125125

126-
# Remove rows with missing or empty Uniprot IDs
127-
data <- data[!is.na(data$uniprot_id) & nzchar(data$uniprot_id), ]
126+
# Remove rows with missing or empty protein IDs
127+
data <- data[!is.na(data$id) & nzchar(data$id), ]
128128

129-
# Aggregate by Uniprot ID in case there are duplicates
130-
data <- aggregate(data$compartment, by = list(uniprot_id = data$uniprot_id), FUN = collapse_values)
131-
names(data) <- c("uniprot_id", info$author)
129+
# Aggregate by protein ID in case there are duplicates
130+
data <- aggregate(data$compartment, by = list(id = data$id), FUN = collapse_values)
131+
names(data) <- c("id", info$author)
132132

133133
data
134134
}
@@ -152,15 +152,15 @@ for (species in names(species_groups)) {
152152
data_list <- lapply(infos, read_markers)
153153

154154
# Merge all data frames for this species
155-
merged <- Reduce(function(x, y) merge(x, y, by = "uniprot_id", all = TRUE), data_list)
155+
merged <- Reduce(function(x, y) merge(x, y, by = "id", all = TRUE), data_list)
156156

157-
# Order columns: uniprot_id first, then authors in order
158-
ordered_cols <- c("uniprot_id", authors)
157+
# Order columns: id first, then authors in order
158+
ordered_cols <- c("id", authors)
159159
ordered_cols <- ordered_cols[ordered_cols %in% names(merged)]
160160
merged <- merged[, ordered_cols, drop = FALSE]
161161

162-
# Sort by uniprot_id
163-
merged <- merged[order(merged$uniprot_id), ]
162+
# Sort by id
163+
merged <- merged[order(merged$id), ]
164164

165165
out_file <- file.path(out_dir, paste0("marker2_", species, "_merged.csv"))
166166
write.csv(merged, out_file, row.names = FALSE, na = "")

grassp/preprocessing/annotation.py

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -503,8 +503,9 @@ def add_markers(
503503
) -> None:
504504
"""Annotate proteins with marker annotations from literature.
505505
506-
Matches UniProt IDs in ``.obs`` against a collection of marker annotations
507-
from different authors.
506+
Matches protein IDs in ``.obs`` against a collection of marker annotations
507+
from different authors. Note that marker IDs are species-specific and may
508+
not be UniProt accessions (see table below).
508509
509510
Marker annotations are sourced from:
510511
@@ -521,6 +522,47 @@ def add_markers(
521522
- Obtained from pRoloc. See: https://bioconductor.org/packages/pRoloc/
522523
and https://lgatto.github.io/pRoloc/reference/pRolocmarkers.html
523524
525+
**Protein ID types by species:**
526+
527+
.. list-table::
528+
:header-rows: 1
529+
530+
* - Species Code
531+
- Species
532+
- ID Type
533+
- Example ID
534+
* - atha
535+
- *Arabidopsis thaliana*
536+
- TAIR/Araport
537+
- AT1G01620
538+
* - dmel
539+
- *Drosophila melanogaster*
540+
- UniProt
541+
- A1Z6P3
542+
* - ggal
543+
- *Gallus gallus* (Chicken)
544+
- IPI
545+
- IPI00570752.1
546+
* - hsap
547+
- *Homo sapiens* (Human)
548+
- UniProt
549+
- A0AVT1
550+
* - mmus
551+
- *Mus musculus* (Mouse)
552+
- UniProt
553+
- A2AJ15
554+
* - scer
555+
- *Saccharomyces cerevisiae* (Yeast)
556+
- UniProt
557+
- D6VTK4
558+
* - toxo
559+
- *Toxoplasma gondii*
560+
- ToxoDB Gene IDs
561+
- TGME49_200250
562+
* - tryp
563+
- *Trypanosoma brucei*
564+
- TriTrypDB Gene IDs
565+
- Tb11.v5.0162
524566
525567
This function modifies the AnnData object in-place by adding marker
526568
annotation columns to ``.obs``.
@@ -538,7 +580,7 @@ def add_markers(
538580
includes all available author columns. Can be a single author name
539581
(string) or a list of author names.
540582
uniprot_id_column
541-
Column in ``.obs`` containing UniProt IDs. If None, uses ``.obs_names``.
583+
Column in ``.obs`` containing protein IDs. The required ID type depends on the species (see the table of protein ID types above). If None, uses ``.obs_names``.
542584
add_colors
543585
If True, automatically add color mappings to ``.uns`` for each marker
544586
column, following scanpy plotting conventions. Colors are stored as
@@ -584,9 +626,9 @@ def add_markers(
584626
)
585627

586628
# Read marker file
587-
markers_df = pd.read_csv(marker_file, sep="\t", dtype=str, index_col="uniprot_id")
629+
markers_df = pd.read_csv(marker_file, sep="\t", dtype=str, index_col="id")
588630

589-
# Get all author columns (everything except uniprot_id)
631+
# Get all author columns (everything except id, which is the index)
590632
all_author_columns = [col for col in markers_df.columns]
591633

592634
# Determine which columns to include

0 commit comments

Comments
 (0)