Skip to content

Commit 38f62a5

Browse files
authored
Merge pull request #1376 from microbiomedata/1337-ingest-pfam-entries
Ingest PFAM entries and clans
2 parents 7988d1e + 2cffe6b commit 38f62a5

File tree

4 files changed

+80
-1
lines changed

4 files changed

+80
-1
lines changed

nmdc_server/ingest/kegg.py

+33
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
KoTermText,
1313
KoTermToModule,
1414
KoTermToPathway,
15+
PfamEntryToClan,
1516
)
1617

1718
ORTHOLOGY_URL = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00001&format=json"
@@ -26,11 +27,14 @@
2627
# Note that we're using the same file for both COG terms and pathways
2728
COG_PATHWAY_DEFS = COG_TERM_DEFS = "/data/ingest/cog/cog-20.def.tab"
2829

30+
# Note that we're using the same file for both Pfam entries and clans.
# NOTE(review): both names are bound to the same string, so when both are used
# as keys of one dict literal only the later entry survives - confirm that the
# per-file configuration mapping keyed by these constants behaves as intended.
PFAM_TERM_DEFS = PFAM_CLAN_DEFS = "/data/ingest/pfam/Pfam-A.clans.tsv"
31+
2932

3033
def load(db: Session) -> None:
    """Run the KO search, KO module, KO pathway and Pfam clan ingests, in that order."""
    ingest_steps = (
        ingest_ko_search,
        ingest_ko_module_map,
        ingest_ko_pathway_map,
        ingest_pfam_clan_map,
    )
    for ingest_step in ingest_steps:
        ingest_step(db)
3438

3539

3640
def ingest_ko_search(db: Session) -> None:
@@ -76,6 +80,13 @@ def get_search_records_from_delimeted_file(
7680
"pubmed_id",
7781
"pdb_id",
7882
]
83+
# Field names handed to csv.DictReader when reading the Pfam clans file;
# positions 0 and 1 are the entry and clan accessions used for the clan mapping.
pfam_headers = ["pfam_accession", "clan_accession", "clan_name", "pfam_short_name", "pfam_name"]
7990

8091
cog_function_headers = ["function_code", "sequence", "definition"]
8192

@@ -100,6 +111,16 @@ def get_search_records_from_delimeted_file(
100111
"term_key": cog_def_headers[0],
101112
"text_key": cog_def_headers[2],
102113
},
114+
PFAM_TERM_DEFS: {
115+
"fieldnames": pfam_headers,
116+
"term_key": "pfam_accession",
117+
"text_key": "pfam_name",
118+
},
119+
PFAM_CLAN_DEFS: {
120+
"fieldnames": pfam_headers,
121+
"term_key": "clan_accession",
122+
"text_key": "clan_name",
123+
},
103124
}
104125

105126

@@ -179,3 +200,15 @@ def ingest_ko_pathway_map(db: Session) -> None:
179200
[CogTermToPathway(term=mapping[0], pathway=mapping[1]) for mapping in mappings]
180201
)
181202
db.commit()
203+
204+
205+
def ingest_pfam_clan_map(db: Session) -> None:
    """Ingest a mapping of Pfam entries to clans.

    Empties the ``PfamEntryToClan`` table, then reloads it with the distinct
    (entry accession, clan accession) pairs read from ``PFAM_CLAN_DEFS``.

    The file is parsed as tab-separated text whose columns are named by
    ``pfam_headers``. Pfam entries that are not assigned to a clan have an
    empty clan column in Pfam-A.clans.tsv; those rows are skipped so the table
    holds only real memberships rather than mappings to an empty-string clan.

    :param db: session used to truncate the table, insert the rows and commit
    """
    entry_key, clan_key = pfam_headers[0], pfam_headers[1]
    db.execute(f"truncate table {PfamEntryToClan.__tablename__}")
    # newline="" is what the csv module asks for when it is handed a file object.
    with open(PFAM_CLAN_DEFS, newline="") as fd:
        reader = csv.DictReader(fd, fieldnames=pfam_headers, delimiter="\t")
        # A set collapses repeated pairs so each mapping is inserted once.
        # Rows with a blank (or missing) entry or clan carry no membership.
        mappings = {
            (row[entry_key], row[clan_key])
            for row in reader
            if row[entry_key] and row[clan_key]
        }
    db.bulk_save_objects([PfamEntryToClan(entry=entry, clan=clan) for entry, clan in mappings])
    db.commit()

nmdc_server/ingest/pipeline.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313

1414
DataObjectList = List[str]
1515
LoadObjectReturn = models.PipelineStep
16-
gene_regex = re.compile(r"^(KEGG\.ORTHOLOGY|COG)")
16+
# Anchored at the start of the string: matches the KEGG.ORTHOLOGY, COG and PFAM prefixes.
gene_regex = re.compile(r"^(KEGG\.ORTHOLOGY|COG|PFAM)")
1717

1818

1919
class LoadObject(Protocol):
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
"""Add PFAM mappings
2+
3+
Revision ID: 5fb9910ca8e6
4+
Revises: ff4e651c3007
5+
Create Date: 2024-08-30 21:12:14.993046
6+
7+
"""
8+
9+
from typing import Optional
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = "5fb9910ca8e6"
16+
down_revision: Optional[str] = "ff4e651c3007"
17+
branch_labels: Optional[str] = None
18+
depends_on: Optional[str] = None
19+
20+
21+
def upgrade():
    """Create the ``pfam_entry_to_clan`` table and the index on its ``clan`` column."""
    # ### commands auto generated by Alembic - please adjust! ###
    table = "pfam_entry_to_clan"
    entry_column = sa.Column("entry", sa.String(), nullable=False)
    clan_column = sa.Column("clan", sa.String(), nullable=False)
    primary_key = sa.PrimaryKeyConstraint("entry", "clan", name=op.f("pk_pfam_entry_to_clan"))
    op.create_table(table, entry_column, clan_column, primary_key)

    clan_index = op.f("ix_pfam_entry_to_clan_clan")
    op.create_index(clan_index, table, ["clan"], unique=False)
    # ### end Alembic commands ###
33+
34+
35+
def downgrade():
    """Drop the ``clan`` index and then the ``pfam_entry_to_clan`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    table = "pfam_entry_to_clan"
    clan_index = op.f("ix_pfam_entry_to_clan_clan")
    op.drop_index(clan_index, table_name=table)
    op.drop_table(table)
    # ### end Alembic commands ###

nmdc_server/models.py

+7
Original file line number | Diff line number | Diff line change
@@ -163,6 +163,13 @@ class CogTermToFunction(Base):
163163
function = Column(String, nullable=False, primary_key=True, index=True)
164164

165165

166+
class PfamEntryToClan(Base):
    """Association of a Pfam entry with a clan, one row per (entry, clan) pair."""

    __tablename__ = "pfam_entry_to_clan"

    # Both columns together form the composite primary key; clan is also indexed.
    entry = Column(String, primary_key=True, nullable=False)
    clan = Column(String, primary_key=True, nullable=False, index=True)
171+
172+
166173
class KoTermText(Base):
167174
__tablename__ = "ko_term_text"
168175

0 commit comments

Comments
 (0)