Skip to content

Commit ca91a61

Browse files
authored
Merge pull request #1313 from microbiomedata/1307-metat-model-updates
Update ingest and data model to account for Metatranscriptomics
2 parents 0fe0163 + a809361 commit ca91a61

11 files changed

+510
-8
lines changed

nmdc_server/crud.py

+6
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ def get_database_summary(db: Session) -> schemas.DatabaseSummary:
5151
reads_qc=aggregations.get_table_summary(db, models.ReadsQC),
5252
metagenome_assembly=aggregations.get_table_summary(db, models.MetagenomeAssembly),
5353
metagenome_annotation=aggregations.get_table_summary(db, models.MetagenomeAnnotation),
54+
metatranscriptome_assembly=aggregations.get_table_summary(
55+
db, models.MetatranscriptomeAssembly
56+
),
57+
metatranscriptome_annotation=aggregations.get_table_summary(
58+
db, models.MetatranscriptomeAnnotation
59+
),
5460
metaproteomic_analysis=aggregations.get_table_summary(db, models.MetaproteomicAnalysis),
5561
mags_analysis=aggregations.get_table_summary(db, models.MAGsAnalysis),
5662
read_based_analysis=aggregations.get_table_summary(db, models.ReadBasedAnalysis),

nmdc_server/data_object_filters.py

+6
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class WorkflowActivityTypeEnum(Enum):
3131
reads_qc = "nmdc:ReadQCAnalysisActivity"
3232
metagenome_assembly = "nmdc:MetagenomeAssembly"
3333
metagenome_annotation = "nmdc:MetagenomeAnnotation" # TODO name out of date, fix
34+
metatranscriptome_assembly = "nmdc:MetatranscriptomeAssembly"
35+
metatranscriptome_annotation = "nmdc:MetatranscriptomeAnnotation" # TODO name out of date, fix
3436
metaproteomic_analysis = "nmdc:MetaProteomicAnalysis"
3537
mags_analysis = "nmdc:MAGsAnalysisActivity"
3638
read_based_analysis = "nmdc:ReadbasedAnalysis" # TODO name out of date, fix
@@ -52,6 +54,8 @@ def output_association(self):
5254
WorkflowActivityTypeEnum.reads_qc: models.ReadsQC,
5355
WorkflowActivityTypeEnum.metagenome_assembly: models.MetagenomeAssembly,
5456
WorkflowActivityTypeEnum.metagenome_annotation: models.MetagenomeAnnotation,
57+
WorkflowActivityTypeEnum.metatranscriptome_assembly: models.MetatranscriptomeAssembly,
58+
WorkflowActivityTypeEnum.metatranscriptome_annotation: models.MetatranscriptomeAnnotation,
5559
WorkflowActivityTypeEnum.metaproteomic_analysis: models.MetaproteomicAnalysis,
5660
WorkflowActivityTypeEnum.mags_analysis: models.MAGsAnalysis,
5761
WorkflowActivityTypeEnum.read_based_analysis: models.ReadBasedAnalysis,
@@ -67,6 +71,8 @@ def output_association(self):
6771
WorkflowActivityTypeEnum.reads_qc: models.reads_qc_output_association,
6872
WorkflowActivityTypeEnum.metagenome_assembly: models.metagenome_assembly_output_association,
6973
WorkflowActivityTypeEnum.metagenome_annotation: models.metagenome_annotation_output_association,
74+
WorkflowActivityTypeEnum.metatranscriptome_assembly: models.metatranscriptome_assembly_output_association, # noqa: E501
75+
WorkflowActivityTypeEnum.metatranscriptome_annotation: models.metatranscriptome_annotation_output_association, # noqa: E501
7076
_mpa: models.metaproteomic_analysis_output_association,
7177
WorkflowActivityTypeEnum.mags_analysis: models.mags_analysis_output_association,
7278
WorkflowActivityTypeEnum.read_based_analysis: models.read_based_analysis_output_association,

nmdc_server/filters.py

+35
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
EnvMediumAncestor,
3434
EnvMediumTerm,
3535
MetaPGeneFunction,
36+
MetaTGeneFunction,
3637
Table,
3738
workflow_execution_tables,
3839
)
@@ -327,6 +328,40 @@ def join_self(self, query: Query, parent: Table) -> Query:
327328
return query
328329

329330

331+
class MetaTGeneFunctionFilter(OmicsProcessingFilter):
332+
table = Table.metat_gene_function
333+
334+
def join(self, target_table: Table, query: Query) -> Query:
335+
if target_table == Table.metatranscriptome_annotation:
336+
return query.join(
337+
models.MetaTGeneFunctionAggregation,
338+
models.MetaTGeneFunctionAggregation.metatranscriptome_annotation_id
339+
== models.MetatranscriptomeAnnotation.id,
340+
).join(
341+
MetaTGeneFunction,
342+
MetaTGeneFunction.id == models.MetaTGeneFunctionAggregation.gene_function_id,
343+
)
344+
query = super().join(target_table, query)
345+
return (
346+
query.join(
347+
models.MetatranscriptomeAnnotation,
348+
models.MetatranscriptomeAnnotation.omics_processing_id == models.OmicsProcessing.id,
349+
)
350+
.join(
351+
models.MetaTGeneFunctionAggregation,
352+
models.MetaTGeneFunctionAggregation.metatranscriptome_annotation_id
353+
== models.MetatranscriptomeAnnotation.id,
354+
)
355+
.join(
356+
MetaTGeneFunction,
357+
MetaTGeneFunction.id == models.MetaTGeneFunctionAggregation.gene_function_id,
358+
)
359+
)
360+
361+
def join_self(self, query: Query, parent: Table) -> Query:
362+
return query
363+
364+
330365
def _get_all_subclasses(cls: Type[BaseFilter]) -> List[Type[BaseFilter]]:
331366
all_subclasses: List[Type[BaseFilter]] = []
332367
for subclass in cls.__subclasses__():

nmdc_server/ingest/all.py

+28-4
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,22 @@ def load(db: Session, function_limit=None, skip_annotation=False):
120120
)
121121
db.commit()
122122

123-
logger.info("Loading metatranscriptome activities...")
123+
logger.info("Loading metatranscriptome expression analyses...")
124124
pipeline.load(
125125
db,
126-
mongodb["metatranscriptome_activity_set"].find(),
126+
mongodb["metatranscriptome_expression_analysis_set"].find(),
127127
pipeline.load_metatranscriptome,
128128
WorkflowActivityTypeEnum.metatranscriptome.value,
129129
)
130130

131+
logger.info("Loading metatranscriptome assemblies...")
132+
pipeline.load(
133+
db,
134+
mongodb["metatranscriptome_assembly_set"].find(),
135+
pipeline.load_mt_assembly,
136+
WorkflowActivityTypeEnum.metatranscriptome_assembly.value,
137+
)
138+
131139
logger.info("Loading NOM analysis...")
132140
pipeline.load(
133141
db,
@@ -170,13 +178,29 @@ def load(db: Session, function_limit=None, skip_annotation=False):
170178
annotations=mongodb["functional_annotation_agg"],
171179
function_limit=function_limit,
172180
)
181+
173182
except Exception:
174-
logger.exception("Failed during metag ingest.")
183+
logger.exception("Failed during annotation ingest.")
175184
finally:
176185
db.commit()
177186

178187
else:
179-
logger.info("Skipping annotation ingest")
188+
logger.info("Skipping mg annotation ingest")
189+
190+
try:
191+
logger.info("Loading metatranscriptome annotation...")
192+
pipeline.load(
193+
db,
194+
mongodb["metatranscriptome_annotation_set"].find(),
195+
pipeline.load_mt_annotation,
196+
WorkflowActivityTypeEnum.metatranscriptome_annotation.value,
197+
annotations=mongodb["functional_annotation_agg"],
198+
function_limit=function_limit,
199+
)
200+
except Exception:
201+
logger.exception("Failed during metatranscriptome annotation ingest.")
202+
finally:
203+
db.commit()
180204

181205
logger.info("Loading read qc...")
182206
pipeline.load(

nmdc_server/ingest/pipeline.py

+55
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,55 @@ def load_mp_analysis(db: Session, obj: Dict[str, Any], **kwargs) -> LoadObjectRe
130130
return pipeline
131131

132132

133+
def load_mt_annotation(db: Session, obj: Dict[str, Any], **kwargs) -> LoadObjectReturn:
134+
# Ingest the MetatranscriptomeAnnotation record
135+
pipeline = cast(models.MetatranscriptomeAnnotation, load_mt_annotation_base(db, obj, **kwargs))
136+
137+
annotations: Collection = kwargs["annotations"]
138+
139+
# Query gene function annotations from mongo and build the appropriate objects
140+
query = annotations.find(
141+
{
142+
"metagenome_annotation_id": pipeline.id,
143+
"gene_function_id": {
144+
"$regex": ko_regex,
145+
},
146+
},
147+
no_cursor_timeout=True,
148+
projection={
149+
"_id": False,
150+
"metatranscriptome_annotation_id": True,
151+
"count": True,
152+
"gene_function_id": True,
153+
},
154+
)
155+
if kwargs.get("function_limit"):
156+
query = query.limit(kwargs["function_limit"])
157+
158+
gene_functions: Set[str] = set()
159+
gene_function_aggregations: List[models.MetaTGeneFunctionAggregation] = []
160+
for annotation in query:
161+
function_id = annotation["gene_function_id"]
162+
gene_functions.add(function_id)
163+
gene_function_aggregations.append(
164+
models.MetaTGeneFunctionAggregation(
165+
metatranscriptome_annotation_id=pipeline.id,
166+
gene_function_id=function_id,
167+
count=annotation["count"],
168+
)
169+
)
170+
# Save both newly encountered gene functions and the gene function aggregations
171+
if gene_function_aggregations:
172+
db.execute(
173+
insert(models.GeneFunction)
174+
.on_conflict_do_nothing()
175+
.values([(gf,) for gf in gene_functions])
176+
)
177+
db.bulk_save_objects(gene_function_aggregations)
178+
179+
return pipeline
180+
181+
133182
# This is a loader for a generic workflow type that doesn't need any
134183
# additional processing.
135184
def generate_pipeline_loader(schema, model) -> LoadObject:
@@ -161,6 +210,12 @@ def loader(db: Session, obj: Dict[str, Any], **kwargs: Any) -> LoadObjectReturn:
161210
load_metatranscriptome = generate_pipeline_loader(
162211
schemas.MetatranscriptomeBase, models.Metatranscriptome
163212
)
213+
load_mt_assembly = generate_pipeline_loader(
214+
schemas.MetatranscriptomeAssemblyBase, models.MetatranscriptomeAssembly
215+
)
216+
load_mt_annotation_base = generate_pipeline_loader(
217+
schemas.MetatranscriptomeAnnotationBase, models.MetatranscriptomeAnnotation
218+
)
164219

165220

166221
# This is a generic function for load workflow execution objects. Some workflow types require
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""Add table for metaT gene functions
2+
3+
Revision ID: c0b36f8dc4b8
4+
Revises: d3d563e01a74
5+
Create Date: 2024-07-25 14:24:16.657561
6+
7+
"""
8+
9+
from typing import Optional
10+
11+
import sqlalchemy as sa
12+
from alembic import op
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = "c0b36f8dc4b8"
16+
down_revision: Optional[str] = "d3d563e01a74"
17+
branch_labels: Optional[str] = None
18+
depends_on: Optional[str] = None
19+
20+
21+
def upgrade():
22+
# ### commands auto generated by Alembic - please adjust! ###
23+
op.create_table(
24+
"metat_gene_function_aggregation",
25+
sa.Column("metatranscriptome_annotation_id", sa.String(), nullable=False),
26+
sa.Column("gene_function_id", sa.String(), nullable=False),
27+
sa.Column("count", sa.BigInteger(), nullable=False),
28+
sa.ForeignKeyConstraint(
29+
["gene_function_id"],
30+
["gene_function.id"],
31+
name=op.f("fk_metat_gene_function_aggregation_gene_function_id_gene_function"),
32+
),
33+
sa.ForeignKeyConstraint(
34+
["metatranscriptome_annotation_id"],
35+
["metatranscriptome_annotation.id"],
36+
name=op.f(
37+
"fk_metat_gene_function_aggregation_metatranscriptome_annotation_id_metatranscriptome_annotation" # noqa: E501
38+
),
39+
),
40+
sa.PrimaryKeyConstraint(
41+
"metatranscriptome_annotation_id",
42+
"gene_function_id",
43+
name=op.f("pk_metat_gene_function_aggregation"),
44+
),
45+
)
46+
# ### end Alembic commands ###
47+
48+
49+
def downgrade():
50+
# ### commands auto generated by Alembic - please adjust! ###
51+
op.drop_table("metat_gene_function_aggregation")
52+
# ### end Alembic commands ###

0 commit comments

Comments
 (0)