Skip to content

Commit d779ac7

Browse files
committed
Get entity source metadata from mmCIF/BinaryCIF
Extend ihm.metadata.CIFParser and BinaryCIFParser to read the entity_src tables and return the same entity_source dict that PDBParser does. Closes #168.
1 parent f9bf384 commit d779ac7

File tree

4 files changed

+97
-38
lines changed

4 files changed

+97
-38
lines changed

ihm/metadata.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,11 @@ def __init__(self, m):
763763
self.template_ranges = ihm.reader.IDMapper(None, _TemplateRange)
764764
self.target_ranges = ihm.reader.IDMapper(None, _TargetRange)
765765
self.templates = ihm.reader.IDMapper(m['templates'], _Template)
766+
self.entities = ihm.reader.IDMapper(None, ihm.Entity, [])
767+
self.asym_units = ihm.reader.IDMapper(m['asyms'], ihm.AsymUnit, None)
768+
self.src_gens = ihm.reader.IDMapper(None, ihm.source.Manipulated)
769+
self.src_nats = ihm.reader.IDMapper(None, ihm.source.Natural)
770+
self.src_syns = ihm.reader.IDMapper(None, ihm.source.Synthetic)
766771

767772

768773
class _TemplateDetailsHandler(ihm.reader.Handler):
@@ -875,7 +880,8 @@ class _CIFParserBase(Parser):
875880

876881
def parse_file(self, filename):
877882
m = {'db': {}, 'title': 'Starting model structure',
878-
'software': [], 'templates': [], 'alignments': []}
883+
'software': [], 'templates': [], 'alignments': [],
884+
'asyms': []}
879885
with self._open_file(filename) as fh:
880886
dbh = _Database2Handler(m)
881887
structh = _StructHandler(m)
@@ -891,6 +897,12 @@ def parse_file(self, filename):
891897
'_modeller_template': modtmplh,
892898
'_software': ihm.reader._SoftwareHandler(sysr),
893899
'_citation': ihm.reader._CitationHandler(sysr),
900+
'_struct_asym': ihm.reader._StructAsymHandler(sysr),
901+
'_entity': ihm.reader._EntityHandler(sysr),
902+
'_entity_src_nat': ihm.reader._EntitySrcNatHandler(sysr),
903+
'_pdbx_entity_src_syn':
904+
ihm.reader._EntitySrcSynHandler(sysr),
905+
'_entity_src_gen': ihm.reader._EntitySrcGenHandler(sysr),
894906
'_citation_author':
895907
ihm.reader._CitationAuthorHandler(sysr),
896908
'_ma_template_details': _TemplateDetailsHandler(sysr),
@@ -905,6 +917,8 @@ def parse_file(self, filename):
905917
dset = self._get_dataset(filename, m)
906918
return {'dataset': dset, 'software': m['software'],
907919
'templates': self._get_templates(filename, m, dset),
920+
'entity_source': {asym.id: asym.entity.source
921+
for asym in m['asyms']},
908922
'script': m['script']}
909923

910924
def _get_dataset(self, filename, m):
@@ -982,6 +996,8 @@ def parse_file(self, filename):
982996
IDs in the PDB file and values the list of comparative
983997
model templates used to model that chain as
984998
:class:`ihm.startmodel.Template` objects;
999+
'entity_source' pointing to a dict with keys the asym IDs
1000+
and values :class:`ihm.source.Source` objects;
9851001
'software' pointing to a list of software used to generate
9861002
the file (as :class:`ihm.Software` objects);
9871003
'script' pointing to the script used to generate the

test/input/official.bcif

5.18 KB
Binary file not shown.

test/input/official.cif

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,44 @@ _pdbx_audit_revision_history.revision_date
2727
3 'Structure model' 1 2 2011-07-13
2828
4 'Structure model' 1 3 2017-10-18
2929
5 'Structure model' 1 4 2021-11-10
30+
#
31+
loop_
32+
_entity.id
33+
_entity.src_method
34+
1 man
35+
2 nat
36+
3 syn
37+
#
38+
_entity_src_gen.entity_id 1
39+
_entity_src_gen.pdbx_src_id 42
40+
_entity_src_gen.pdbx_gene_src_scientific_name 'MUS MUSCULUS'
41+
_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id 10090
42+
_entity_src_gen.gene_src_common_name 'HOUSE MOUSE'
43+
_entity_src_gen.gene_src_strain 'TEST STRAIN 1'
44+
_entity_src_gen.pdbx_host_org_scientific_name 'ESCHERICHIA COLI'
45+
_entity_src_gen.pdbx_host_org_ncbi_taxonomy_id 562
46+
_entity_src_gen.host_org_common_name 'TEST COMMON 1'
47+
_entity_src_gen.pdbx_host_org_strain 'TEST STRAIN 2'
48+
#
49+
_entity_src_nat.entity_id 2
50+
_entity_src_nat.pdbx_src_id 42
51+
_entity_src_nat.pdbx_organism_scientific 'ESCHERICHIA COLI'
52+
_entity_src_nat.pdbx_ncbi_taxonomy_id 562
53+
_entity_src_nat.common_name 'TEST COMMON 2'
54+
_entity_src_nat.strain 'TEST STRAIN 3'
55+
#
56+
_pdbx_entity_src_syn.entity_id 3
57+
_pdbx_entity_src_syn.pdbx_src_id 42
58+
_pdbx_entity_src_syn.organism_scientific 'HELIANTHUS ANNUUS'
59+
_pdbx_entity_src_syn.organism_common_name 'COMMON SUNFLOWER'
60+
_pdbx_entity_src_syn.ncbi_taxonomy_id 4232
61+
#
62+
loop_
63+
_struct_asym.id
64+
_struct_asym.entity_id
65+
_struct_asym.details
66+
A 1 ?
67+
B 2 ?
68+
C 2 ?
69+
D 3 ?
70+
#

test/test_metadata.py

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -131,37 +131,7 @@ def test_official_pdb(self):
131131
self.assertEqual(len(p['metadata']), 1)
132132
self.assertEqual(p['metadata'][0].helix_id, '10')
133133
self.assertIsNone(p['script'])
134-
dataset = p['dataset']
135-
self.assertEqual(dataset.data_type, 'Experimental model')
136-
self.assertEqual(dataset.location.db_name, 'PDB')
137-
self.assertEqual(dataset.location.access_code, '2HBJ')
138-
self.assertEqual(dataset.location.version, '14-JUN-06')
139-
self.assertEqual(dataset.location.details,
140-
'STRUCTURE OF THE YEAST NUCLEAR EXOSOME COMPONENT, '
141-
'RRP6P, REVEALS AN INTERPLAY BETWEEN THE ACTIVE '
142-
'SITE AND THE HRDC DOMAIN')
143-
es = p['entity_source']
144-
self.assertEqual(sorted(es.keys()), ['A', 'B', 'C', 'D'])
145-
self.assertEqual(es['B'], es['C'])
146-
self.assertEqual(es['A'].src_method, 'man')
147-
self.assertEqual(es['A'].gene.scientific_name, 'MUS MUSCULUS')
148-
self.assertEqual(es['A'].gene.common_name, 'HOUSE MOUSE')
149-
self.assertEqual(es['A'].gene.strain, 'TEST STRAIN 1')
150-
self.assertEqual(es['A'].gene.ncbi_taxonomy_id, '10090')
151-
self.assertEqual(es['A'].host.scientific_name, 'ESCHERICHIA COLI')
152-
self.assertEqual(es['A'].host.common_name, 'TEST COMMON 1')
153-
self.assertEqual(es['A'].host.ncbi_taxonomy_id, '562')
154-
self.assertEqual(es['A'].host.strain, 'TEST STRAIN 2')
155-
self.assertEqual(es['B'].src_method, 'nat')
156-
self.assertEqual(es['B'].scientific_name, 'ESCHERICHIA COLI')
157-
self.assertEqual(es['B'].common_name, 'TEST COMMON 2')
158-
self.assertEqual(es['B'].ncbi_taxonomy_id, '562')
159-
self.assertEqual(es['B'].strain, 'TEST STRAIN 3')
160-
self.assertEqual(es['D'].src_method, 'syn')
161-
self.assertEqual(es['D'].scientific_name, 'HELIANTHUS ANNUUS')
162-
self.assertEqual(es['D'].common_name, 'COMMON SUNFLOWER')
163-
self.assertEqual(es['D'].ncbi_taxonomy_id, '4232')
164-
self.assertEqual(es['D'].strain, 'TEST STRAIN 4')
134+
self._check_parsed_official_pdb(p, pdb_format=True)
165135

166136
def test_bad_header(self):
167137
"""Test PDBParser when given a non-official PDB with HEADER line"""
@@ -454,16 +424,48 @@ def test_binary_cif_official_pdb(self):
454424
p = parser.parse_file(fname)
455425
self._check_parsed_official_pdb(p)
456426

457-
def _check_parsed_official_pdb(self, p):
427+
def _check_parsed_official_pdb(self, p, pdb_format=False):
458428
dataset = p['dataset']
459429
self.assertEqual(dataset.data_type, 'Experimental model')
460430
self.assertEqual(dataset.location.db_name, 'PDB')
461431
self.assertEqual(dataset.location.access_code, '2HBJ')
462-
self.assertEqual(dataset.location.version, '2021-11-10')
463-
self.assertEqual(dataset.location.details,
464-
'Structure of the yeast nuclear exosome component, '
465-
'Rrp6p, reveals an interplay between the active '
466-
'site and the HRDC domain')
432+
if pdb_format:
433+
self.assertEqual(dataset.location.version, '14-JUN-06')
434+
else:
435+
self.assertEqual(dataset.location.version, '2021-11-10')
436+
details = ('Structure of the yeast nuclear exosome component, '
437+
'Rrp6p, reveals an interplay between the active '
438+
'site and the HRDC domain')
439+
if pdb_format:
440+
details = details.upper()
441+
self.assertEqual(dataset.location.details, details)
442+
443+
es = p['entity_source']
444+
self.assertEqual(sorted(es.keys()), ['A', 'B', 'C', 'D'])
445+
self.assertEqual(es['B'], es['C'])
446+
self.assertEqual(es['A'].src_method, 'man')
447+
self.assertEqual(es['A'].gene.scientific_name, 'MUS MUSCULUS')
448+
self.assertEqual(es['A'].gene.common_name, 'HOUSE MOUSE')
449+
self.assertEqual(es['A'].gene.strain, 'TEST STRAIN 1')
450+
self.assertEqual(es['A'].gene.ncbi_taxonomy_id, '10090')
451+
self.assertEqual(es['A'].host.scientific_name, 'ESCHERICHIA COLI')
452+
self.assertEqual(es['A'].host.common_name, 'TEST COMMON 1')
453+
self.assertEqual(es['A'].host.ncbi_taxonomy_id, '562')
454+
self.assertEqual(es['A'].host.strain, 'TEST STRAIN 2')
455+
self.assertEqual(es['B'].src_method, 'nat')
456+
self.assertEqual(es['B'].scientific_name, 'ESCHERICHIA COLI')
457+
self.assertEqual(es['B'].common_name, 'TEST COMMON 2')
458+
self.assertEqual(es['B'].ncbi_taxonomy_id, '562')
459+
self.assertEqual(es['B'].strain, 'TEST STRAIN 3')
460+
self.assertEqual(es['D'].src_method, 'syn')
461+
self.assertEqual(es['D'].scientific_name, 'HELIANTHUS ANNUUS')
462+
self.assertEqual(es['D'].common_name, 'COMMON SUNFLOWER')
463+
self.assertEqual(es['D'].ncbi_taxonomy_id, '4232')
464+
# _pdbx_entity_src_syn.strain is not used in current PDB entries
465+
if pdb_format:
466+
self.assertEqual(es['D'].strain, 'TEST STRAIN 4')
467+
else:
468+
self.assertIsNone(es['D'].strain)
467469

468470
def test_cif_model_archive(self):
469471
"""Test CIFParser when given an mmCIF in Model Archive"""

0 commit comments

Comments
 (0)