This is based on my limited understanding of the codebase so please correct me if I am wrong. When inspecting the code, it appears to me that there is a bug in the process_mmcif function within DataPipelineMultimer when there are chains with the same sequence in a mmcif file.
|
def process_mmcif( |
|
self, |
|
mmcif: mmcif_parsing.MmcifObject, # parsing is expensive, so no path |
|
alignment_dir: str, |
|
alignment_index: Optional[Any] = None, |
|
) -> FeatureDict: |
|
|
|
all_chain_features = {} |
|
sequence_features = {} |
|
is_homomer_or_monomer = len(set(list(mmcif.chain_to_seqres.values()))) == 1 |
|
for chain_id, seq in mmcif.chain_to_seqres.items(): |
|
desc= "_".join([mmcif.file_id, chain_id]) |
|
|
|
if seq in sequence_features: |
|
all_chain_features[desc] = copy.deepcopy( |
|
sequence_features[seq] |
|
) |
|
continue |
|
|
|
if alignment_index is not None: |
|
chain_alignment_index = alignment_index.get(desc) |
|
chain_alignment_dir = alignment_dir |
|
else: |
|
chain_alignment_index = None |
|
chain_alignment_dir = os.path.join(alignment_dir, desc) |
|
|
|
chain_features = self._process_single_chain( |
|
chain_id=desc, |
|
sequence=seq, |
|
description=desc, |
|
chain_alignment_dir=chain_alignment_dir, |
|
chain_alignment_index=chain_alignment_index, |
|
is_homomer_or_monomer=is_homomer_or_monomer |
|
) |
|
|
|
chain_features = convert_monomer_features( |
|
chain_features, |
|
chain_id=desc |
|
) |
|
|
|
mmcif_feats = self.get_mmcif_features(mmcif, chain_id) |
|
chain_features.update(mmcif_feats) |
|
all_chain_features[desc] = chain_features |
|
sequence_features[seq] = chain_features |
|
|
|
all_chain_features = add_assembly_features(all_chain_features) |
|
|
|
np_example = feature_processing_multimer.pair_and_merge( |
|
all_chain_features=all_chain_features, |
|
) |
|
|
|
# Pad MSA to avoid zero-sized extra_msa. |
|
np_example = pad_msa(np_example, 512) |
|
|
|
return np_example |
When looping through chains in the mmcif object, previously processed features are cached in a dictionary with the sequence as keys. If a future chain has a matching sequence in the cached dictionary, the features are then directly retrieved from the dictionary. This works when only sequence features are being processed, but here atom coordinates are being included at the same time within the loop. From my understanding, this would cause all chains of the same sequence within a mmcif file to have the same coordinates.
Proposed change is to append structural information in a separate loop:
for chain_id, seq in mmcif.chain_to_seqres.items():
desc= "_".join([mmcif.file_id, chain_id])
if seq in sequence_features:
all_chain_features[desc] = copy.deepcopy(
sequence_features[seq]
)
continue
if alignment_index is not None:
chain_alignment_index = alignment_index.get(desc)
chain_alignment_dir = alignment_dir
else:
chain_alignment_index = None
chain_alignment_dir = os.path.join(alignment_dir, desc)
chain_features = self._process_single_chain(
chain_id=desc,
sequence=seq,
description=desc,
chain_alignment_dir=chain_alignment_dir,
chain_alignment_index=chain_alignment_index,
is_homomer_or_monomer=is_homomer_or_monomer
)
chain_features = convert_monomer_features(
chain_features,
chain_id=desc
)
all_chain_features[desc] = chain_features
sequence_features[seq] = chain_features
for chain_id, seq in mmcif.chain_to_seqres.items():
desc= "_".join([mmcif.file_id, chain_id])
mmcif_feats = self.get_mmcif_features(mmcif, chain_id)
all_chain_features[desc].update(mmcif_feats)
This is based on my limited understanding of the codebase so please correct me if I am wrong. When inspecting the code, it appears to me that there is a bug in the
process_mmciffunction withinDataPipelineMultimerwhen there are chains with the same sequence in a mmcif file.openfold/openfold/data/data_pipeline.py
Lines 1319 to 1373 in 815a042
When looping through chains in the mmcif object, previously processed features are cached in a dictionary with the sequence as keys. If a future chain has a matching sequence in the cached dictionary, the features are then directly retrieved from the dictionary. This works when only sequence features are being processed, but here atom coordinates are being included at the same time within the loop. From my understanding, this would cause all chains of the same sequence within a mmcif file to have the same coordinates.
Proposed change is to append structural information in a separate loop: