Skip to content

Potential bug in DataPipelineMultimer when there are chains with the same sequence #525

@dxu16

Description

@dxu16

This is based on my limited understanding of the codebase, so please correct me if I am wrong. From inspecting the code, there appears to be a bug in the process_mmcif function within DataPipelineMultimer when an mmCIF file contains several chains with the same sequence.

def process_mmcif(
self,
mmcif: mmcif_parsing.MmcifObject, # parsing is expensive, so no path
alignment_dir: str,
alignment_index: Optional[Any] = None,
) -> FeatureDict:
# Builds a feature dict for every chain listed in mmcif.chain_to_seqres,
# adds assembly features, merges the chains with pair_and_merge, pads the
# MSA and returns the merged example.
# NOTE(review): indentation was lost when this snippet was pasted; the
# loop body presumably runs from the `desc` assignment through
# `sequence_features[seq] = chain_features` -- confirm against the repo.
# Features keyed by chain description ("<file_id>_<chain_id>").
all_chain_features = {}
# Cache of already-computed features, keyed by chain sequence.
sequence_features = {}
# True when chain_to_seqres holds exactly one distinct sequence.
is_homomer_or_monomer = len(set(list(mmcif.chain_to_seqres.values()))) == 1
for chain_id, seq in mmcif.chain_to_seqres.items():
desc= "_".join([mmcif.file_id, chain_id])
# Sequence seen before: reuse a deep copy of the cached features and skip
# the rest of the iteration. The cached dict is stored only after
# mmcif_feats were merged into it (see the update() call below), so the
# copy also carries the mmCIF features computed for the earlier chain_id
# rather than for this one.
if seq in sequence_features:
all_chain_features[desc] = copy.deepcopy(
sequence_features[seq]
)
continue
# With an alignment index, look up this chain's entry by `desc` and use
# alignment_dir unchanged; otherwise use the per-chain subdirectory
# alignment_dir/desc and no index.
if alignment_index is not None:
chain_alignment_index = alignment_index.get(desc)
chain_alignment_dir = alignment_dir
else:
chain_alignment_index = None
chain_alignment_dir = os.path.join(alignment_dir, desc)
# Compute the features for this chain from its sequence and alignments.
chain_features = self._process_single_chain(
chain_id=desc,
sequence=seq,
description=desc,
chain_alignment_dir=chain_alignment_dir,
chain_alignment_index=chain_alignment_index,
is_homomer_or_monomer=is_homomer_or_monomer
)
chain_features = convert_monomer_features(
chain_features,
chain_id=desc
)
# Features read from the parsed mmCIF for this specific chain_id are
# merged into the same dict before it is stored and cached.
mmcif_feats = self.get_mmcif_features(mmcif, chain_id)
chain_features.update(mmcif_feats)
all_chain_features[desc] = chain_features
# The same dict object (including mmcif_feats) becomes the cache entry
# for every later chain with an identical sequence.
sequence_features[seq] = chain_features
all_chain_features = add_assembly_features(all_chain_features)
np_example = feature_processing_multimer.pair_and_merge(
all_chain_features=all_chain_features,
)
# Pad MSA to avoid zero-sized extra_msa.
np_example = pad_msa(np_example, 512)
return np_example

When looping through the chains in the mmcif object, previously processed features are cached in a dictionary keyed by sequence. If a later chain has a sequence that is already in the cache, its features are retrieved directly from the dictionary. This works when only sequence features are being processed, but here atom coordinates are added to the features inside the same loop, before the features are cached. From my understanding, this would cause all chains with the same sequence within an mmCIF file to have the same coordinates (those of the first such chain).

The proposed change is to add the structural information in a separate loop:

# First pass: build the features that _process_single_chain and
# convert_monomer_features produce for each chain, caching them by sequence.
# No mmCIF features are added here, so the cached dicts hold none.
for chain_id, seq in mmcif.chain_to_seqres.items():
    desc= "_".join([mmcif.file_id, chain_id])

    # Sequence seen before: take an independent deep copy of the cached
    # features and move on to the next chain.
    if seq in sequence_features:
        all_chain_features[desc] = copy.deepcopy(
            sequence_features[seq]
        )
        continue

    # With an alignment index, look up this chain's entry by `desc`;
    # otherwise fall back to the per-chain subdirectory alignment_dir/desc.
    if alignment_index is not None:
        chain_alignment_index = alignment_index.get(desc)
        chain_alignment_dir = alignment_dir
    else:
        chain_alignment_index = None
        chain_alignment_dir = os.path.join(alignment_dir, desc)

    chain_features = self._process_single_chain(
        chain_id=desc,
        sequence=seq,
        description=desc,
        chain_alignment_dir=chain_alignment_dir,
        chain_alignment_index=chain_alignment_index,
        is_homomer_or_monomer=is_homomer_or_monomer
    )

    chain_features = convert_monomer_features(
        chain_features,
        chain_id=desc
    )

    all_chain_features[desc] = chain_features
    sequence_features[seq] = chain_features

# Second pass: every chain, including those filled from the cache, gets the
# mmCIF features looked up with its own chain_id. All deep copies were made
# in the first pass, so updating one chain's dict here does not change the
# dict of another chain.
for chain_id, seq in mmcif.chain_to_seqres.items():
    desc= "_".join([mmcif.file_id, chain_id])

    mmcif_feats = self.get_mmcif_features(mmcif, chain_id)
    all_chain_features[desc].update(mmcif_feats)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions