Skip to content

Relative Accessible Surface Area #14

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
152 changes: 152 additions & 0 deletions pylib_3.9.7/df/ProteinRASA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import base64
import gzip
from io import StringIO
import os
import traceback
from typing import Optional

from Bio.SeqRecord import SeqRecord
from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqUtils import seq1 as amino3to1

from df.bio_helper import column_to_sequences, string_to_sequence, column_to_structures, generate_color_gradient
from df.data_transfer import ColumnData, TableData, DataFunctionRequest, DataFunctionResponse, DataFunction, DataType, \
string_input_field, boolean_input_field, integer_input_field, input_field_to_column, \
Notification, NotificationLevel

from ruse.bio.antibody import align_antibody_sequences, NumberingScheme, CDRDefinitionScheme, ANTIBODY_NUMBERING_COLUMN_PROPERTY
from ruse.bio.bio_data_table_helper import sequence_to_genbank_base64_str

class ProteinRASA(DataFunction):
"""
Computes relative solvent accessible surface area of a given protein
"""

def execute(self, request: DataFunctionRequest) -> DataFunctionResponse:

# get settings from UI
structure_column = input_field_to_column(request, 'uiStructureColumn')
structures, notifications = column_to_structures(structure_column)

sequence_column = input_field_to_column(request, 'uiSequenceColumn')
sequences = column_to_sequences(sequence_column)

rasa_cutoff = integer_input_field(request, 'uiRASACutoff')
exposedColor = string_input_field(request, 'uiExposedColor')

labelAll = boolean_input_field(request, 'uiLabelAll')
startColor = string_input_field(request, 'uiStartColor')
endColor = string_input_field(request, 'uiEndColor')

if labelAll:
RASA_color_gradient = generate_color_gradient(startColor, endColor, 21)

# setup BioPython tools
sr = ShrakeRupley()
backbone_atoms = ['N', 'H', 'CA', 'HA', 'C', 'O', 'H2', 'H3']

# output storage
output_sequences = []

# process each structure
for index, (structure, sequence) in enumerate(zip(structures, sequences)):
if not (structure and sequence):
notifications.append(Notification(level = NotificationLevel.ERROR,
title = 'Relative Accessible Surface Area',
summary = f'Error for structure or sequence in Row {index}/nNull values found.',
details = f'Either the structure or sequence were null.'))
continue

# create a mapping between the structure residues and potentially gapped sequence residues
# would this be a useful biotools function rather than buried here?
sequence_number_mapping = {}
structure_residue_number = 0
structure_residues = [amino3to1(res.get_resname()) for res in structure.get_residues()]
for sequence_residue_number, sequence_residue in enumerate(sequence.seq):
if sequence_residue != '-':
if sequence_residue != structure_residues[structure_residue_number]:
# send a notification and then fail
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you raise an error here that will fail the data function and create a Spotfire notification

notifications.append(Notification(level = NotificationLevel.ERROR,
title = 'Relative Accessible Surface Area',
summary = f'Error for structure and sequence in Row {index}.',
details = f'Sequences in structure do not match those in sequence column.'))
continue
sequence_number_mapping[structure_residue_number] = sequence_residue_number
structure_residue_number += 1

# compute RASA for each residue/atom in context of the intact protein
try:
sr.compute(structure, level = 'A')
except Exception as ex:
notifications.append(Notification(level = NotificationLevel.ERROR,
title = 'Relative Accessible Surface Area',
summary = f'Error for structure in Row {index}.\n' +
'Shrake-Rupley calculation failed for complete structure.',
details = f'{ex.__class__} - {ex}\n' +
f'{traceback.format_exc()}'))
output_sequences.append(sequence)
continue

success = True # for scoping outside loop
for residue_index, residue in enumerate(structure.get_residues()):
# compute RASA for each residue isolated from the protein structure
residue_copy = residue.copy()

success = True # optimistic for success
ex_storage = None # use for possible storage of exception for use outside loop
try:
sr.compute(residue_copy, level = 'A')
except Exception as ex:
ex_storage = ex.copy()
success = False
continue

in_context_sa = sum([atom.sasa for atom in residue.get_atoms() if atom.get_id() not in backbone_atoms])
reference_sa = sum([atom.sasa for atom in residue_copy.get_atoms() if atom.get_id() not in backbone_atoms])
rasa = round(in_context_sa / reference_sa * 100, 1)

# create annotations
if labelAll:
feature = SeqFeature(FeatureLocation(sequence_number_mapping[residue_index],
sequence_number_mapping[residue_index] + 1),
type = 'misc_feature',
qualifiers = {'feature_name': 'Relative Accessible Surface Area',
'note': ['RASA',
'glysade_annotation_type: RASA',
f'RASA: {rasa}%',
f'Residue ID: {residue.resname}',
f'ld_style:{{"color": "{RASA_color_gradient[int(rasa // 5)]}", "shape": "rounded-rectangle"}}',
'ld_track: RASA']})
sequence.features.append(feature)

if rasa >= rasa_cutoff:
feature = SeqFeature(FeatureLocation(sequence_number_mapping[residue_index],
sequence_number_mapping[residue_index] + 1),
type = 'misc_feature',
qualifiers = {'feature_name': 'Solvent Exposed Residue',
'note': [f'Exposed >= {rasa_cutoff}',
'glysade_annotation_type: RASA_exposed',
f'RASA: {rasa}%',
f'Residue ID: {residue.resname}',
f'ld_style:{{"color": "{exposedColor}", "shape": "rounded-rectangle"}}',
'ld_track: RASA_exposed']})
sequence.features.append(feature)

if not success:
notifications.append(Notification(level = NotificationLevel.ERROR,
title = 'Relative Accessible Surface Area',
summary = f'Error for structure in Row {index}.\n' +
f'Shrake-Rupley calculation failed for for some residues.',
details = f'Example error: {ex.__class__} - {ex}\n' +
f'{traceback.format_exc()}'))
output_sequences.append(sequence)

# construct output column
rows = [sequence_to_genbank_base64_str(s) for s in output_sequences]
output_column = ColumnData(name = f'Relative Accessible Surface Area Annotations',
dataType = DataType.BINARY,
contentType = 'chemical/x-genbank', values = rows)

return DataFunctionResponse(outputColumns = [output_column])
83 changes: 80 additions & 3 deletions pylib_3.9.7/df/bio_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,20 @@
`Biopython <https://biopython.org/>`_ SeqRecord objects are used to represent sequences

"""

import base64
import gzip
from io import StringIO
from typing import List, Optional
import traceback
from typing import List, Optional, Tuple

from Bio import SeqIO
from Bio.PDB import PDBParser
from Bio.PDB.Structure import Structure
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from df.data_transfer import ColumnData, DataType, DataFunctionRequest, string_input_field
from df.data_transfer import ColumnData, DataType, DataFunctionRequest, string_input_field, \
Notification, NotificationLevel
from ruse.bio.bio_data_table_helper import genbank_base64_str_to_sequence, sequence_to_genbank_base64_str, \
string_to_sequence

Expand Down Expand Up @@ -68,6 +73,49 @@ def encode_seq(seq):
else:
return ColumnData(name=column_name, dataType=DataType.STRING, contentType='chemical/x-sequence', values=values)

def column_to_structures(column: ColumnData, id_column: Optional[ColumnData] = None) \
-> Tuple[List[Optional[Structure]], List[Optional[Notification]]]:
"""
Converts a Spotfire column into a list of PDB files

:param column: the Spotfire column
:param id_column: if set, row values from this column are used to set the structure identifier

:return: Tuple of List of Structure objects and List of Notification objects
"""

content_type = column.contentType
if content_type != 'chemical/x-pdb':
raise ValueError(f'Unable to process content type {content_type} as Structure Column.')

parser = PDBParser(QUIET=True)
structures = []
notifications = []

try:
for index, data in enumerate(column.values):
if id_column:
identifier = id_column.values[index]
else:
identifier = f'Row {index}'

pdb_zip = base64.b64decode(data)
pdb_data = gzip.decompress(pdb_zip).decode()

with StringIO(pdb_data) as structure_fh:
structure = parser.get_structure(identifier, structure_fh)
structure.id = identifier

structures.append(structure)
except Exception as ex:
structures.append(None)
notifications.append(Notification(level = NotificationLevel.WARNING,
title = 'column_to_structures',
summary = f'Error for structure {identifier}./n{ex.__class__} - {ex}',
details = f'An error occurred during parsing of the compressed structure.\n' +
f'{traceback.format_exc()}'))

return structures, notifications

def query_from_request(request: DataFunctionRequest, input_field_name: str = 'query') -> SeqRecord:
"""
Expand Down Expand Up @@ -97,3 +145,32 @@ def query_from_request(request: DataFunctionRequest, input_field_name: str = 'qu
sequence = SeqRecord(Seq(query), 'Query')

return sequence

def generate_color_gradient(start_color, end_color, num_steps):
"""
Extract RGB components from hex values

:param start_color: hex color for start of gradient
:param end_color: hex color for end of gradient
:param num_steps: number of color steps in gradient
:return: a list of hex colors constituting the gradient
"""

start_rgb = [int(start_color[i:i+2], 16) / 255.0 for i in (1, 3, 5)]
end_rgb = [int(end_color[i:i+2], 16) / 255.0 for i in (1, 3, 5)]

# Calculate step size for each RGB component
r_step = (end_rgb[0] - start_rgb[0]) / (num_steps - 1)
g_step = (end_rgb[1] - start_rgb[1]) / (num_steps - 1)
b_step = (end_rgb[2] - start_rgb[2]) / (num_steps - 1)

# Generate a linear gradient in RGB space
color_gradient = [
'#{0:02x}{1:02x}{2:02x}'.format(
int((start_rgb[0] + i * r_step) * 255),
int((start_rgb[1] + i * g_step) * 255),
int((start_rgb[2] + i * b_step) * 255)
) for i in range(num_steps)
]

return color_gradient