Module xi_covutils.distances
Functions and classes to work with residue distances in protein structures
Expand source code
"""
Functions and classes to work with residue distances in proteins structures
"""
import csv
import operator
import re
from functools import reduce
from itertools import combinations, combinations_with_replacement, product
from typing import Callable, Dict, List, Optional, TextIO, Tuple, TypeVar, Union
from Bio.PDB.Atom import Atom
from Bio.PDB.Model import Model
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Residue import Residue
from Bio.PDB.Structure import Structure
from xi_covutils.pdbbank import PDBSource, pdb_structure_from
# Type aliases used across this module.
# Chain identifier of a residue.
Chain = str
# Residue position (number) inside a chain.
Position = int
# Distance value between two residues.
Distance = float
# Residue name.
Resname = str
# Atom identifier (atom name).
AtomId = str
# Short distance record: (chain1, pos1, chain2, pos2, distance).
DistanceElement = Tuple[Chain, Position, Chain, Position, Distance]
# Long distance record:
# (chain1, pos1, resname1, atom1, chain2, pos2, resname2, atom2, distance).
DistanceElementLong = Tuple[
    Chain, Position, Resname, AtomId,
    Chain, Position, Resname, AtomId,
    Distance
]
# Lists of short records, long records, or a mix of both shapes.
DistanceData = List[DistanceElement]
DistanceDataLong = List[DistanceElementLong]
DistanceDataSH = List[Union[DistanceElement, DistanceElementLong]]
# Predicate that decides whether a pair of atoms takes part in a distance
# calculation.
AtomSelector = Callable[[Atom, Atom], bool]
class Distances():
    '''
    Store and access distance data for residues from a protein structure.

    Distances are kept in a nested dictionary that maps a (chain, position)
    pair to a dictionary of partner (chain, position) pairs and their
    distance.
    '''
    def __init__(
        self,
        dist_data: DistanceData
    ):
        """
        Creates a new instance from distance data.

        Args:
            dist_data (DistanceData): Distance data should be a list of tuples
                of five elements: (chain1, pos1, chain2, pos2, distance). If
                the same ordered pair appears more than once, the last
                distance is kept.
        """
        dis: Dict[Tuple[Chain, Position], Dict[Tuple[Chain, Position], float]] = {}
        for ch1, po1, ch2, po2, dist in dist_data:
            dis.setdefault((ch1, po1), {})[(ch2, po2)] = dist
        self._distances = dis

    def raw_distances(self) -> DistanceData:
        """
        Returns:
            DistanceData: Returns the distances data of the object as a list
                of tuples. Each tuple has five elements: (chain1, pos1,
                chain2, pos2, distance).
        """
        return [
            (chain1, pos1, chain2, pos2, dist)
            for (chain1, pos1), c_pos in self._distances.items()
            for (chain2, pos2), dist in c_pos.items()
        ]

    def of( #pylint: disable=invalid-name
        self,
        chain_a:Chain,
        pos_a:Position,
        chain_b:Chain,
        pos_b:Position
    ) -> Optional[Distance]:
        """
        Retrieves distance for a residue pair.

        The pair is looked up in the given order first and, if it is not
        stored that way, in the reverse order.

        Args:
            chain_a (str): A string specifying the first residue chain.
            pos_a (int): An integer specifying the first residue position.
            chain_b (str): A string specifying the second residue chain.
            pos_b (int): An integer specifying the second residue position.

        Returns:
            Optional[float]: The distance between two residue positions. If
                the pair is not found, None is returned.
        """
        pair1 = (chain_a, pos_a)
        pair2 = (chain_b, pos_b)
        if pair1 == pair2: # Special case for distance with the same residue.
            return 0
        distance = self._distances.get(pair1, {}).get(pair2)
        # Compare with None explicitly: a stored distance of 0.0 is falsy but
        # it is a valid value and must not be treated as a missing pair.
        if distance is None:
            distance = self._distances.get(pair2, {}).get(pair1)
        return distance

    def remap_positions(
        self,
        mapping: dict[Chain, dict[Position, Position]]
    ):
        """
        Remap index positions.

        If a position could not be mapped it is excluded from the results.
        The stored distances are replaced in place.

        Args:
            mapping (dict[str, dict[int, int]]): a dict that maps, for each
                chain, old positions to new positions.
        """
        T = TypeVar("T")
        def _remap(dic: dict[tuple[str, int], T]) -> dict[tuple[str, int], T]:
            # Keep only the keys whose chain and position are in the mapping.
            return {
                (chain, mapping[chain][pos]): value
                for (chain, pos), value in dic.items()
                if pos in mapping.get(chain, {})
            }
        # Remap the partner residues first, then the outer residues.
        self._distances = _remap(
            {
                (c1, p1): _remap(r2)
                for (c1, p1), r2 in self._distances.items()
            }
        )

    def is_contact( #pylint: disable=too-many-arguments
        self,
        chain_a: Chain,
        pos_a: Position,
        chain_b: Chain,
        pos_b: Position,
        distance_cutoff:Distance=6.05
    ) -> bool:
        '''
        Decides if two residues are in contact.

        Args:
            chain_a (str): A string specifying the first residue chain.
            pos_a (int): An integer specifying the first residue position.
            chain_b (str): A string specifying the second residue chain.
            pos_b (int): An integer specifying the second residue position.
            distance_cutoff (float): a float with the distance cutoff
                (defaults to 6.05 angstroms)

        Returns:
            bool: Returns True if a given pair's distance is lower or equal
                than a given distance cutoff. Pairs with no known distance
                are not contacts.
        '''
        dist = self.of(chain_a, pos_a, chain_b, pos_b)
        if dist is None:
            return False
        return dist <= distance_cutoff

    @staticmethod
    def _sum_true(boolean_list: list[bool]):
        """Counts how many elements of boolean_list are truthy."""
        return sum(1 for flag in boolean_list if flag)

    def mean_intramolecular(self) -> dict[Chain, float]:
        """
        Returns:
            Return the mean number of intramolecular contacts across all
            residues for every chain.
        """
        # Collect every residue seen either as first or as second element
        # of a stored pair.
        all_residues = set(self._distances)
        for partners in self._distances.values():
            all_residues.update(partners)
        pos_by_chain: Dict[Chain, List[Position]] = {}
        for chain, pos in all_residues:
            pos_by_chain.setdefault(chain, []).append(pos)
        means = {}
        for chain, positions in pos_by_chain.items():
            # Number of contacts of each residue with the other residues
            # of the same chain.
            counts = [
                self._sum_true([
                    self.is_contact(chain, pos1, chain, pos2)
                    for pos2 in positions
                    if pos1 != pos2
                ])
                for pos1 in positions
            ]
            means[chain] = float(sum(counts)) / max(1, len(counts))
        return means

    @staticmethod
    def from_contact_map(
        contact_map: Dict[Tuple[int, int], bool]
    ) -> 'Distances':
        """
        Create a new Distances object from a contact map.

        Sets contacts to a distance of 1 and non contacts to 10.
        Sets the chain to be 'A'.

        Args:
            contact_map (Dict[Tuple[int, int], bool]): maps position pairs to
                a boolean that is True when the pair is in contact.

        Returns:
            Distances: a new object built from the contact map.
        """
        dist_data = [
            ('A', pos1, 'A', pos2, 1 if is_contact else 10)
            for (pos1, pos2), is_contact in contact_map.items()
        ]
        return Distances(dist_data)
def from_mitos(dist_file: str) -> DistanceData:
    """
    Loads data of residue distances from a file generated by MIToS.

    Input data should look like:
    <pre>
    # model_i,chain_i,group_i,pdbe_i,number_i,name_i,model_j,chain_j,group_j,pdbe_j,number_j,name_j,distance
    1,A,ATOM,,55,LEU,1,A,ATOM,,56,LEU,1.3247309160731473
    </pre>

    Lines starting with '#', lines that do not follow the expected layout and
    lines whose distance field is not a number are skipped.

    Args:
        dist_file (str): A string to a text file with the distance data.

    Returns:
        list[tuple[str, int, str, int, float]]: The distances in the file.
    """ # pylint: disable=line-too-long
    d_pattern = re.compile(
        r"(\d+),(.),(.+),.*,(\d+),(.+),(\d+),(.),(.+),.*,(\d+),(.+),(.+)$"
    )
    res = []
    with open(dist_file, "r", encoding="utf8") as handle:
        for line in handle:
            line = line.strip()
            if line.startswith("#"):
                continue
            match = d_pattern.match(line)
            if not match:
                continue
            try:
                res.append((
                    match.group(2),       # Chain 1
                    int(match.group(4)),  # Pos res 1
                    match.group(7),       # Chain 2
                    int(match.group(9)),  # Pos res 2
                    float(match.group(11))))  # distance
            except ValueError:
                # The distance field is matched by '.+', so it may not be a
                # number; such a line is skipped like any other malformed one.
                continue
    return res
def is_back_bone(atom:Atom) -> bool:
    """
    Decides if an atom belongs to the backbone of a protein by its name.

    The backbone atoms are the amide nitrogen (N), the alpha carbon (CA), the
    carbonyl carbon (C) and the carbonyl oxygen (O). The beta carbon (CB) is
    the first atom of the side chain and is not a backbone atom.

    Args:
        atom (Atom): An atom.

    Returns:
        bool: True if the given atom belongs to the backbone of the protein.
    """
    return atom.id in ('N', 'CA', 'C', 'O')
def all_atoms_selector(atom1:Atom, atom2:Atom) -> bool:
    """
    Atom selector that accepts every pair of atoms.

    Args:
        atom1 (Atom): First atom of the pair (ignored).
        atom2 (Atom): Second atom of the pair (ignored).

    Returns:
        bool: Always True.
    """
    #pylint: disable=unused-argument
    accept_pair = True
    return accept_pair
def side_chain_selector(atom1:Atom, atom2:Atom) -> bool:
    """
    Atom selector that accepts a pair only when neither atom is a backbone
    atom, as decided by is_back_bone.

    Args:
        atom1 (Atom): First atom of the pair.
        atom2 (Atom): Second atom of the pair.

    Returns:
        bool: True if both atoms are part of the side chain of a residue.
    """
    any_backbone = is_back_bone(atom1) or is_back_bone(atom2)
    return not any_backbone
def carbon_alfa_selector(atom1:Atom, atom2:Atom) -> bool:
    """
    Atom selector that accepts a pair only when both atoms are alpha carbons.

    Args:
        atom1 (Atom): First atom of the pair.
        atom2 (Atom): Second atom of the pair.

    Returns:
        bool: True if the id of both atoms is 'CA'.
    """
    return all(atom.id == 'CA' for atom in (atom1, atom2))
def carbon_beta_selector(atom1:Atom, atom2:Atom) -> bool:
    """
    Atom selector that accepts a pair only when both atoms are beta carbons.

    Args:
        atom1 (Atom): First atom of the pair.
        atom2 (Atom): Second atom of the pair.

    Returns:
        bool: True if the id of both atoms is 'CB'.
    """
    return all(atom.id == 'CB' for atom in (atom1, atom2))
def _pick_pdb_model(pdb_source) -> Optional[Model]:
    """
    Picks the model to work with from a PDB source.

    Args:
        pdb_source: A Model (returned as is), a Structure (its first model is
            used) or a string, which is handed to PDBParser.get_structure to
            load a structure whose first model is used.

    Returns:
        Optional[Model]: The selected model, or None if the source type is not
            recognized or the structure has no models.
    """
    if isinstance(pdb_source, Model):
        return pdb_source
    if isinstance(pdb_source, Structure):
        struct = pdb_source
    elif isinstance(pdb_source, str):
        struct = PDBParser().get_structure('XXXX', pdb_source)
    else:
        return None
    # A structure without models yields None instead of raising IndexError,
    # so callers can handle it like any other unusable source.
    return next(iter(struct.get_models()), None)
def _shorter_distance_between_residues(
res1: Residue,
res2: Residue,
atom_selector: AtomSelector
) -> Optional[DistanceElementLong]:
min_dist = float('inf')
min_res_data = None
p_1 = res1.parent
c_1 = str(p_1.id) if p_1 else ""
p_2 = res2.parent
c_2 = str(p_2.id) if p_2 else ""
for atom1, atom2 in product(res1, res2):
if (
not atom1.id.startswith('H') and
not atom2.id.startswith('H') and
atom_selector(atom1, atom2)
):
dist = atom1-atom2
if dist < min_dist:
min_dist = dist
sorted_pair = sorted(
[
(c_1, res1.id[1], res1.resname, atom1.id),
(c_2, res2.id[1], res2.resname, atom2.id)
]
)
min_res_data = (
sorted_pair[0][0],
sorted_pair[0][1],
sorted_pair[0][2],
sorted_pair[0][3],
sorted_pair[1][0],
sorted_pair[1][1],
sorted_pair[1][2],
sorted_pair[1][3],
dist
)
if not min_res_data:
return None
return min_res_data
def calculate_distances(
    pdb_source:PDBSource,
    atom_selector:AtomSelector=all_atoms_selector,
    include_extra_info:bool=False
) -> DistanceDataSH:
    """
    Compute the distances between every pair of residues of a model.

    Residue pairs are taken within each chain and between every pair of
    chains. Pairs with no eligible atoms are left out of the output.

    Args:
        pdb_source (PDBSource): a path to a pdb file, a Bio.PDB.Structure or a
            Bio.PDB.Model
        atom_selector (AtomSelector): a function that selects the pairs of
            atoms to include into the distance calculation. Defaults to
            all_atoms_selector.
        include_extra_info (bool): If True, adds residue name and atom name
            for each contacting atom to the output. Defaults to False.

    Returns:
        DistanceDataSH: The distances calculated.

    Throws:
        ValueError: If a PDB model cannot be found in the PDB source.
    """
    model = _pick_pdb_model(pdb_source)
    if not model:
        raise ValueError("PDB source not recognized")
    results = []
    for chain1, chain2 in combinations_with_replacement(model.get_chains(), 2):
        # Within a chain each unordered residue pair is visited once.
        if chain1 is chain2:
            residue_pairs = combinations(chain1, 2)
        else:
            residue_pairs = product(chain1, chain2)
        for res1, res2 in residue_pairs:
            if res1 is res2:
                continue
            found = _shorter_distance_between_residues(
                res1, res2, atom_selector
            )
            if not found:
                continue
            if include_extra_info:
                results.append(found)
                continue
            # Short record: chains, positions and distance only.
            results.append(
                (found[0], found[1], found[4], found[5], found[8])
            )
    return results
def save_distances(
    dist_data:DistanceDataSH,
    outfile:str
):
    """
    Writes distance data to a text file, one record per line with fields
    separated by a single space.

    Every output line has nine fields. Records with five elements get "NA"
    in the residue name and atom fields.

    Args:
        dist_data (DistanceDataSH): data generated with the
            calculate_distances function.
        outfile (str): path of the exported file.

    Throws:
        ValueError: If a record has neither five nor nine elements.
    """
    with open(outfile, 'w', encoding='utf8') as text_handle:
        for row in dist_data:
            n_fields = len(row)
            if n_fields == 5: # Data with no additional info.
                chain1, pos1, chain2, pos2, dist = row
                row = [
                    str(chain1), str(pos1), "NA", "NA",
                    str(chain2), str(pos2), "NA", "NA",
                    str(dist)
                ]
            elif n_fields != 9: # Nine fields already carry the extra info.
                raise ValueError("Distance data has wrong number of element")
            text_handle.write(" ".join(map(str, row)) + "\n")
def read_distances(
    distance_file: str,
    add_extra_info:bool=False
) -> DistanceDataSH:
    """
    Reads a distance data file with space separated fields.

    Only lines with exactly nine fields are read; other lines are ignored.

    Args:
        distance_file (str): The input file.
        add_extra_info (bool): If True, each record keeps the residue name
            and atom fields. Otherwise only chains, positions and distance
            are kept.

    Returns:
        DistanceDataSH. A List of distances.
    """
    out = []
    with open(distance_file, "r", encoding="utf-8") as csv_file:
        for fields in csv.reader(csv_file, delimiter=' '):
            if len(fields) != 9:
                continue
            (
                chain1, pos1, resname1, atom1,
                chain2, pos2, resname2, atom2,
                dist
            ) = fields
            if add_extra_info:
                record = [
                    chain1, int(pos1), resname1, atom1,
                    chain2, int(pos2), resname2, atom2,
                    float(dist)
                ]
            else:
                record = [chain1, int(pos1), chain2, int(pos2), float(dist)]
            out.append(record)
    return out
def contact_map_from_scpe(
    file_handle: TextIO,
    quaternary: bool = False,
    chars: str = "10"
) -> Dict[Tuple[int, int], bool]:
    """
    Read contacts from SCPE output.

    The file has sections introduced by a line with the tag "terciary" or
    "quaternary". Only the contact lines of the requested section are read.
    That section should have as many lines as positions in the protein.
    Each line should have characters from chars argument, separated by a
    space. There should be as many characters in every line as positions in
    the protein.

    The comparison of tags and contact characters is case insensitive.

    Args:
        file_handle (TextIO): handle to the contact map file.
        quaternary (bool): a boolean value that indicates if the quaternary
            section should be read instead of the terciary one.
        chars (str): Characters accepted in the contact map.
            This argument is expected to have a length of two characters.
            The first is the value for residue pairs in contact,
            the second one is the value for non contacts. Defaults to "10".

    Returns:
        Dict[Tuple[int, int], bool]. Returns a dict object from position pairs
        to boolean values that indicates that the corresponding pair is in
        contact or not. Position index start at 1.
    """
    # Lines are lowercased before matching, so chars must be lowercased too;
    # it is escaped because it is placed inside a regex character class.
    chars = chars.lower()
    allowed = re.escape(chars)
    contact_line_pattern = re.compile(f"[{allowed}]( [{allowed}])+$")
    qtag = "quaternary"
    ttag = "terciary"
    tags = [qtag, ttag]
    target_tag = qtag if quaternary else ttag
    correct_section = False
    position_index = 0
    contact_map = {}
    for line in file_handle:
        line = line.lower()
        c_match = contact_line_pattern.match(line)
        if c_match and correct_section:
            position_index += 1
            c_contacts = line.split()
            contact_map.update({
                (position_index, x+1): c == chars[0]
                for x, c in enumerate(c_contacts)})
        else:
            line = line.strip()
            # A section tag turns reading on or off.
            if any(line == t for t in tags):
                correct_section = line == target_tag
    return contact_map
def contact_map_from_text(
    file_handle: TextIO,
    chars: str = "10"
) -> Dict[Tuple[int, int], bool]:
    """
    Reads the content of a file object as a contact map.

    The file content should have as many lines as positions in the protein.
    Each line should have characters from chars argument, separated by a
    space. There should be as many characters in every line as positions in
    the protein. Lines that do not follow this layout are ignored.

    The comparison of contact characters is case insensitive.

    Args:
        file_handle (TextIO): handle to the contact map file.
        chars (str): Characters accepted in the contact map.
            This argument is expected to have a length of two characters.
            The first is the value for residue pairs in contact,
            the second one is the value for non contacts. Defaults to "10".

    Returns:
        Dict[Tuple[int, int], bool]. Returns a dict object from position pairs
        to boolean values that indicates that the corresponding pair is in
        contact or not. Position index start at 1.
    """
    # Lines are lowercased before matching, so chars must be lowercased too;
    # it is escaped because it is placed inside a regex character class.
    chars = chars.lower()
    allowed = re.escape(chars)
    contact_line_pattern = re.compile(f"[{allowed}]( [{allowed}])+$")
    position_index = 0
    contact_map = {}
    for line in file_handle:
        line = line.strip().lower()
        if not contact_line_pattern.match(line):
            continue
        position_index += 1
        contact_map.update({
            (position_index, x+1): c == chars[0]
            for x, c in enumerate(line.split())})
    return contact_map
def calculate_distances_between_regions(
    pdbsrc:PDBSource,
    chain1:Chain,
    chain2:Chain,
    region1:List[Position],
    region2:List[Position]
) -> DistanceDataLong:
    """
    Calculate the distances between the residues of two regions of a pdb
    structure.

    The distance between two residues is the shortest distance between any
    two atoms in those residues. Only the model at index 0 of the structure
    is used.

    Args:
        pdbsrc (PDBSource): An input pdb structure.
        chain1 (str): The chain ID of the first region.
        chain2 (str): The chain ID of the second region.
        region1 (List[int]): A list with the residue numbers of the first
            region.
        region2 (List[int]): A list with the residue numbers of the second
            region.

    Returns:
        DistanceDataLong: A list with the distances, atom information and
            residue information for all residue pairs between regions.
    """
    def _residues_of(structure, chain_id, wanted):
        # Residues of the given chain whose number is in the wanted list.
        picked = []
        for chain in structure[0].get_chains():
            if chain.id == chain_id:
                for residue in chain.get_residues():
                    _, number, _ = residue.id
                    if number in wanted:
                        picked.append(residue)
        return picked

    struct = pdb_structure_from(pdbsrc)
    first_region = _residues_of(struct, chain1, region1)
    second_region = _residues_of(struct, chain2, region2)
    distances = []
    for res_a, res_b in product(first_region, second_region):
        found = _shorter_distance_between_residues(
            res_a, res_b, all_atoms_selector
        )
        if found is not None:
            distances.append(found)
    return distances
Functions
def all_atoms_selector(atom1: Bio.PDB.Atom.Atom, atom2: Bio.PDB.Atom.Atom) ‑> bool
-
Accepts any two atoms.
Args
atom1
:Atom
- An Atom
atom2
:Atom
- An Atom
Returns
bool
- True, always.
Expand source code
def all_atoms_selector(atom1:Atom, atom2:Atom) -> bool: """ Accepts two any atoms. Args: atom1 (Atom): An Atom atom2 (Atom): An Atom Returns: bool: True, always. """ #pylint: disable=unused-argument return True
def calculate_distances(pdb_source: Union[str, Bio.PDB.Structure.Structure], atom_selector: Callable[[Bio.PDB.Atom.Atom, Bio.PDB.Atom.Atom], bool] = <function all_atoms_selector>, include_extra_info: bool = False) ‑> List[Union[Tuple[str, int, str, int, float], Tuple[str, int, str, str, str, int, str, str, float]]]
-
Compute distances between residues
Args
pdb_source
:PDBSource
- a path to a pdb file, a Bio.PDB.Structure or a Bio.PDB.Model
atom_selector
:AtomSelector
- all_atoms_selector. a function that allows to select pairs of atoms to include into the distance calculation.
include_extra_info
:bool
- False. If True adds residue name and atom name for each contacting atom to the output.
Returns
DistanceDataSH
- The distances calculated.
Throws
ValueError: If a PDB model cannot be found in the PDB source.
Expand source code
def calculate_distances( pdb_source:PDBSource, atom_selector:AtomSelector=all_atoms_selector, include_extra_info:bool=False ) -> DistanceDataSH: """ Compute distances between residues Args: pdb_source (PDBSource): a path to a pdb file, a Bio.PDB.Structure or a Bio.PDB.Model atom_selector (AtomSelector): all_atoms_selector. a function that allows to select pairs of atoms to include into the distance calculation. include_extra_info (bool): False. If True adds residue name and atom name for each contacting atom to the output. Returns: DistanceDataSH: The distances calculated. Throws: ValueError: If a PDB model cannot be found in the PDB source. """ model = _pick_pdb_model(pdb_source) if not model: raise ValueError("PDB source not recognized") chains = model.get_chains() out = [] for chain1, chain2 in combinations_with_replacement(chains, 2): if chain1 is chain2: res_iter = combinations(chain1, 2) else: res_iter = product(chain1, chain2) for res1, res2 in res_iter: min_res_data = None if not res1 is res2: min_res_data = _shorter_distance_between_residues( res1, res2, atom_selector ) if min_res_data: if include_extra_info: out.append(min_res_data) else: out.append(( min_res_data[0], min_res_data[1], min_res_data[4], min_res_data[5], min_res_data[8] )) return out
def calculate_distances_between_regions(pdbsrc: Union[str, Bio.PDB.Structure.Structure], chain1: str, chain2: str, region1: List[int], region2: List[int]) ‑> List[Tuple[str, int, str, str, str, int, str, str, float]]
-
Calculate the distances between residues of two regions in a pdb structure. The distance between two residues is the shortest distance between any two atoms in those residues.
Args
pdbsrc
:PDBSource
- An input pdb structure.
chain1
:str
- The chain ID of the first region.
chain2
:str
- The chain ID of the second region.
region1
:List[int]
- A list with the residue numbers of the first region.
region2
:List[int]
- A list with the residue numbers of the second region.
Returns
DistanceDataLong
- A list with the distances, atom information and residue information for all residue pairs between regions.
Expand source code
def calculate_distances_between_regions( pdbsrc:PDBSource, chain1:Chain, chain2:Chain, region1:List[Position], region2:List[Position] ) -> DistanceDataLong: """ Calculate the distances between residues of two regions in a pdb structure. The distance between two residues is the shortest distance between any two atoms in those residues. Args: pdbsrc (PDBSource): An input pdb structure. chain1 (str): The chain ID of the first region. chain2 (str): The chain ID of the second region. region1 (List[int]): A list with the residue numbers of the first region. region2 (List[int]): A list with the residue numbers of the second region. Returns: DistanceDataLong: A list with the distances, atom information and residue information for all residue pairs. Between regions. """ def _select_residues(struc, residues, c_chain): selected = [] for st_chain in struc[0].get_chains(): if not st_chain.id == c_chain: continue for res in st_chain.get_residues(): _, rid, _ = res.id if not rid in residues: continue selected.append(res) return selected struct = pdb_structure_from(pdbsrc) res1 = _select_residues(struct, region1, chain1) res2 = _select_residues( struct, region2, chain2 ) distances = [ maybe_dist for r1, r2 in product(res1, res2) for maybe_dist in [ _shorter_distance_between_residues(r1, r2, all_atoms_selector) ] if maybe_dist is not None ] return distances
def carbon_alfa_selector(atom1: Bio.PDB.Atom.Atom, atom2: Bio.PDB.Atom.Atom) ‑> bool
-
Accepts two alpha carbon atoms
Args
atom1
:Atom
- An Atom
atom2
:Atom
- An Atom
Returns
bool
- True if both atoms are alpha carbons.
Expand source code
def carbon_alfa_selector(atom1:Atom, atom2:Atom) -> bool: """ Accepts two alpha carbon atoms Args: atom1 (Atom): An Atom atom2 (Atom): An Atom Returns: bool: True if both atom are Alpha Carbons. """ return atom1.id == 'CA' and atom2.id == 'CA'
def carbon_beta_selector(atom1: Bio.PDB.Atom.Atom, atom2: Bio.PDB.Atom.Atom) ‑> bool
-
Accepts two beta carbon atoms
Args
atom1
:Atom
- An Atom
atom2
:Atom
- An Atom
Returns
bool
- True if both atoms are beta carbons.
Expand source code
def carbon_beta_selector(atom1:Atom, atom2:Atom) -> bool: """ Accepts two beta carbon atoms Args: atom1 (Atom): An Atom atom2 (Atom): An Atom Returns: bool: True, if both atoms a Beta Carbons. """ return atom1.id == 'CB' and atom2.id == 'CB'
def contact_map_from_scpe(file_handle:
, quaternary: bool = False, chars: str = '10') ‑> Dict[Tuple[int, int], bool] -
Read contact from SCPE output.
The file content should have as many lines as positions in the protein. Each line should have characters from chars argument, separated by a space. There should be as many characters in every line as positions in the protein.
Args
file_handle
:TextIO
- handle to the contact map file.
quaternary
:bool
- a boolean value that indicates if quaternary contacts should be included.
chars
:str
- Characters accepted in the contact map. This argument is expected to have a length of two characters. The first is the value for residue pairs in contact, the second one is the value for non contacts. Defaults to "10".
Returns
Dict[Tuple[int, int], bool]. Returns a dict object from position pairs to boolean values that indicates that the corresponding pair is in contact or not. Position index start at 1.
Expand source code
def contact_map_from_scpe( file_handle: TextIO, quaternary: bool = False, chars: str = "10" ) -> Dict[Tuple[int, int], bool]: """ Read contact from SCPE output. The file content should have as many lines as positions in the protein. Each line should have characters from chars argument, separated by a space. There should be as many characters in every line as positions in the protein. Args: file_handle (TextIO): handle to the contact map file. quaternary (bool): a boolean value that indicates if quaternary contacts should be included. chars (str): Characters accepted in the contact map. This argument is expected to have a length of two characters. The first is the value for residue pairs in contact, the second one is the value for non contacts. Defaults to "10". Returns: Dict[Tuple[int, int], bool]. Returns a dict object from position pairs to boolean values that indicates that the corresponding pair is in contact or not. Position index start at 1. """ contact_line_pattern = re.compile(f"[{chars}]( [{chars}])+$") qtag = "quaternary" ttag = "terciary" tags = [qtag, ttag] target_tag = qtag if quaternary else ttag correct_section = False position_index = 0 contact_map = {} for line in file_handle: line = line.lower() c_match = re.match(contact_line_pattern, line) if c_match and correct_section: position_index += 1 c_contacts = line.split() contact_map.update({ (position_index, x+1): c == chars[0] for x, c in enumerate(c_contacts)}) else: line = line.strip() if any(line == t for t in tags): correct_section = line == target_tag return contact_map
def contact_map_from_text(file_handle:
, chars: str = '10') ‑> Dict[Tuple[int, int], bool] -
Reads the content of a file object as a contact map.
The file content should have as many lines as positions in the protein. Each line should have characters from chars argument, separated by a space. There should be as many characters in every line as positions in the protein.
Args
- file_handle (TextIO): handle to the contact map file.
chars
:str
- Characters accepted in the contact map. This argument is expected to have a length of two characters. The first is the value for residue pairs in contact, the second one is the value for non contacts.
Returns
Dict[Tuple[int, int], bool]. Returns a dict object from position pairs to boolean values that indicates that the corresponding pair is in contact or not. Position index start at 1.
Expand source code
def contact_map_from_text( file_handle: TextIO, chars: str = "10" ) -> Dict[Tuple[int, int], bool]: """ Reads the content of a file object as a contact map. The file content should have as many lines as positions in the protein. Each line should have characters from chars argument, separated by a space. There should be as many characters in every line as positions in the protein. Args: file_handle (TextIO):. handle to the contact map file. chars (str): Characters accepted in the contact map. This argument is expected to have a length of two characters. The first is the value for residue pairs in contact, the second one is the value for non contacts. Returns: Dict[Tuple[int, int], bool]. Returns a dict object from position pairs to boolean values that indicates that the corresponding pair is in contact or not. Position index start at 1. """ contact_line_pattern = re.compile(f"[{chars}]( [{chars}])+$") position_index = 0 contact_map = {} for line in file_handle: line = line.strip().lower() c_match = re.match(contact_line_pattern, line) if c_match: position_index += 1 c_contacts = line.split() contact_map.update({ (position_index, x+1): c == chars[0] for x, c in enumerate(c_contacts)}) return contact_map
def from_mitos(dist_file: str) ‑> List[Tuple[str, int, str, int, float]]
-
Loads data of residue distances from a file generated by MIToS.
Input data should look like:
# model_i,chain_i,group_i,pdbe_i,number_i,name_i,model_j,chain_j,group_j,pdbe_j,number_j,name_j,distance 1,A,ATOM,,55,LEU,1,A,ATOM,,56,LEU,1.3247309160731473
Args
dist_file
:str
- A string to a text file with the distance data.
Returns
list[tuple[str, int, str, int, float]]
- The distances in the file.
Expand source code
def from_mitos(dist_file: str) -> DistanceData: """ Loads data of residue distances from a file generated by MIToS. Input data should look like: <pre> # model_i,chain_i,group_i,pdbe_i,number_i,name_i,model_j,chain_j,group_j,pdbe_j,number_j,name_j,distance 1,A,ATOM,,55,LEU,1,A,ATOM,,56,LEU,1.3247309160731473 </pre> Args: dist_file (str): A string to a text file with the distance data. Returns: list[tuple[str, int, str, int, float]]: The distances in the file. """ # pylint: disable=line-too-long d_pattern = re.compile( r"(\d+),(.),(.+),.*,(\d+),(.+),(\d+),(.),(.+),.*,(\d+),(.+),(.+)$" ) res = [] with open(dist_file, "r", encoding="utf8") as handle: for line in handle: line = line.strip() if not line.startswith("#"): match = re.match(d_pattern, line) if not match: continue try: res.append(( match.group(2), # Chain 1 int(match.group(4)), # Pos res 1 match.group(7), # Chain 2 int(match.group(9)), # Pos res 2 float(match.group(11)))) # distance except (IndexError, AttributeError): pass return res
def is_back_bone(atom: Bio.PDB.Atom.Atom) ‑> bool
-
Decides if an atom belongs to the backbone of a protein by its name.
Args
atom
:Atom
- An atom.
Returns
bool
- True if the given atom belongs to the backbone of the protein.
Expand source code
def is_back_bone(atom:Atom) -> bool: """ Decides if an atom belongs to the backbone of a prototein by their name. Args: atom (Atom): An atom. Returns: bool: True if the given atom belongs to the backbone of the protein. """ return atom.id in ['N', 'CA', 'CB']
def read_distances(distance_file: str, add_extra_info: bool = False) ‑> List[Union[Tuple[str, int, str, int, float], Tuple[str, int, str, str, str, int, str, str, float]]]
-
Read distance data file.
Args
distance_file
:str
- The input file.
add_extra_info
:bool
- Read extra information from input data.
Returns
DistanceDataSH. A List of distances.
Expand source code
def read_distances( distance_file: str, add_extra_info:bool=False ) -> DistanceDataSH: """ Read distance data file. Args: distance_file (str): The input file. add_extra_info (bool): Read extra information from input data. Returns: DistanceDataSH. A List of distances. """ out = [] with open(distance_file, "r", encoding="utf-8") as csv_file: csv_reader = csv.reader(csv_file, delimiter=' ') for row in csv_reader: if len(row) == 9: # Data with additional info. if add_extra_info: out.append([ row[0], int(row[1]), row[2], row[3], row[4], int(row[5]), row[6], row[7], float(row[8]) ]) else: out.append([ row[0], int(row[1]), row[4], int(row[5]), float(row[8]) ]) return out
def save_distances(dist_data: List[Union[Tuple[str, int, str, int, float], Tuple[str, int, str, str, str, int, str, str, float]]], outfile: str)
-
Saves distance data to a file.
Regardless of the content of the dist_data list, the output file will contain nine fields. Missing data will be filled with NA fields.
Args
dist_data
:DistanceDataSH
- data generated with the calculate_distances function.
outfile
:str
- exported file.
Throws
ValueError: If input data has wrong number of elements.
Expand source code
def save_distances( dist_data:DistanceDataSH, outfile:str ): """ Saves distance data to a file. Despite the content of the dist_data list, the output file will contain nine fields. Missing data fill filled with NA fields. Args: dist_data (DistanceDataSH): data generated with calculate_distance function. outfile (str): exported file. Throws: ValueError: If input data has wrong number of elements. """ with open(outfile, 'w', encoding='utf8') as text_handle: for row in dist_data: if len(row) == 9: # Data with additional info. pass elif len(row) == 5: # Data with no additional info. row = [ str(row[0]), str(row[1]), "NA", "NA", str(row[2]), str(row[3]), "NA", "NA", str(row[4]) ] else: raise ValueError("Distance data has wrong number of element") text_handle.write(" ".join([str(x) for x in row])) text_handle.write("\n")
def side_chain_selector(atom1: Bio.PDB.Atom.Atom, atom2: Bio.PDB.Atom.Atom) ‑> bool
-
Accepts two atoms that are part of the side chain of an amino acid.
Args
atom1
:Atom
- An Atom.
atom2
:Atom
- An Atom.
Returns
bool
- True if both atoms are part of the side chain of a residue.
Expand source code
def side_chain_selector(atom1:Atom, atom2:Atom) -> bool: """ Accepts two atoms that are part of the sidechain of an aminoacid. Args: atom1 (Atom): An Atom. atom2 (Atom): An Atom. Returns: bool: True if both atom are part of the side chain of a residue. """ return not is_back_bone(atom1) and not is_back_bone(atom2)
Classes
class Distances (dist_data: List[Tuple[str, int, str, int, float]])
-
Store and access distance data for residues from a protein structure.
Creates a new instance from distance data.
Args
dist_data
:DistanceData
- Distance data should be a list of tuples of five elements: (chain1, pos1, chain2, pos2, distance).
Expand source code
class Distances():
    """
    Store and access distance data for residues from a protein structure.

    Distances are kept in a nested dict of the form
    (chain1, pos1) -> (chain2, pos2) -> distance.
    """
    def __init__(
        self,
        dist_data: DistanceData
    ):
        """
        Creates a new instance from distance data.

        Args:
          dist_data (DistanceData): Distance data should be a list of tuples of
            five elements: (chain1, pos1, chain2, pos2, distance). If a residue
            pair appears more than once, the last distance is kept.
        """
        dis: Dict[Tuple[Chain, Position], Dict[Tuple[Chain, Position], float]] = {}
        for ch1, po1, ch2, po2, dist in dist_data:
            dis.setdefault((ch1, po1), {})[(ch2, po2)] = dist
        self._distances = dis

    def raw_distances(self) -> DistanceData:
        """
        Returns:
          DistanceData: The distance data of the object as a list of tuples.
            Each tuple has five elements: (chain1, pos1, chain2, pos2,
            distance).
        """
        return [
            (chain1, pos1, chain2, pos2, dist)
            for (chain1, pos1), c_pos in self._distances.items()
            for (chain2, pos2), dist in c_pos.items()
        ]

    def of( #pylint: disable=invalid-name
        self,
        chain_a: Chain,
        pos_a: Position,
        chain_b: Chain,
        pos_b: Position
    ) -> Optional[Distance]:
        """
        Retrieves distance for a residue pair.

        The pair is looked up as (a, b) first and, only when that entry is
        absent, as (b, a).

        Args:
          chain_a (str): A string specifying the first residue chain.
          pos_a (int): An integer specifying the first residue position.
          chain_b (str): A string specifying the second residue chain.
          pos_b (int): An integer specifying the second residue position.

        Returns:
          Optional[float]: The distance between two residue positions. If the
            pair is not found, None is returned.
        """
        pair1 = (chain_a, pos_a)
        pair2 = (chain_b, pos_b)
        if pair1 == pair2:
            # Special case for distance with the same residue.
            return 0
        distance = self._distances.get(pair1, {}).get(pair2)
        # Compare against None explicitly: a stored distance of 0 is a valid
        # value and must not be mistaken for a missing entry.
        if distance is None:
            distance = self._distances.get(pair2, {}).get(pair1)
        return distance

    def remap_positions(
        self,
        mapping: dict[Chain, dict[Position, Position]]
    ):
        """
        Remap index positions in place.

        If a position cannot be mapped, every entry that involves it is
        excluded from the results.

        Args:
          mapping (dict[str, dict[int, int]]): for each chain, a dict that maps
            old positions to new positions.
        """
        T = TypeVar("T")
        def _remap(dic: dict[tuple[str, int], T]) -> dict[tuple[str, int], T]:
            # Keep only keys whose chain and position are both in the mapping.
            return {
                (chain, mapping[chain][pos]): value
                for (chain, pos), value in dic.items()
                if pos in mapping.get(chain, {})
            }
        # Inner dicts are remapped first, then the outer keys.
        self._distances = _remap(
            {
                (c1, p1): _remap(r2)
                for (c1, p1), r2 in self._distances.items()
            }
        )

    def is_contact( #pylint: disable=too-many-arguments
        self,
        chain_a: Chain,
        pos_a: Position,
        chain_b: Chain,
        pos_b: Position,
        distance_cutoff: Distance = 6.05
    ) -> bool:
        """
        Checks whether a residue pair is in contact.

        Args:
          chain_a (str): A string specifying the first residue chain.
          pos_a (int): An integer specifying the first residue position.
          chain_b (str): A string specifying the second residue chain.
          pos_b (int): An integer specifying the second residue position.
          distance_cutoff (float): a float with the distance cutoff (defaults
            to 6.05 angstroms)

        Returns:
          bool: True if the pair's distance is lower than or equal to the
            distance cutoff. False if the pair has no known distance.
        """
        dist = self.of(chain_a, pos_a, chain_b, pos_b)
        if dist is None:
            return False
        return dist <= distance_cutoff

    @staticmethod
    def _sum_true(boolean_list: list[bool]):
        # Number of truthy elements in the list.
        return sum(1 for flag in boolean_list if flag)

    def mean_intramolecular(self) -> dict[Chain, float]:
        """
        Returns:
          dict[str, float]: for every chain, the mean number of intramolecular
            contacts across all its residues. Contacts are evaluated with the
            default cutoff of is_contact.
        """
        def _pos_contacts(chain: str, pos1: int, all_positions: list[int]):
            return [
                self.is_contact(chain, pos1, chain, pos2)
                for pos2 in all_positions
                if pos1 != pos2
            ]
        # Residues may appear only as first or only as second element of a pair.
        all_residues = set(self._distances).union(
            pair2
            for partners in self._distances.values()
            for pair2 in partners
        )
        all_chains = {chain for chain, _ in all_residues}
        pos_by_chain = {
            chain: [p for c, p in all_residues if c == chain]
            for chain in all_chains
        }
        n_contacts = {
            chain: [
                self._sum_true(_pos_contacts(chain, pos, pos_by_chain[chain]))
                for pos in pos_by_chain[chain]
            ]
            for chain in all_chains
        }
        # max(1, ...) guards the division for a chain without residues.
        return {
            chain: float(sum(n)) / max(1, len(n))
            for chain, n in n_contacts.items()
        }

    @staticmethod
    def from_contact_map(
        contact_map: Dict[Tuple[int, int], bool]
    ) -> 'Distances':
        """
        Create a new Distances object from a contact map.

        Contacts are given a distance of 1 and non contacts a distance of 10.
        The chain is set to 'A' for every residue.

        Args:
          contact_map (Dict[Tuple[int, int], bool]): maps a pair of positions
            to True when the pair is a contact.

        Returns:
          Distances: a new object built from the contact map.
        """
        dist_data = [
            ('A', pos1, 'A', pos2, 1 if is_contact else 10)
            for (pos1, pos2), is_contact in contact_map.items()
        ]
        return Distances(dist_data)
Static methods
def from_contact_map(contact_map: Dict[Tuple[int, int], bool]) ‑> Distances
-
Create a new Distances object from a contact map. Sets contacts to a distance of 1 and non-contacts to a distance of 10. Sets the chain to be 'A'.
Expand source code
@staticmethod
def from_contact_map(
    contact_map: Dict[Tuple[int, int], bool]
) -> 'Distances':
    """
    Build a Distances object out of a contact map.

    Every pair flagged as a contact gets a distance of 1, every other pair a
    distance of 10. All residues are placed in chain 'A'.

    Args:
      contact_map (Dict[Tuple[int, int], bool]): maps a pair of positions to
        True when the pair is a contact.

    Returns:
      Distances: a new object built from the contact map.
    """
    return Distances([
        ('A', first, 'A', second, 1 if in_contact else 10)
        for (first, second), in_contact in contact_map.items()
    ])
Methods
def is_contact(self, chain_a: str, pos_a: int, chain_b: str, pos_b: int, distance_cutoff: float = 6.05) ‑> bool
-
Args
chain_a
:str
- A string specifying the first residue chain.
pos_a
:int
- An integer specifying the first residue position.
chain_b
:str
- A string specifying the second residue chain.
pos_b
:int
- An integer specifying the second residue position.
distance_cutoff
:float
- a float with the distance cutoff (defaults to 6.05 angstroms)
Returns
bool
- Returns True if a given pair's distance is lower than or equal to a
given distance cutoff.
Expand source code
def is_contact( #pylint: disable=too-many-arguments
    self,
    chain_a: Chain,
    pos_a: Position,
    chain_b: Chain,
    pos_b: Position,
    distance_cutoff: Distance = 6.05
) -> bool:
    """
    Tell whether two residues are close enough to be considered a contact.

    Args:
      chain_a (str): A string specifying the first residue chain.
      pos_a (int): An integer specifying the first residue position.
      chain_b (str): A string specifying the second residue chain.
      pos_b (int): An integer specifying the second residue position.
      distance_cutoff (float): a float with the distance cutoff (defaults to
        6.05 angstroms)

    Returns:
      bool: True if the pair's distance is lower than or equal to the distance
        cutoff. False if the pair has no known distance.
    """
    measured = self.of(chain_a, pos_a, chain_b, pos_b)
    # A pair without a known distance is never a contact.
    return measured is not None and measured <= distance_cutoff
def mean_intramolecular(self) ‑> dict
-
Returns
Return the mean number of intramolecular contacts across all residues for every chain.
Expand source code
def mean_intramolecular(self) -> dict[Chain, float]:
    """
    Compute, for every chain, the mean number of intramolecular contacts
    across all residues of that chain.

    Contacts are evaluated with is_contact using its default cutoff, and a
    residue is never compared with itself.

    Returns:
      dict[str, float]: maps each chain to its mean number of contacts.
    """
    # Collect every residue, whether it appears first or second in a pair.
    residues = set(self._distances)
    for partners in self._distances.values():
        residues.update(partners)

    # Group the positions of the residues by chain.
    positions_by_chain = {}
    for chain, pos in residues:
        positions_by_chain.setdefault(chain, []).append(pos)

    means = {}
    for chain, positions in positions_by_chain.items():
        per_residue = [
            self._sum_true([
                self.is_contact(chain, pos, chain, other)
                for other in positions
                if pos != other
            ])
            for pos in positions
        ]
        # max(1, ...) guards the division against an empty list.
        means[chain] = float(sum(per_residue)) / max(1, len(per_residue))
    return means
def of(self, chain_a: str, pos_a: int, chain_b: str, pos_b: int) ‑> Optional[float]
-
Retrieves distance for a residue pair.
Args
chain_a
:str
- A string specifying the first residue chain.
pos_a
:int
- An integer specifying the first residue position.
chain_b
:str
- A string specifying the second residue chain.
pos_b
:int
- An integer specifying the second residue position.
Returns
Optional[float]
- The distance between two residue positions. If the pair is not found, None is returned.
Expand source code
def of( #pylint: disable=invalid-name
    self,
    chain_a: Chain,
    pos_a: Position,
    chain_b: Chain,
    pos_b: Position
) -> Optional[Distance]:
    """
    Retrieves distance for a residue pair.

    The pair is looked up as (a, b) first and, only when that entry is
    absent, as (b, a).

    Args:
      chain_a (str): A string specifying the first residue chain.
      pos_a (int): An integer specifying the first residue position.
      chain_b (str): A string specifying the second residue chain.
      pos_b (int): An integer specifying the second residue position.

    Returns:
      Optional[float]: The distance between two residue positions. If the pair
        is not found, None is returned.
    """
    pair1 = (chain_a, pos_a)
    pair2 = (chain_b, pos_b)
    if pair1 == pair2:
        # Special case for distance with the same residue.
        return 0
    distance = self._distances.get(pair1, {}).get(pair2)
    # Compare against None explicitly: a stored distance of 0 is a valid value
    # and must not be mistaken for a missing entry.
    if distance is None:
        distance = self._distances.get(pair2, {}).get(pair1)
    return distance
def raw_distances(self) ‑> List[Tuple[str, int, str, int, float]]
-
Returns
DistanceData
- Returns the distance data of the object as a list of tuples. Each tuple has five elements: (chain1, pos1, chain2, pos2, distance).
Expand source code
def raw_distances(self) -> DistanceData:
    """
    Flatten the stored distances into a list of rows.

    Returns:
      DistanceData: a list of tuples, one per stored residue pair. Each tuple
        has five elements: (chain1, pos1, chain2, pos2, distance).
    """
    rows: DistanceData = []
    for (chain1, pos1), partners in self._distances.items():
        rows.extend(
            (chain1, pos1, chain2, pos2, dist)
            for (chain2, pos2), dist in partners.items()
        )
    return rows
def remap_positions(self, mapping: dict)
-
Remap index positions. If a position cannot be mapped, it is excluded from the results.
Args
mapping
:dict[str, dict[int, int]]
- a dict that maps old positions to new positions.
Expand source code
def remap_positions(
    self,
    mapping: dict[Chain, dict[Position, Position]]
):
    """
    Replace residue positions, in place, using a per-chain translation table.

    Any entry whose chain or position is missing from the mapping is dropped,
    whether it appears as the first or the second residue of a pair.

    Args:
      mapping (dict[str, dict[int, int]]): for each chain, a dict that maps
        old positions to new positions.
    """
    ValueT = TypeVar("ValueT")

    def _translate(
        table: dict[tuple[str, int], ValueT]
    ) -> dict[tuple[str, int], ValueT]:
        # Rewrite the keys of one dict level, skipping unmapped residues.
        translated = {}
        for (chain, pos), value in table.items():
            chain_map = mapping.get(chain, {})
            if pos in chain_map:
                translated[(chain, chain_map[pos])] = value
        return translated

    # Inner dicts are translated first, then the outer keys.
    inner_done = {
        residue: _translate(partners)
        for residue, partners in self._distances.items()
    }
    self._distances = _translate(inner_done)