xi_covutils.read_results module
Read results from covariation files
"""
Read results from covariation files
"""
from Bio import SeqIO
def from_ccmpred(infile):
    """
    Reads the results of a covariation file from CCMPRED.

    The file is read as a matrix of whitespace-separated scores, one row per
    line. Only the upper triangle of the matrix (diagonal included) is kept.

    :param infile: A string with the path of the input file.
    :return: A dictionary with tuples of indices (i, j), where i<=j, as keys
        and the score (as a float) as value. The indices i, j start at 1.
    """
    # The context manager closes the file handle as soon as the content is
    # read, instead of leaving it open until it is garbage collected.
    with open(infile) as handle:
        raw = handle.readlines()
    # Row and column numbers are shifted by one to get 1-based indices.
    scores = {(i+1, j+1): float(v)
              for i, l in enumerate(raw)
              for j, v in enumerate(l.split()) if i <= j}
    return scores
def remap_paired(cov_data, msa_file, chain_a_len, chain_a_id="1", chain_b_id="2"):
    """
    Remaps the positions of the covariation scores of a paired MSA to each
    individual ungapped chain sequence.

    Only the first record of the MSA is read. The ungapped length of the first
    chain is the number of non "-" characters in the first chain_a_len columns
    of that record. Indices up to that length are assigned to the first chain
    unchanged; larger indices are assigned to the second chain, shifted so
    that they are counted from the start of that chain.

    :param cov_data: The result of covariation scores, as a dict of indices
        (i, j) as keys and scores as values.
    :param msa_file: The path to a fasta file containing the MSA.
    :param chain_a_len: The number of positions of the MSA that corresponds to
        the first chain.
    :param chain_a_id: An identifier for the first chain.
    :param chain_b_id: An identifier for the second chain.
    :return: A new dict whose keys have the form
        ((chain_a, index_a), (chain_b, index_b)) and whose values are the
        covariation scores.
    """
    records = SeqIO.parse(msa_file, "fasta")
    # The builtin next() works on Python 2.6+ and Python 3, while the
    # iterator's .next() method only exists on Python 2.
    first_chain_seq = str(next(records).seq)[:chain_a_len]
    first_chain_ungapped_length = len(first_chain_seq.replace("-", ""))

    def _adapt_index(index):
        # Positions within the ungapped first chain keep their index.
        if index <= first_chain_ungapped_length:
            return (chain_a_id, index)
        return (chain_b_id, index-first_chain_ungapped_length)

    return {(_adapt_index(i), _adapt_index(j)): v for (i, j), v in cov_data.items()}
def remap_tuple_positions(cov_data, mapping):
    """
    Translates the positions of paired covariation data using a mapping
    defined for each chain.

    Pairs for which any of the two positions is missing in the mapping of its
    chain are left out of the result.

    :param cov_data: A dict with covariation scores, with keys of the form
        ((chain_a, index_a), (chain_b, index_b)) and scores as values.
    :param mapping: A dict with chain ids as keys and, as values, dicts that
        map from old positions to new positions.
    :return: A new dict with the same key layout and the mapped positions.
    """
    remapped = {}
    for (first, second), score in cov_data.items():
        chain_1, pos_1 = first
        chain_2, pos_2 = second
        # The mapping of the second chain is only looked up when the first
        # position can be mapped.
        if pos_1 not in mapping[chain_1]:
            continue
        if pos_2 not in mapping[chain_2]:
            continue
        new_first = (chain_1, mapping[chain_1][pos_1])
        new_second = (chain_2, mapping[chain_2][pos_2])
        remapped[(new_first, new_second)] = score
    return remapped
def remove_trivial_tuple(cov_data, min_pos_dif=5):
    """
    Filters out residue pairs of the same chain that are too close in
    sequence.

    A pair is dropped when both residues belong to the same chain and the
    absolute difference of their positions is lower than min_pos_dif. Pairs
    of residues from different chains are always kept.

    :param cov_data: The input covariation data, a dict with keys of the form
        ((chain_a, index_a), (chain_b, index_b)) and scores as values.
    :param min_pos_dif: Minimum distance in sequence that two residues of the
        same chain should have to be included. Defaults to 5.
    :return: A new dict with the pairs that were kept.
    """
    kept = {}
    for pair, score in cov_data.items():
        (chain_1, pos_1), (chain_2, pos_2) = pair
        # Positions are only compared for residues of the same chain.
        if chain_1 == chain_2 and abs(pos_2 - pos_1) < min_pos_dif:
            continue
        kept[((chain_1, pos_1), (chain_2, pos_2))] = score
    return kept
def intra_covariation(cov_data):
    """
    Collects the intra-chain residue pairs of paired covariation data,
    grouped by chain.

    Every chain id found in cov_data gets an entry in the result, even if it
    has no intra-chain pairs.

    :param cov_data: Covariation data from a paired MSA, a dict with keys of
        the form ((chain_a, index_a), (chain_b, index_b)).
    :return: A new dict with chain ids as keys and, as values, the subsets of
        cov_data whose residue pairs belong to that chain.
    """
    chain_ids = set()
    for (first_chain, _), (second_chain, _) in cov_data:
        chain_ids.add(first_chain)
        chain_ids.add(second_chain)
    by_chain = {chain_id: {} for chain_id in chain_ids}
    for pair, score in cov_data.items():
        (chain_1, pos_1), (chain_2, pos_2) = pair
        if chain_1 == chain_2:
            by_chain[chain_1][((chain_1, pos_1), (chain_2, pos_2))] = score
    return by_chain
def merge(cov_data_iter):
    """
    Combines several covariation data dicts into a single new dict.

    When the same pair is present in more than one element, the score of the
    last element that contains it is kept.

    :param cov_data_iter: An iterable where each element is covariation data.
    :return: A new dict with the pairs of every element.
    """
    merged = {}
    for cov in cov_data_iter:
        for pair, score in cov.items():
            merged[pair] = score
    return merged
def inter_covariation(cov_data):
    """
    Collects the inter-chain residue pairs of paired covariation data,
    grouped by pair of chains.

    :param cov_data: Covariation data from a paired MSA, a dict with keys of
        the form ((chain_a, index_a), (chain_b, index_b)).
    :return: A new dict with tuples of two sorted chain ids as keys and, as
        values, the subsets of cov_data whose residue pairs link those two
        chains. Within each residue pair, the residues are ordered by chain
        id.
    """
    pair_ids = set()
    for (first_chain, _), (second_chain, _) in cov_data:
        if first_chain == second_chain:
            continue
        pair_ids.add(tuple(sorted((first_chain, second_chain))))
    by_chain_pair = {pair_id: {} for pair_id in pair_ids}
    for pair, score in cov_data.items():
        (chain_1, pos_1), (chain_2, pos_2) = pair
        if chain_1 == chain_2:
            continue
        # Residues are ordered by chain id only, so the chain ids of the
        # ordered residues are also the sorted chain pair.
        ends = [(chain_1, pos_1), (chain_2, pos_2)]
        ends.sort(key=lambda end: end[0])
        chain_key = (ends[0][0], ends[1][0])
        by_chain_pair[chain_key][tuple(ends)] = score
    return by_chain_pair
Functions
def from_ccmpred(
infile)
Reads the results of the covariation files from CCMPRED. Returns a dictionary with tuples of indices (i,j), where i<=j, as keys and the score as value. The indices i,j start at 1. :param infile: A string with the path of the input file.
def from_ccmpred(infile):
    """
    Reads the results of a covariation file from CCMPRED.

    The file is read as a matrix of whitespace-separated scores, one row per
    line. Only the upper triangle of the matrix (diagonal included) is kept.

    :param infile: A string with the path of the input file.
    :return: A dictionary with tuples of indices (i, j), where i<=j, as keys
        and the score (as a float) as value. The indices i, j start at 1.
    """
    # The context manager closes the file handle as soon as the content is
    # read, instead of leaving it open until it is garbage collected.
    with open(infile) as handle:
        raw = handle.readlines()
    # Row and column numbers are shifted by one to get 1-based indices.
    scores = {(i+1, j+1): float(v)
              for i, l in enumerate(raw)
              for j, v in enumerate(l.split()) if i <= j}
    return scores
def inter_covariation(
cov_data)
Extract inter-chain interactions from paired covariation data.
Returns a new dict whose keys are chain id tuples and whose values are subsets of cov_data that correspond to inter chain residue pairs. Chain ids in tuple keys are sorted lexicographically.
:param cov_data: Covariation data from a paired MSA.
def inter_covariation(cov_data):
    """
    Collects the inter-chain residue pairs of paired covariation data,
    grouped by pair of chains.

    :param cov_data: Covariation data from a paired MSA, a dict with keys of
        the form ((chain_a, index_a), (chain_b, index_b)).
    :return: A new dict with tuples of two sorted chain ids as keys and, as
        values, the subsets of cov_data whose residue pairs link those two
        chains. Within each residue pair, the residues are ordered by chain
        id.
    """
    pair_ids = set()
    for (first_chain, _), (second_chain, _) in cov_data:
        if first_chain == second_chain:
            continue
        pair_ids.add(tuple(sorted((first_chain, second_chain))))
    by_chain_pair = {pair_id: {} for pair_id in pair_ids}
    for pair, score in cov_data.items():
        (chain_1, pos_1), (chain_2, pos_2) = pair
        if chain_1 == chain_2:
            continue
        # Residues are ordered by chain id only, so the chain ids of the
        # ordered residues are also the sorted chain pair.
        ends = [(chain_1, pos_1), (chain_2, pos_2)]
        ends.sort(key=lambda end: end[0])
        chain_key = (ends[0][0], ends[1][0])
        by_chain_pair[chain_key][tuple(ends)] = score
    return by_chain_pair
def intra_covariation(
cov_data)
Extract intra-chain interactions from paired covariation data.
Returns a new dict whose keys are chain ids and whose values are subsets of cov_data that correspond to intra chain residue pairs.
:param cov_data: Covariation data from a paired MSA.
def intra_covariation(cov_data):
    """
    Collects the intra-chain residue pairs of paired covariation data,
    grouped by chain.

    Every chain id found in cov_data gets an entry in the result, even if it
    has no intra-chain pairs.

    :param cov_data: Covariation data from a paired MSA, a dict with keys of
        the form ((chain_a, index_a), (chain_b, index_b)).
    :return: A new dict with chain ids as keys and, as values, the subsets of
        cov_data whose residue pairs belong to that chain.
    """
    chain_ids = set()
    for (first_chain, _), (second_chain, _) in cov_data:
        chain_ids.add(first_chain)
        chain_ids.add(second_chain)
    by_chain = {chain_id: {} for chain_id in chain_ids}
    for pair, score in cov_data.items():
        (chain_1, pos_1), (chain_2, pos_2) = pair
        if chain_1 == chain_2:
            by_chain[chain_1][((chain_1, pos_1), (chain_2, pos_2))] = score
    return by_chain
def merge(
cov_data_iter)
Merge covariation data.
:param cov_data_iter: An iterator where each element is covariation data.
def merge(cov_data_iter):
    """
    Combines several covariation data dicts into a single new dict.

    When the same pair is present in more than one element, the score of the
    last element that contains it is kept.

    :param cov_data_iter: An iterable where each element is covariation data.
    :return: A new dict with the pairs of every element.
    """
    merged = {}
    for cov in cov_data_iter:
        for pair, score in cov.items():
            merged[pair] = score
    return merged
def remap_paired(
cov_data, msa_file, chain_a_len, chain_a_id='1', chain_b_id='2')
Remaps the positions of the covariation scores of a paired MSA to each individual ungapped chain sequence. Returns a new dict whose keys have the form ((chain_a, index_a), (chain_b, index_b)) and whose values are the covariation scores. :param cov_data: The result of covariation scores, as a dict of indices as keys and scores as values :param msa_file: The path to a fasta file containing the MSA. :param chain_a_len: The number of positions of the MSA that corresponds to the first chain. :param chain_a_id: An identifier for the first chain. :param chain_b_id: An identifier for the second chain.
def remap_paired(cov_data, msa_file, chain_a_len, chain_a_id="1", chain_b_id="2"):
    """
    Remaps the positions of the covariation scores of a paired MSA to each
    individual ungapped chain sequence.

    Only the first record of the MSA is read. The ungapped length of the first
    chain is the number of non "-" characters in the first chain_a_len columns
    of that record. Indices up to that length are assigned to the first chain
    unchanged; larger indices are assigned to the second chain, shifted so
    that they are counted from the start of that chain.

    :param cov_data: The result of covariation scores, as a dict of indices
        (i, j) as keys and scores as values.
    :param msa_file: The path to a fasta file containing the MSA.
    :param chain_a_len: The number of positions of the MSA that corresponds to
        the first chain.
    :param chain_a_id: An identifier for the first chain.
    :param chain_b_id: An identifier for the second chain.
    :return: A new dict whose keys have the form
        ((chain_a, index_a), (chain_b, index_b)) and whose values are the
        covariation scores.
    """
    records = SeqIO.parse(msa_file, "fasta")
    # The builtin next() works on Python 2.6+ and Python 3, while the
    # iterator's .next() method only exists on Python 2.
    first_chain_seq = str(next(records).seq)[:chain_a_len]
    first_chain_ungapped_length = len(first_chain_seq.replace("-", ""))

    def _adapt_index(index):
        # Positions within the ungapped first chain keep their index.
        if index <= first_chain_ungapped_length:
            return (chain_a_id, index)
        return (chain_b_id, index-first_chain_ungapped_length)

    return {(_adapt_index(i), _adapt_index(j)): v for (i, j), v in cov_data.items()}
def remap_tuple_positions(
cov_data, mapping)
Remaps the positions of cov_data. Cov_data should be represented as a dict with keys of the form ((chain_a, index_a), (chain_b, index_b)) and scores as values.
:param cov_data: a dict with covariation scores. :param mapping: A dict to map the positions; the dict should have chain ids as keys, and values should be dicts that map from old positions to new positions.
def remap_tuple_positions(cov_data, mapping):
    """
    Translates the positions of paired covariation data using a mapping
    defined for each chain.

    Pairs for which any of the two positions is missing in the mapping of its
    chain are left out of the result.

    :param cov_data: A dict with covariation scores, with keys of the form
        ((chain_a, index_a), (chain_b, index_b)) and scores as values.
    :param mapping: A dict with chain ids as keys and, as values, dicts that
        map from old positions to new positions.
    :return: A new dict with the same key layout and the mapped positions.
    """
    remapped = {}
    for (first, second), score in cov_data.items():
        chain_1, pos_1 = first
        chain_2, pos_2 = second
        # The mapping of the second chain is only looked up when the first
        # position can be mapped.
        if pos_1 not in mapping[chain_1]:
            continue
        if pos_2 not in mapping[chain_2]:
            continue
        new_first = (chain_1, mapping[chain_1][pos_1])
        new_second = (chain_2, mapping[chain_2][pos_2])
        remapped[(new_first, new_second)] = score
    return remapped
def remove_trivial_tuple(
cov_data, min_pos_dif=5)
Removes positions from covariation data from residue pairs that are a lesser distance than five positions in sequence.
Covariation data is assumed to be a dict with keys of the form ((chain_a, index_a), (chain_b, index_b)) and scores as values.
:param cov_data: The input covariation data dict.
:param min_pos_dif=5: Minimum distance that two residues should
have to be included.
def remove_trivial_tuple(cov_data, min_pos_dif=5):
    """
    Filters out residue pairs of the same chain that are too close in
    sequence.

    A pair is dropped when both residues belong to the same chain and the
    absolute difference of their positions is lower than min_pos_dif. Pairs
    of residues from different chains are always kept.

    :param cov_data: The input covariation data, a dict with keys of the form
        ((chain_a, index_a), (chain_b, index_b)) and scores as values.
    :param min_pos_dif: Minimum distance in sequence that two residues of the
        same chain should have to be included. Defaults to 5.
    :return: A new dict with the pairs that were kept.
    """
    kept = {}
    for pair, score in cov_data.items():
        (chain_1, pos_1), (chain_2, pos_2) = pair
        # Positions are only compared for residues of the same chain.
        if chain_1 == chain_2 and abs(pos_2 - pos_1) < min_pos_dif:
            continue
        kept[((chain_1, pos_1), (chain_2, pos_2))] = score
    return kept