xi_covutils.roc module
Functions to compute ROC curves and calculate AUC scores.
"""
Functions to compute ROC curves and calculate AUC scores.
"""
from functools import reduce #pylint: disable=redefined-builtin
from operator import add
from itertools import groupby
def curve(binary_result):
    '''
    Builds the points of a ROC curve (not its AUC) from an ordered list of
    binary classifier results.

    The results are walked in order while counting the negatives (x axis) and
    the positives (y axis) seen so far; both axes are then scaled by their
    final counts.

    :param binary_result: an ordered iterable of True or False values.
    :return: a list of (x, y) float tuples that starts at (0.0, 0.0) and ends
        at (1.0, 1.0).
    '''
    true_count = 0
    false_count = 0
    raw_points = [(false_count, true_count)]
    for outcome in binary_result:
        if outcome:
            true_count += 1
        else:
            false_count += 1
        raw_points.append((false_count, true_count))
    # A class that never shows up would give a zero divisor; scale by 1 instead.
    x_scale = max(false_count, 1)
    y_scale = max(true_count, 1)
    scaled = []
    for seen_false, seen_true in raw_points:
        scaled.append((float(seen_false) / x_scale, float(seen_true) / y_scale))
    # When one class is absent the walk stops short of the top-right corner.
    if scaled[-1] != (1.0, 1.0):
        scaled.append((1.0, 1.0))
    return scaled
def simplify(a_curve):
    '''
    Removes the redundant points that lie along the horizontal or vertical
    lines of a curve, keeping only the two ends of each line.

    Points are grouped by consecutive runs of equal x (and then of equal y),
    so the curve is expected to be ordered.

    :param a_curve: is a list of two element tuples of float, between 0 and 1.
    :return: a new list of (x, y) tuples, ordered by y.
    '''
    # First pass: collapse every vertical run (same x) to its lowest and
    # highest y.
    ys_at_x = {}
    for x_value, run in groupby(a_curve, lambda point: point[0]):
        ys_at_x[x_value] = [y for (_, y) in run]
    after_vertical = []
    for x_value, y_values in sorted(ys_at_x.items()):
        if len(y_values) == 1:
            after_vertical.append((x_value, y_values[0]))
        else:
            after_vertical.append((x_value, min(y_values)))
            after_vertical.append((x_value, max(y_values)))

    # Second pass: collapse every horizontal run (same y) to its leftmost and
    # rightmost x.
    xs_at_y = {}
    for y_value, run in groupby(after_vertical, lambda point: point[1]):
        xs_at_y[y_value] = [x for (x, _) in run]
    after_horizontal = []
    for y_value, x_values in sorted(xs_at_y.items()):
        if len(x_values) == 1:
            after_horizontal.append((x_values[0], y_value))
        else:
            after_horizontal.append((min(x_values), y_value))
            after_horizontal.append((max(x_values), y_value))
    return after_horizontal
def auc(a_curve):
    '''
    Computes the area under a ROC curve.

    The area is integrated with the trapezoidal rule over consecutive points.
    For step-shaped curves (only horizontal and vertical segments) this gives
    exactly the same value as summing rectangles, and it is also correct for
    diagonal segments, which a rectangle sum would overestimate.

    Assumes that the first element is (0,0), the last element is (1,1) and that
    the curve has more than one element.

    :param a_curve: is a list of two element tuples of float, between 0 and 1.
    :return: the area under the curve.
    :raises ValueError: if the curve has fewer than two points.
    '''
    if len(a_curve) <= 1:
        raise ValueError("The curve needs two or more points to compute an area.")
    area = 0
    for (x1, y1), (x2, y2) in zip(a_curve[:-1], a_curve[1:]):
        # Width of the segment times its mean height.
        area += (x2 - x1) * (y1 + y2) / 2.0
    return area
def auc_n(a_curve, fpr_limit=0.05):
    '''
    Computes the area under a ROC curve from the origin until a given value of FPR.

    Only the points whose x value is lower than or equal to fpr_limit are kept.
    If none of them lies exactly at fpr_limit, the curve is extended
    horizontally up to fpr_limit at the height of the last kept point.
    The result is the raw partial area; it is not rescaled by fpr_limit.

    Assumes that the first element is (0,0), the last element is (1,1) and that
    the curve has more than one element.

    :param a_curve: is a list of two element tuples of float, between 0 and 1.
    :param fpr_limit: the largest x (FPR) value included in the area.
    :return: the area under the truncated curve, as computed by auc.
    :raises ValueError: if no point of the curve has x <= fpr_limit, or if the
        truncated curve ends up with a single point.
    '''
    truncated = [(x, y) for x, y in a_curve if x <= fpr_limit]
    if not truncated:
        # Without this guard the lookup of the last point fails with an
        # uninformative IndexError.
        raise ValueError("The curve has no points with FPR lower than or equal to fpr_limit.")
    last_x, last_y = truncated[-1]
    if last_x != fpr_limit:
        truncated.append((fpr_limit, last_y))
    return auc(truncated)
def curve_to_str(a_curve):
    '''
    Generates a string representation of the curve intended to be exported
    into a text file.

    Each point becomes one line with the form "x, y".

    :param a_curve: an iterable of two element tuples.
    :return: a string with one line per point, with no trailing newline.
    '''
    rows = []
    for x_value, y_value in a_curve:
        rows.append("{}, {}".format(x_value, y_value))
    return "\n".join(rows)
def merge_scores_and_distances(scores, distances):
    """
    Merges covariation scores and distances object data into a single
    list of (score, is_contact) tuples.

    Covariation pairs for which distances.of(...) gives a falsy value (no
    associated distance) are eliminated from the result.
    The output list follows the iteration order of scores and should be
    treated as unordered.

    :param scores: a dictionary with keys of the form ((chain1, pos1), (chain2, pos2)) and scores as values.
    :param distances: a xi_covutils.distances.Distance object
    :return: a list of (score, contact) tuples, where contact is the value
        given by distances.is_contact for the pair.
    """
    merged = []
    for ((chain_a, pos_a), (chain_b, pos_b)), cov_score in scores.items():
        if not distances.of(chain_a, pos_a, chain_b, pos_b):
            continue
        in_contact = distances.is_contact(chain_a, pos_a, chain_b, pos_b)
        merged.append((cov_score, in_contact))
    return merged
def binary_from_merged(merged, greater_is_better=True):
    """
    Creates a list of the binary classification (True, False) of contacts,
    sorted by score, from a merged list of (score, contact) tuples.

    The input list can be generated using merge_scores_and_distances function.
    Tuples with equal scores keep their relative input order.

    :param merged: a list of tuples, each tuple element is of the
        form: (float, bool), where the float is the cov score and
        the bool tells whether it corresponds to a covariation pair
        that is in contact.
    :param greater_is_better: if True, cov scores of merged are assumed to be
        better when they are greater and worse if smaller, so the output
        goes from the greatest score to the smallest one.
    :return: a list with the contact value of each tuple, best score first.
    """
    ranked = list(merged)
    ranked.sort(key=lambda pair: pair[0], reverse=greater_is_better)
    return [contact for _, contact in ranked]
Functions
def auc(
a_curve)
Computes the area under a ROC curve. Assumes that the first element is (0,0), the last element is (1,1) and that the curve has more than one element.
:param a_curve: a list of two-element tuples of floats, each between 0 and 1.
def auc(a_curve):
    '''
    Computes the area under a ROC curve.

    The area is integrated with the trapezoidal rule over consecutive points.
    For step-shaped curves (only horizontal and vertical segments) this gives
    exactly the same value as summing rectangles, and it is also correct for
    diagonal segments, which a rectangle sum would overestimate.

    Assumes that the first element is (0,0), the last element is (1,1) and that
    the curve has more than one element.

    :param a_curve: is a list of two element tuples of float, between 0 and 1.
    :return: the area under the curve.
    :raises ValueError: if the curve has fewer than two points.
    '''
    if len(a_curve) <= 1:
        raise ValueError("The curve needs two or more points to compute an area.")
    area = 0
    for (x1, y1), (x2, y2) in zip(a_curve[:-1], a_curve[1:]):
        # Width of the segment times its mean height.
        area += (x2 - x1) * (y1 + y2) / 2.0
    return area
def auc_n(
a_curve, fpr_limit=0.05)
Computes the area under a ROC curve from the origin until a given value of FPR. Assumes that the first element is (0,0), the last element is (1,1) and that the curve has more than one element.
:param a_curve: a list of two-element tuples of floats, each between 0 and 1.
def auc_n(a_curve, fpr_limit=0.05):
    '''
    Computes the area under a ROC curve from the origin until a given value of FPR.

    Only the points whose x value is lower than or equal to fpr_limit are kept.
    If none of them lies exactly at fpr_limit, the curve is extended
    horizontally up to fpr_limit at the height of the last kept point.
    The result is the raw partial area; it is not rescaled by fpr_limit.

    Assumes that the first element is (0,0), the last element is (1,1) and that
    the curve has more than one element.

    :param a_curve: is a list of two element tuples of float, between 0 and 1.
    :param fpr_limit: the largest x (FPR) value included in the area.
    :return: the area under the truncated curve, as computed by auc.
    :raises ValueError: if no point of the curve has x <= fpr_limit, or if the
        truncated curve ends up with a single point.
    '''
    truncated = [(x, y) for x, y in a_curve if x <= fpr_limit]
    if not truncated:
        # Without this guard the lookup of the last point fails with an
        # uninformative IndexError.
        raise ValueError("The curve has no points with FPR lower than or equal to fpr_limit.")
    last_x, last_y = truncated[-1]
    if last_x != fpr_limit:
        truncated.append((fpr_limit, last_y))
    return auc(truncated)
def binary_from_merged(
merged, greater_is_better=True)
Creates a sorted list of a binary classification (True, False) of contacts from a merged list of (score, contact) tuples.
The input list can be generated using merge_scores_and_distances function.
:param merged: a list of tuples, each tuple element is of the
form: (float, bool), where the float is the cov score and
the bool tells whether it corresponds to a covariation pair
that is in contact.
:param greater_is_better: if True, cov scores of merged are assumed to be
better when they are greater and worse if smaller.
def binary_from_merged(merged, greater_is_better=True):
    """
    Creates a list of the binary classification (True, False) of contacts,
    sorted by score, from a merged list of (score, contact) tuples.

    The input list can be generated using merge_scores_and_distances function.
    Tuples with equal scores keep their relative input order.

    :param merged: a list of tuples, each tuple element is of the
        form: (float, bool), where the float is the cov score and
        the bool tells whether it corresponds to a covariation pair
        that is in contact.
    :param greater_is_better: if True, cov scores of merged are assumed to be
        better when they are greater and worse if smaller, so the output
        goes from the greatest score to the smallest one.
    :return: a list with the contact value of each tuple, best score first.
    """
    ranked = list(merged)
    ranked.sort(key=lambda pair: pair[0], reverse=greater_is_better)
    return [contact for _, contact in ranked]
def curve(
binary_result)
Computes the ROC curve (not the AUC) for an ordered list of binary classifier results.
:param binary_result: a list of True or False values.
def curve(binary_result):
    '''
    Builds the points of a ROC curve (not its AUC) from an ordered list of
    binary classifier results.

    The results are walked in order while counting the negatives (x axis) and
    the positives (y axis) seen so far; both axes are then scaled by their
    final counts.

    :param binary_result: an ordered iterable of True or False values.
    :return: a list of (x, y) float tuples that starts at (0.0, 0.0) and ends
        at (1.0, 1.0).
    '''
    true_count = 0
    false_count = 0
    raw_points = [(false_count, true_count)]
    for outcome in binary_result:
        if outcome:
            true_count += 1
        else:
            false_count += 1
        raw_points.append((false_count, true_count))
    # A class that never shows up would give a zero divisor; scale by 1 instead.
    x_scale = max(false_count, 1)
    y_scale = max(true_count, 1)
    scaled = []
    for seen_false, seen_true in raw_points:
        scaled.append((float(seen_false) / x_scale, float(seen_true) / y_scale))
    # When one class is absent the walk stops short of the top-right corner.
    if scaled[-1] != (1.0, 1.0):
        scaled.append((1.0, 1.0))
    return scaled
def curve_to_str(
a_curve)
Generates a string representation of the curve intended to be exported into a text file. :param a_curve: the curve to export, a list of two-element tuples.
def curve_to_str(a_curve):
    '''
    Generates a string representation of the curve intended to be exported
    into a text file.

    Each point becomes one line with the form "x, y".

    :param a_curve: an iterable of two element tuples.
    :return: a string with one line per point, with no trailing newline.
    '''
    rows = []
    for x_value, y_value in a_curve:
        rows.append("{}, {}".format(x_value, y_value))
    return "\n".join(rows)
def merge_scores_and_distances(
scores, distances)
Merges covariation scores and distances object data into a single list.
Covariation pairs of the covariation score that do not have an associated distance are eliminated from the result. The output list has no order.
:param scores: a dictionary with keys of the form ((chain1, pos1), (chain2, pos2)) and scores as values.
:param distances: a xi_covutils.distances.Distance object
def merge_scores_and_distances(scores, distances):
    """
    Merges covariation scores and distances object data into a single
    list of (score, is_contact) tuples.

    Covariation pairs for which distances.of(...) gives a falsy value (no
    associated distance) are eliminated from the result.
    The output list follows the iteration order of scores and should be
    treated as unordered.

    :param scores: a dictionary with keys of the form ((chain1, pos1), (chain2, pos2)) and scores as values.
    :param distances: a xi_covutils.distances.Distance object
    :return: a list of (score, contact) tuples, where contact is the value
        given by distances.is_contact for the pair.
    """
    merged = []
    for ((chain_a, pos_a), (chain_b, pos_b)), cov_score in scores.items():
        if not distances.of(chain_a, pos_a, chain_b, pos_b):
            continue
        in_contact = distances.is_contact(chain_a, pos_a, chain_b, pos_b)
        merged.append((cov_score, in_contact))
    return merged
def simplify(
a_curve)
Removes redundant points lying along horizontal or vertical lines of the curve.
:param a_curve: a list of two-element tuples of floats, each between 0 and 1.
def simplify(a_curve):
    '''
    Removes the redundant points that lie along the horizontal or vertical
    lines of a curve, keeping only the two ends of each line.

    Points are grouped by consecutive runs of equal x (and then of equal y),
    so the curve is expected to be ordered.

    :param a_curve: is a list of two element tuples of float, between 0 and 1.
    :return: a new list of (x, y) tuples, ordered by y.
    '''
    # First pass: collapse every vertical run (same x) to its lowest and
    # highest y.
    ys_at_x = {}
    for x_value, run in groupby(a_curve, lambda point: point[0]):
        ys_at_x[x_value] = [y for (_, y) in run]
    after_vertical = []
    for x_value, y_values in sorted(ys_at_x.items()):
        if len(y_values) == 1:
            after_vertical.append((x_value, y_values[0]))
        else:
            after_vertical.append((x_value, min(y_values)))
            after_vertical.append((x_value, max(y_values)))

    # Second pass: collapse every horizontal run (same y) to its leftmost and
    # rightmost x.
    xs_at_y = {}
    for y_value, run in groupby(after_vertical, lambda point: point[1]):
        xs_at_y[y_value] = [x for (x, _) in run]
    after_horizontal = []
    for y_value, x_values in sorted(xs_at_y.items()):
        if len(x_values) == 1:
            after_horizontal.append((x_values[0], y_value))
        else:
            after_horizontal.append((min(x_values), y_value))
            after_horizontal.append((max(x_values), y_value))
    return after_horizontal