Module PdmContext.utils.distances

Expand source code
import statistics
import numpy as np
from numpy.linalg import norm
from numpy.fft import fft, ifft

from PdmContext.utils.structure import Context


def nearest(TargetSet :list[Context], query : Context, threshold: float,distance):
    '''
    Scan the TargetSet for the context most similar to the query and report that similarity.

    Only contexts whose "timestamp" is strictly earlier than the query's are compared.
    The scan ends as soon as a similarity above the threshold has been seen.

    **Parameters**:

    **TargetSet**: A list of context objects to search for similar ones

    **query** : The query context object

    **threshold** : The similarity threshold (real value in [0,1]); exceeding it stops the search

    **distance** : Callable invoked as distance(query, candidate) that returns a
    (similarity, parts) pair; only the similarity is used here

    **Returns**: The largest similarity seen before the scan ended, or 0 when no
    candidate scored above 0
    '''
    best = 0
    for candidate in TargetSet:
        # Candidates that are not strictly older than the query are ignored.
        if not query["timestamp"] > candidate["timestamp"]:
            continue
        score, _parts = distance(query, candidate)
        if not score > best:
            continue
        best = score
        if best > threshold:
            # A sufficiently similar context exists; no need to look further.
            break
    return best


def np_pearson_cor(x, y):
    '''
    Pearson correlation between the columns of x and the columns of y.

    Both inputs are centered along axis 0; entry [i, j] of the result relates
    column i of x to column j of y.
    Assumes 2-D numpy arrays with the same number of rows - confirm against callers.
    A column with zero variance makes the denominator zero, so the matching
    entries are not finite.
    '''
    x_centered = x - x.mean(axis=0)
    y_centered = y - y.mean(axis=0)
    x_sq_sum = (x_centered * x_centered).sum(axis=0)
    y_sq_sum = (y_centered * y_centered).sum(axis=0)
    cross_products = np.matmul(x_centered.transpose(), y_centered)
    scale = np.sqrt(np.outer(x_sq_sum, y_sq_sum))
    # Round-off can push values marginally outside [-1, 1]; clamp them back.
    return np.clip(cross_products / scale, -1.0, 1.0)



def _z_norm(series):
    '''
    Z-normalize a series: subtract its mean and divide by its sample standard deviation.

    A series whose values are all equal is returned as a list of zeros of the
    same length, which avoids dividing by a zero deviation.
    '''
    if min(series) == max(series):
        return [0] * len(series)
    center = statistics.mean(series)
    spread = statistics.stdev(series)
    return [(value - center) / spread for value in series]

def distance_cc(context1 : Context, context2 : Context,a,b,verbose=False):
    '''
    Similarity of two contexts: a weighted mix of a cross-correlation score over
    their shared data series and the Jaccard index of their edges.

    **Parameters**:

    **context1**, **context2** : The contexts to compare. Series are read from
    their `CD` mappings and edges from `CR['edges']` (through `ignoreOrder`).

    **a** : Weight of the cross-correlation part; the part is skipped (score 0)
    when a <= 1e-10.

    **b** : Weight of the edge (Jaccard) part.

    **verbose** : When True, print the intermediate cross-correlation figures.

    **Returns**: `a * cc_m + b * similarity` as a single number. When neither
    context has edges and a >= 1e-7, the tuple `(cc_m, (cc_m, None))` is
    returned instead.

    NOTE(review): the two return shapes differ (bare number vs. tuple), while
    `nearest` in this module unpacks `(dist, parts)` from its distance callable.
    The shapes are left untouched here to stay compatible with existing
    callers - confirm which one is intended.
    '''
    ignored_keys = ("timestamp", "edges", "characterization", "interpertation")

    # Split the series names into those usable in both contexts and the rest.
    common_values = []
    uncommon_values = []
    for key in context1.CD.keys():
        if key in ignored_keys:
            continue
        if (key in context2.CD.keys()
                and context1.CD[key] is not None
                and context2.CD[key] is not None):
            common_values.append(key)
        else:
            uncommon_values.append(key)
    for key in context2.CD.keys():
        if key not in ignored_keys and key not in context1.CD.keys():
            uncommon_values.append(key)

    cc_m = 0
    if len(common_values) > 0 and a > 0.0000000001:
        all_common_cc = []
        first_series = []
        for key in common_values:
            sizee = min(len(context1.CD[key]), len(context2.CD[key]))
            if sizee < 2:
                # Too short to normalize (the sample deviation needs two points).
                continue
            # Align on the most recent `sizee` samples of each series.
            first_series = _z_norm(context1.CD[key][-sizee:])
            second_series = _z_norm(context2.CD[key][-sizee:])
            all_common_cc.append(_ncc_c(first_series, second_series))

        # Every common series may have been skipped above; previously this
        # raised IndexError on all_common_cc[0]. Treat it as "no
        # cross-correlation evidence" and keep cc_m at 0.
        if all_common_cc:
            n_series = len(all_common_cc)
            # Average the cross-correlation across series, shift by shift.
            all_cc_means = [
                sum(cc[i] for cc in all_common_cc) / n_series
                for i in range(len(all_common_cc[0]))
            ]
            in_cc_m = max(all_cc_means)
            position_max = all_cc_means.index(in_cc_m)
            # Map the peak from [-1, 1] to [0, 1].
            in_cc_m = (in_cc_m + 1) / 2
            # Series missing from either context scale the score down.
            cc_m = in_cc_m * n_series / (n_series + len(uncommon_values))
            if verbose:
                print(f"Max position: {position_max-len(first_series)}")
                print(f"Common cc_m = {in_cc_m}")
                print(f"uncommon_values: {len(uncommon_values)}")
                print(f"Final cc_m = {cc_m}")

    # Jaccard index over the order-insensitive, de-duplicated edges.
    edges1 = ignoreOrder(context1)
    edges2 = ignoreOrder(context2)
    common = sum(1 for edge in edges1 if edge in edges2)
    union_size = len(edges1) + len(edges2) - common

    if union_size > 0:
        similarity = common / union_size
    elif a < 0.0000001:
        # Both edge sets are empty and only edges are weighted: treat as identical.
        similarity = 1
    else:
        # Jaccard(empty, empty) is undefined; only the first part is used.
        similarity = None

    if similarity is None:
        return cc_m, (cc_m, similarity)
    return a * cc_m + b * similarity

def ignoreOrder(context1: Context):
    '''
    Return the edges in `context1.CR['edges']` as unique, direction-free pairs.

    Each pair is stored as (larger endpoint, smaller endpoint), so an edge and
    its reverse collapse into one entry. First-seen order is preserved.
    '''
    unique_pairs = []
    for edge in context1.CR['edges']:
        head, tail = edge[0], edge[1]
        pair = (head, tail) if head > tail else (tail, head)
        if pair in unique_pairs:
            continue
        unique_pairs.append(pair)
    return unique_pairs

def ignoreOrderList(edgeslist1):
    '''
    Return the given edges as unique, direction-free pairs.

    Each pair is stored as (larger endpoint, smaller endpoint), so an edge and
    its reverse collapse into one entry. First-seen order is preserved.
    '''
    unique_pairs = []
    for edge in edgeslist1:
        head, tail = edge[0], edge[1]
        pair = (head, tail) if head > tail else (tail, head)
        if pair in unique_pairs:
            continue
        unique_pairs.append(pair)
    return unique_pairs


def _sbd( x, y):
    '''
    Distance taken from the peak of the normalized cross-correlation of x and y:
    one minus the largest value returned by `_ncc_c`.

    Returns a `(distance, None)` pair; the second slot is always None.
    '''
    cross_corr = _ncc_c(x, y)
    return 1 - cross_corr.max(), None


def _ncc_c( x, y):
    '''
    Normalized cross-correlation of x and y at every shift, computed with FFTs.

    The result has 2 * len(x) - 1 entries; index len(x) - 1 is the zero shift.
    Values are divided by norm(x) * norm(y); when that product is zero the
    divisor becomes infinity, so the result is all zeros rather than NaN.
    Only len(x) is consulted - presumably x and y have equal length; confirm
    against callers.
    '''
    den = norm(x) * norm(y)
    if den == 0:
        # np.inf, not the np.Inf alias: the alias was removed in NumPy 2.0.
        den = np.inf

    x_len = len(x)
    # Power-of-two padding at least 2 * x_len - 1 long, so the circular
    # correlation does not wrap around.
    fft_size = 1 << (2 * x_len - 1).bit_length()
    cc = ifft(fft(x, fft_size) * np.conj(fft(y, fft_size)))
    # Negative shifts sit at the tail of cc. The slice is anchored from the
    # front because cc[-0:] (when x_len == 1) would select the whole array.
    cc = np.concatenate((cc[fft_size - (x_len - 1):], cc[:x_len]))
    return np.real(cc) / den

Functions

def distance_cc(context1: Context, context2: Context, a, b, verbose=False)
Expand source code
def distance_cc(context1 : Context, context2 : Context,a,b,verbose=False):
    '''
    Similarity of two contexts: a weighted mix of a cross-correlation score over
    their shared data series and the Jaccard index of their edges.

    **Parameters**:

    **context1**, **context2** : The contexts to compare. Series are read from
    their `CD` mappings and edges from `CR['edges']` (through `ignoreOrder`).

    **a** : Weight of the cross-correlation part; the part is skipped (score 0)
    when a <= 1e-10.

    **b** : Weight of the edge (Jaccard) part.

    **verbose** : When True, print the intermediate cross-correlation figures.

    **Returns**: `a * cc_m + b * similarity` as a single number. When neither
    context has edges and a >= 1e-7, the tuple `(cc_m, (cc_m, None))` is
    returned instead.

    NOTE(review): the two return shapes differ (bare number vs. tuple), while
    `nearest` in this module unpacks `(dist, parts)` from its distance callable.
    The shapes are left untouched here to stay compatible with existing
    callers - confirm which one is intended.
    '''
    ignored_keys = ("timestamp", "edges", "characterization", "interpertation")

    # Split the series names into those usable in both contexts and the rest.
    common_values = []
    uncommon_values = []
    for key in context1.CD.keys():
        if key in ignored_keys:
            continue
        if (key in context2.CD.keys()
                and context1.CD[key] is not None
                and context2.CD[key] is not None):
            common_values.append(key)
        else:
            uncommon_values.append(key)
    for key in context2.CD.keys():
        if key not in ignored_keys and key not in context1.CD.keys():
            uncommon_values.append(key)

    cc_m = 0
    if len(common_values) > 0 and a > 0.0000000001:
        all_common_cc = []
        first_series = []
        for key in common_values:
            sizee = min(len(context1.CD[key]), len(context2.CD[key]))
            if sizee < 2:
                # Too short to normalize (the sample deviation needs two points).
                continue
            # Align on the most recent `sizee` samples of each series.
            first_series = _z_norm(context1.CD[key][-sizee:])
            second_series = _z_norm(context2.CD[key][-sizee:])
            all_common_cc.append(_ncc_c(first_series, second_series))

        # Every common series may have been skipped above; previously this
        # raised IndexError on all_common_cc[0]. Treat it as "no
        # cross-correlation evidence" and keep cc_m at 0.
        if all_common_cc:
            n_series = len(all_common_cc)
            # Average the cross-correlation across series, shift by shift.
            all_cc_means = [
                sum(cc[i] for cc in all_common_cc) / n_series
                for i in range(len(all_common_cc[0]))
            ]
            in_cc_m = max(all_cc_means)
            position_max = all_cc_means.index(in_cc_m)
            # Map the peak from [-1, 1] to [0, 1].
            in_cc_m = (in_cc_m + 1) / 2
            # Series missing from either context scale the score down.
            cc_m = in_cc_m * n_series / (n_series + len(uncommon_values))
            if verbose:
                print(f"Max position: {position_max-len(first_series)}")
                print(f"Common cc_m = {in_cc_m}")
                print(f"uncommon_values: {len(uncommon_values)}")
                print(f"Final cc_m = {cc_m}")

    # Jaccard index over the order-insensitive, de-duplicated edges.
    edges1 = ignoreOrder(context1)
    edges2 = ignoreOrder(context2)
    common = sum(1 for edge in edges1 if edge in edges2)
    union_size = len(edges1) + len(edges2) - common

    if union_size > 0:
        similarity = common / union_size
    elif a < 0.0000001:
        # Both edge sets are empty and only edges are weighted: treat as identical.
        similarity = 1
    else:
        # Jaccard(empty, empty) is undefined; only the first part is used.
        similarity = None

    if similarity is None:
        return cc_m, (cc_m, similarity)
    return a * cc_m + b * similarity
def ignoreOrder(context1: Context)
Expand source code
def ignoreOrder(context1: Context):
    '''
    Return the edges in `context1.CR['edges']` as unique, direction-free pairs.

    Each pair is stored as (larger endpoint, smaller endpoint), so an edge and
    its reverse collapse into one entry. First-seen order is preserved.
    '''
    unique_pairs = []
    for edge in context1.CR['edges']:
        head, tail = edge[0], edge[1]
        pair = (head, tail) if head > tail else (tail, head)
        if pair in unique_pairs:
            continue
        unique_pairs.append(pair)
    return unique_pairs
def ignoreOrderList(edgeslist1)
Expand source code
def ignoreOrderList(edgeslist1):
    '''
    Return the given edges as unique, direction-free pairs.

    Each pair is stored as (larger endpoint, smaller endpoint), so an edge and
    its reverse collapse into one entry. First-seen order is preserved.
    '''
    unique_pairs = []
    for edge in edgeslist1:
        head, tail = edge[0], edge[1]
        pair = (head, tail) if head > tail else (tail, head)
        if pair in unique_pairs:
            continue
        unique_pairs.append(pair)
    return unique_pairs
def nearest(TargetSet: list[Context], query: Context, threshold: float, distance)

This method searches the TargetSet for a context object similar to the query, where "similar" means having a similarity of at least the threshold.

Parameters:

TargetSet: A list of context objects to search for similar ones

query : The query context object

threshold : The similarity threshold (real value in [0,1])

Expand source code
def nearest(TargetSet :list[Context], query : Context, threshold: float,distance):
    '''
    Scan the TargetSet for the context most similar to the query and report that similarity.

    Only contexts whose "timestamp" is strictly earlier than the query's are compared.
    The scan ends as soon as a similarity above the threshold has been seen.

    **Parameters**:

    **TargetSet**: A list of context objects to search for similar ones

    **query** : The query context object

    **threshold** : The similarity threshold (real value in [0,1]); exceeding it stops the search

    **distance** : Callable invoked as distance(query, candidate) that returns a
    (similarity, parts) pair; only the similarity is used here

    **Returns**: The largest similarity seen before the scan ended, or 0 when no
    candidate scored above 0
    '''
    best = 0
    for candidate in TargetSet:
        # Candidates that are not strictly older than the query are ignored.
        if not query["timestamp"] > candidate["timestamp"]:
            continue
        score, _parts = distance(query, candidate)
        if not score > best:
            continue
        best = score
        if best > threshold:
            # A sufficiently similar context exists; no need to look further.
            break
    return best
def np_pearson_cor(x, y)
Expand source code
def np_pearson_cor(x, y):
    '''
    Pearson correlation between the columns of x and the columns of y.

    Both inputs are centered along axis 0; entry [i, j] of the result relates
    column i of x to column j of y.
    Assumes 2-D numpy arrays with the same number of rows - confirm against callers.
    A column with zero variance makes the denominator zero, so the matching
    entries are not finite.
    '''
    x_centered = x - x.mean(axis=0)
    y_centered = y - y.mean(axis=0)
    x_sq_sum = (x_centered * x_centered).sum(axis=0)
    y_sq_sum = (y_centered * y_centered).sum(axis=0)
    cross_products = np.matmul(x_centered.transpose(), y_centered)
    scale = np.sqrt(np.outer(x_sq_sum, y_sq_sum))
    # Round-off can push values marginally outside [-1, 1]; clamp them back.
    return np.clip(cross_products / scale, -1.0, 1.0)