Module PdmContext.utils.distances
Expand source code
import statistics
import numpy as np
from numpy.linalg import norm
from numpy.fft import fft, ifft
from PdmContext.utils.structure import Context
def nearest(TargetSet: list[Context], query: Context, threshold: float, distance):
    '''
    Search the TargetSet for a context object similar to the query,
    where similar means a similarity of at least threshold.

    **Parameters**:

    **TargetSet**: A list of context objects to search for similar ones

    **query**: The query context object

    **threshold**: The similarity threshold (a real value in [0, 1])

    **distance**: A callable that takes two context objects and returns a
    tuple (similarity, parts)
    '''
    maxdist = 0
    for fp in TargetSet:
        # Only compare against contexts that precede the query in time.
        if query.timestamp > fp.timestamp:
            dist, parts = distance(query, fp)
            if dist > maxdist:
                maxdist = dist
            # Early exit once a sufficiently similar context is found.
            if maxdist > threshold:
                break
    return maxdist
def np_pearson_cor(x, y):
    """Column-wise Pearson correlation between the columns of x and y."""
    xv = x - x.mean(axis=0)
    yv = y - y.mean(axis=0)
    xvss = (xv * xv).sum(axis=0)
    yvss = (yv * yv).sum(axis=0)
    result = np.matmul(xv.transpose(), yv) / np.sqrt(np.outer(xvss, yvss))
    # Bound the values to [-1, 1] in the event of precision issues.
    return np.maximum(np.minimum(result, 1.0), -1.0)
def distance_eu_z(context1: Context, context2: Context, a, verbose=False):
    """
    Calculate the similarity between two Context objects based on two quantities:

    1) A similarity based on the Euclidean distance between the common series
    in the contexts' CD after z-normalization, computed as
    Euclidean(c1, c2) / (norm(c1) + norm(c2)) so that it lies in [0, 1].
    Each time the last n values are used (where n is the size of the shorter series).

    2) The Jaccard similarity of the edges in the CR (ignoring edge direction).

    **context1**: A context object

    **context2**: A context object

    **a**: the weight of the Euclidean similarity (the edge similarity is weighted 1 - a)

    **verbose**: if True, print diagnostic information

    **return**: a tuple (similarity, (cc_m, edge_similarity)), where similarity is in [0, 1]
    """
    b = 1 - a
    # Split the series names of the two CDs into common and uncommon keys,
    # skipping the reserved metadata keys ("interpertation" is the spelling
    # the Context structure uses).
    reserved = ("timestamp", "edges", "characterization", "interpertation")
    common_values = []
    uncommon_values = []
    for key in context1.CD.keys():
        if key not in reserved:
            if key in context2.CD.keys():
                if context1.CD[key] is not None and context2.CD[key] is not None:
                    common_values.append(key)
                else:
                    uncommon_values.append(key)
            else:
                uncommon_values.append(key)
    for key in context2.CD.keys():
        if key not in reserved and key not in context1.CD.keys():
            uncommon_values.append(key)
    if len(common_values) > 0 and a > 1e-10:
        # Require a minimum series length before comparing shapes.
        if len(context1.CD[common_values[0]]) > 3 and len(context2.CD[common_values[0]]) > 3:
            all_common_eu = []
            for key in common_values:
                size = min(len(context1.CD[key]), len(context2.CD[key]))
                if size < 2:
                    continue
                first_series = _z_norm(context1.CD[key][-size:])
                second_series = _z_norm(context2.CD[key][-size:])
                den = np.linalg.norm(first_series) + np.linalg.norm(second_series)
                if den > 0:
                    dist = np.linalg.norm(np.array(first_series) - np.array(second_series)) / den
                else:
                    dist = 0
                all_common_eu.append(dist)
            if all_common_eu:
                # Average distance -> similarity, weighted by the ratio of common series.
                in_cc_m = 1 - sum(all_common_eu) / len(all_common_eu)
                cc_m = in_cc_m * len(all_common_eu) / (len(all_common_eu) + len(uncommon_values))
            else:
                cc_m = 0
            if verbose:
                print(f"uncommon_values: {len(uncommon_values)}")
                print(f"Final cc_m = {cc_m}")
        else:
            cc_m = 0
    else:
        cc_m = 0
    if b > 1e-9:
        # Jaccard similarity of the undirected CR edges.
        common = 0
        edges1 = ignore_order(context1)
        edges2 = ignore_order(context2)
        for edge in edges1:
            for edge2 in edges2:
                if edge[0] == edge2[0] and edge[1] == edge2[1]:
                    common += 1
        if (len(edges1) + len(edges2) - common) > 0:
            similarity = common / (len(edges1) + len(edges2) - common)
        else:
            # Both edge sets are empty and Jaccard(empty, empty) is undefined,
            # so fall back to the series similarity alone.
            if a < 1e-7:
                similarity = 1
            else:
                similarity = None
    else:
        similarity = 0
    if similarity is None:
        return cc_m, (cc_m, similarity)
    # Return the combined similarity together with its two components so that
    # callers such as nearest() can unpack (similarity, parts).
    return a * cc_m + b * similarity, (cc_m, similarity)
def _z_norm(series):
    """Z-normalize a series; a constant series is mapped to all zeros."""
    if min(series) != max(series):
        ms1 = statistics.mean(series)
        ss1 = statistics.stdev(series)
        series = [(s1 - ms1) / ss1 for s1 in series]
    else:
        series = [0 for _ in range(len(series))]
    return series
def distance_cc(context1: Context, context2: Context, a, verbose=False):
    """
    Calculate the similarity between two Context objects based on two quantities:

    1) A similarity based on the shape-based distance (SBD): the maximum mean
    normalized cross-correlation over all common series in the contexts' CD,
    taken over all possible shifts (which corresponds to the minimum average
    SBD). The same shift is applied to all series, and each time the last n
    values are used (where n is the size of the shorter series). The result
    is also weighted by the ratio of common series.

    2) The Jaccard similarity of the edges in the CR (ignoring edge direction).

    **context1**: A context object

    **context2**: A context object

    **a**: the weight of the SBD similarity (the edge similarity is weighted 1 - a)

    **verbose**: if True, print diagnostic information

    **return**: a tuple (similarity, (cc_m, edge_similarity)), where similarity is in [0, 1]
    """
    b = 1 - a
    # Split the series names of the two CDs into common and uncommon keys,
    # skipping the reserved metadata keys.
    reserved = ("timestamp", "edges", "characterization", "interpertation")
    common_values = []
    uncommon_values = []
    for key in context1.CD.keys():
        if key not in reserved:
            if key in context2.CD.keys():
                if context1.CD[key] is not None and context2.CD[key] is not None:
                    common_values.append(key)
                else:
                    uncommon_values.append(key)
            else:
                uncommon_values.append(key)
    for key in context2.CD.keys():
        if key not in reserved and key not in context1.CD.keys():
            uncommon_values.append(key)
    if len(common_values) > 0 and a > 1e-10:
        # Require a minimum series length before comparing shapes.
        if len(context1.CD[common_values[0]]) > 5 and len(context2.CD[common_values[0]]) > 5:
            all_common_cc = []
            for key in common_values:
                size = min(len(context1.CD[key]), len(context2.CD[key]))
                if size < 2:
                    continue
                first_series = _z_norm(context1.CD[key][-size:])
                second_series = _z_norm(context2.CD[key][-size:])
                # Normalized cross-correlation for every possible shift.
                all_common_cc.append(_ncc_c(first_series, second_series))
            if all_common_cc:
                # Average the correlations over the series for each shift
                # (this assumes the common series have equal lengths) and
                # keep the shift with the highest mean correlation.
                all_cc_means = [
                    sum(cc[i] for cc in all_common_cc) / len(all_common_cc)
                    for i in range(len(all_common_cc[0]))
                ]
                in_cc_m = max(all_cc_means)
                position_max = all_cc_means.index(in_cc_m)
                # Map the mean correlation from [-1, 1] to [0, 1] and weight
                # it by the ratio of common series.
                in_cc_m = (in_cc_m + 1) / 2
                cc_m = in_cc_m * len(all_common_cc) / (len(all_common_cc) + len(uncommon_values))
                if verbose:
                    print(f"Max position: {position_max - len(first_series)}")
                    print(f"Common cc_m = {in_cc_m}")
                    print(f"uncommon_values: {len(uncommon_values)}")
                    print(f"Final cc_m = {cc_m}")
            else:
                cc_m = 0
        else:
            cc_m = 0
    else:
        cc_m = 0
    if b > 1e-9:
        # Jaccard similarity of the undirected CR edges.
        common = 0
        edges1 = ignore_order(context1)
        edges2 = ignore_order(context2)
        for edge in edges1:
            for edge2 in edges2:
                if edge[0] == edge2[0] and edge[1] == edge2[1]:
                    common += 1
        if (len(edges1) + len(edges2) - common) > 0:
            similarity = common / (len(edges1) + len(edges2) - common)
        else:
            # Both edge sets are empty and Jaccard(empty, empty) is undefined,
            # so fall back to the series similarity alone.
            if a < 1e-7:
                similarity = 1
            else:
                similarity = None
    else:
        similarity = 0
    if similarity is None:
        return cc_m, (cc_m, similarity)
    # Return the combined similarity together with its two components so that
    # callers such as nearest() can unpack (similarity, parts).
    return a * cc_m + b * similarity, (cc_m, similarity)
def ignore_order(context1: Context):
    """Return the de-duplicated, undirected edge list of a context's CR graph."""
    return ignore_order_list(context1.CR['edges'])
def ignore_order_list(edgeslist1):
    """Return the de-duplicated, undirected version of a list of edges."""
    edges1 = []
    for edge in edgeslist1:
        # Order each pair canonically so that (a, b) and (b, a) collapse
        # to the same undirected edge.
        if edge[0] > edge[1]:
            potential = (edge[0], edge[1])
        else:
            potential = (edge[1], edge[0])
        if potential not in edges1:
            edges1.append(potential)
    return edges1
def _sbd(x, y):
    """Shape-based distance: 1 minus the maximum normalized cross-correlation."""
    ncc = _ncc_c(x, y)
    idx = ncc.argmax()
    dist = 1 - ncc[idx]
    return dist, None
def _ncc_c(x, y):
    """Normalized cross-correlation of x and y for all shifts, computed via FFT."""
    den = np.array(norm(x) * norm(y))
    den[den == 0] = np.inf
    x_len = len(x)
    # Next power of two large enough for the full (2 * x_len - 1)-point
    # cross-correlation, for a fast FFT.
    fft_size = 1 << (2 * x_len - 1).bit_length()
    cc = ifft(fft(x, fft_size) * np.conj(fft(y, fft_size)))
    cc = np.concatenate((cc[-(x_len - 1):], cc[:x_len]))
    return np.real(cc) / den
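The private helpers _z_norm, _sbd and _ncc_c implement z-normalization and the shape-based distance (SBD), which evaluates the cross-correlation at every shift via FFT. A minimal sketch of the shift invariance, with hypothetical values:

from PdmContext.utils.distances import _ncc_c, _sbd

x = [0.0, 0.0, 1.0, 2.0, 1.0, 0.0]
y = [1.0, 2.0, 1.0, 0.0, 0.0, 0.0]  # the same pulse, shifted by two samples

ncc = _ncc_c(x, y)     # one correlation value per possible shift
dist, _ = _sbd(x, y)   # 1 - max(ncc)
print(round(dist, 6))  # ~0.0: the best alignment matches the series exactly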
Functions
def distance_cc(context1: Context, context2: Context, a, verbose=False)
-
Calculation of the similarity between two Context objects based on two quantities:
1) A similarity based on the shape-based distance (SBD): the maximum mean normalized cross-correlation over all common series in the contexts' CD, taken over all possible shifts. The same shift is applied to all series, and each time the last n values are used (where n is the size of the shorter series). The result is also weighted by the ratio of common series.
2) The Jaccard similarity of the edges in the CR (ignoring edge direction).
context1: A context object
context2: A context object
a: the weight of the SBD similarity (the edge similarity is weighted 1 - a)
verbose: if True, print diagnostic information
return: a tuple (similarity, (cc_m, edge_similarity)), where similarity is in [0, 1]
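A minimal sketch of calling distance_cc, assuming only that a context exposes the CD and CR attributes read by the function; the SimpleNamespace stand-ins and all series values are hypothetical:

from types import SimpleNamespace
from PdmContext.utils.distances import distance_cc

c1 = SimpleNamespace(
    CD={"temperature": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        "vibration": [0.1, 0.3, 0.1, 0.3, 0.1, 0.3]},
    CR={"edges": [("vibration", "temperature", "increase")]},
)
c2 = SimpleNamespace(
    CD={"temperature": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
        "vibration": [0.3, 0.1, 0.3, 0.1, 0.3, 0.1]},
    CR={"edges": [("temperature", "vibration", "increase")]},
)

# Weight the series similarity and the edge similarity equally.
similarity, (series_part, edge_part) = distance_cc(c1, c2, a=0.5)
print(edge_part)  # 1.0: the two edges coincide once direction is ignored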
def distance_eu_z(context1: Context, context2: Context, a, verbose=False)
-
Calculation of the similarity between two Context objects based on two quantities:
1) A similarity based on the Euclidean distance between the common series in the contexts' CD after z-normalization, computed as Euclidean(c1, c2) / (norm(c1) + norm(c2)) so that it lies in [0, 1]. Each time the last n values are used (where n is the size of the shorter series).
2) The Jaccard similarity of the edges in the CR (ignoring edge direction).
context1: A context object
context2: A context object
a: the weight of the Euclidean similarity (the edge similarity is weighted 1 - a)
verbose: if True, print diagnostic information
return: a tuple (similarity, (cc_m, edge_similarity)), where similarity is in [0, 1]
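A similar hypothetical sketch for distance_eu_z; the two series differ only by a constant offset, which z-normalization removes, so the series part of the similarity is 1:

from types import SimpleNamespace
from PdmContext.utils.distances import distance_eu_z

c1 = SimpleNamespace(CD={"temperature": [1.0, 2.0, 3.0, 4.0, 5.0]},
                     CR={"edges": []})
c2 = SimpleNamespace(CD={"temperature": [3.0, 4.0, 5.0, 6.0, 7.0]},
                     CR={"edges": []})

# a=1.0 uses only the series part and ignores the (empty) edge sets.
similarity, (series_part, edge_part) = distance_eu_z(c1, c2, a=1.0)
print(series_part)  # 1.0: identical shapes after z-normalization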
def ignore_order(context1: Context)
-
Return the de-duplicated, undirected edge list of a context's CR graph.
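A quick hypothetical sketch; only the CR attribute is needed:

from types import SimpleNamespace
from PdmContext.utils.distances import ignore_order

ctx = SimpleNamespace(CR={"edges": [("a", "b", "increase"),
                                    ("b", "a", "decrease")]})
print(ignore_order(ctx))  # [('b', 'a')]: both directions collapse to one undirected edge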
def ignore_order_list(edgeslist1)
-
Return the de-duplicated, undirected version of a plain list of edges.
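The list variant applies the same canonical ordering to plain edge tuples:

from PdmContext.utils.distances import ignore_order_list

edges = [("a", "b"), ("b", "a"), ("c", "a")]
print(ignore_order_list(edges))  # [('b', 'a'), ('c', 'a')]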
def nearest(TargetSet: list[Context], query: Context, threshold: float, distance)
-
Searches the TargetSet for a context object similar to the query, where similar means a similarity of at least threshold.
Parameters:
TargetSet: A list of context objects to search for similar ones
query: The query context object
threshold: The similarity threshold (a real value in [0, 1])
distance: A callable that takes two context objects and returns a tuple (similarity, parts)
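A minimal sketch of nearest with distance_cc as the distance callable; the SimpleNamespace contexts, timestamps, and series values are hypothetical:

import datetime
from types import SimpleNamespace
from PdmContext.utils.distances import distance_cc, nearest

older = SimpleNamespace(
    CD={"temperature": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]},
    CR={"edges": [("vibration", "temperature", "increase")]},
    timestamp=datetime.datetime(2024, 1, 1),
)
query = SimpleNamespace(
    CD={"temperature": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0]},
    CR={"edges": [("temperature", "vibration", "increase")]},
    timestamp=datetime.datetime(2024, 1, 2),
)

# nearest only compares the query against earlier contexts and stops
# once the similarity exceeds the threshold.
best = nearest([older], query, threshold=0.8,
               distance=lambda q, t: distance_cc(q, t, a=0.5))
print(best)  # the highest similarity found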
def np_pearson_cor(x, y)
-
Column-wise Pearson correlation between the columns of x and y, clipped to [-1, 1].
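A minimal sketch with synthetic data; np_pearson_cor correlates every column of x with every column of y:

import numpy as np
from PdmContext.utils.distances import np_pearson_cor

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 2))
y = np.column_stack([x[:, 0], -x[:, 1]])  # second column anti-correlated

r = np_pearson_cor(x, y)
print(r.shape)           # (2, 2)
print(r[0, 0], r[1, 1])  # approximately 1.0 and -1.0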