Module PdmContext.utils.distances
import statistics
import numpy as np
from numpy.linalg import norm
from numpy.fft import fft, ifft
from PdmContext.utils.structure import Context
def nearest(TargetSet: list[Context], query: Context, threshold: float, distance):
    '''
    Search TargetSet for a context object similar to query, where similar
    means a similarity of at least threshold.

    **Parameters**:

    **TargetSet**: a list of context objects to search for similar ones

    **query**: the query context object

    **threshold**: the similarity threshold (real value in [0, 1])

    **distance**: a callable that, given two contexts, returns a
    (similarity, parts) tuple, such as distance_cc
    '''
    maxdist = 0
    for fp in TargetSet:
        # only compare against contexts that precede the query in time
        if query["timestamp"] > fp["timestamp"]:
            dist, parts = distance(query, fp)
            if dist > maxdist:
                maxdist = dist
            # stop early once a sufficiently similar context is found
            if maxdist > threshold:
                break
    return maxdist
def np_pearson_cor(x, y):
    xv = x - x.mean(axis=0)
    yv = y - y.mean(axis=0)
    xvss = (xv * xv).sum(axis=0)
    yvss = (yv * yv).sum(axis=0)
    result = np.matmul(xv.transpose(), yv) / np.sqrt(np.outer(xvss, yvss))
    # bound the values to -1 to 1 in the event of precision issues
    return np.maximum(np.minimum(result, 1.0), -1.0)
def _z_norm(series):
    # z-normalize a series; a constant series maps to all zeros
    if min(series) != max(series):
        ms1 = statistics.mean(series)
        ss1 = statistics.stdev(series)
        series = [(s1 - ms1) / ss1 for s1 in series]
    else:
        series = [0 for i in range(len(series))]
    return series
def distance_cc(context1: Context, context2: Context, a, b, verbose=False):
    meta_keys = {"timestamp", "edges", "characterization", "interpertation"}
    common_values = []
    uncommon_values = []
    for key in context1.CD:
        if key in meta_keys:
            continue
        if key in context2.CD and context1.CD[key] is not None and context2.CD[key] is not None:
            common_values.append(key)
        else:
            uncommon_values.append(key)
    for key in context2.CD:
        if key not in meta_keys and key not in context1.CD:
            uncommon_values.append(key)
    if len(common_values) > 0 and a > 1e-10:
        all_common_cc = []
        for key in common_values:
            size = min(len(context1.CD[key]), len(context2.CD[key]))
            if size < 2:
                continue
            first_series = _z_norm(context1.CD[key][-size:])
            second_series = _z_norm(context2.CD[key][-size:])
            all_common_cc.append(_ncc_c(first_series, second_series))
        if len(all_common_cc) == 0:
            cc_m = 0
        else:
            # average the cross-correlation profiles over all common series,
            # truncated to the shortest profile in case series lengths differ
            min_len = min(len(cc) for cc in all_common_cc)
            all_cc_means = [sum(cc[i] for cc in all_common_cc) / len(all_common_cc)
                            for i in range(min_len)]
            in_cc_m = max(all_cc_means)
            position_max = all_cc_means.index(in_cc_m)
            in_cc_m = (in_cc_m + 1) / 2  # map cc from [-1, 1] to [0, 1]
            # penalize by the fraction of series the two contexts do not share
            cc_m = in_cc_m * len(all_common_cc) / (len(all_common_cc) + len(uncommon_values))
            if verbose:
                print(f"Max position: {position_max - len(first_series)}")
                print(f"Common cc_m = {in_cc_m}")
                print(f"uncommon_values: {len(uncommon_values)}")
                print(f"Final cc_m = {cc_m}")
    else:
        cc_m = 0
    # check common causes-characterizations via the (order-ignored) edge sets
    edges1 = ignoreOrder(context1)
    edges2 = ignoreOrder(context2)
    common = 0
    for edge in edges1:
        for edge2 in edges2:
            if edge[0] == edge2[0] and edge[1] == edge2[1]:
                common += 1
    if (len(edges1) + len(edges2) - common) > 0:
        if common == 0:
            jaccard = 0
        else:
            jaccard = common / (len(edges1) + len(edges2) - common)
        similarity = jaccard
    else:
        # Jaccard(empty, empty) is undefined; in that case use only the cc part
        if a < 1e-7:
            similarity = 1
        else:
            similarity = None
    if similarity is None:
        return cc_m, (cc_m, similarity)
    # return the combined similarity together with its parts, so that callers
    # such as nearest can always unpack a (similarity, parts) tuple
    return a * cc_m + b * similarity, (cc_m, similarity)
def ignoreOrder(context1: Context):
    # the edges of a context with direction ignored and duplicates removed
    return ignoreOrderList(context1.CR['edges'])
def ignoreOrderList(edgeslist1):
    # normalize each edge so the lexicographically larger node comes first,
    # which makes (a, b) and (b, a) compare equal; duplicates are dropped
    edges1 = []
    for edge in edgeslist1:
        if edge[0] > edge[1]:
            potential = (edge[0], edge[1])
        else:
            potential = (edge[1], edge[0])
        if potential not in edges1:
            edges1.append(potential)
    return edges1
def _sbd(x, y):
    # shape-based distance: 1 minus the maximum normalized cross-correlation
    ncc = _ncc_c(x, y)
    idx = ncc.argmax()
    dist = 1 - ncc[idx]
    return dist, None
def _ncc_c(x, y):
    # normalized cross-correlation over all shifts, computed via FFT
    den = np.array(norm(x) * norm(y))
    den[den == 0] = np.inf
    x_len = len(x)
    fft_size = 1 << (2 * x_len - 1).bit_length()
    cc = ifft(fft(x, fft_size) * np.conj(fft(y, fft_size)))
    cc = np.concatenate((cc[-(x_len - 1):], cc[:x_len]))
    return np.real(cc) / den
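A small check of the private helpers (a sketch, not part of the documented API): _sbd is largely shift-invariant because _ncc_c evaluates the normalized cross-correlation at every shift:

base = [0.0, 1.0, 0.0, 0.0]
shifted = [0.0, 0.0, 1.0, 0.0]
dist, _ = _sbd(_z_norm(base), _z_norm(shifted))
print(dist)  # small, despite the one-step shift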
Functions
def distance_cc(context1: Context, context2: Context, a, b, verbose=False)
Computes the similarity of two contexts as the weighted combination a * cc_m + b * jaccard, where cc_m is the average normalized cross-correlation of the series the two contexts share (penalized by the number of series they do not share) and jaccard is the Jaccard similarity of their order-ignored edge sets. Returns a (similarity, parts) tuple.
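A minimal usage sketch, assuming only what the source above shows: that a context exposes a CD dict of named series and a CR dict with an "edges" list. The SimpleNamespace stand-ins below are illustrative, not the real Context class:

from types import SimpleNamespace

# duck-typed stand-ins carrying the CD/CR attributes that distance_cc reads
c1 = SimpleNamespace(CD={"vibration": [0.1, 0.4, 0.9, 0.3]},
                     CR={"edges": [("load", "vibration")]})
c2 = SimpleNamespace(CD={"vibration": [0.2, 0.5, 0.8, 0.2]},
                     CR={"edges": [("vibration", "load")]})

sim, (cc_part, jaccard_part) = distance_cc(c1, c2, a=0.5, b=0.5)
print(sim, cc_part, jaccard_part)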
def ignoreOrder(context1: Context)
Returns the edges of context1 with edge direction ignored and duplicate edges removed (a thin wrapper around ignoreOrderList).
def ignoreOrderList(edgeslist1)
Same as ignoreOrder, but operates directly on a list of edges instead of a context object.
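For instance, opposite-direction duplicates collapse to a single normalized tuple:

edges = [("a", "b"), ("b", "a"), ("c", "a")]
print(ignoreOrderList(edges))  # [('b', 'a'), ('c', 'a')]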
def nearest(TargetSet: list[Context], query: Context, threshold: float, distance)
Searches TargetSet for a context object similar to query, where similar means a similarity of at least threshold, and returns the maximum similarity found.
Parameters:
TargetSet: a list of context objects to search for similar ones
query: the query context object
threshold: the similarity threshold (real value in [0, 1])
distance: a callable that, given two contexts, returns a (similarity, parts) tuple, such as distance_cc
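A minimal sketch of driving nearest with distance_cc. The FakeContext class is a hypothetical stand-in that only mimics the item access for "timestamp" and the CD/CR attributes the source above actually uses:

from functools import partial

class FakeContext(dict):
    # hypothetical stand-in: item access for "timestamp" plus CD/CR attributes
    def __init__(self, timestamp, CD, CR):
        super().__init__(timestamp=timestamp)
        self.CD = CD
        self.CR = CR

history = [FakeContext(1, {"temp": [1.0, 2.0, 3.0]}, {"edges": []}),
           FakeContext(2, {"temp": [1.1, 2.1, 2.9]}, {"edges": []})]
query = FakeContext(3, {"temp": [1.0, 2.0, 3.1]}, {"edges": []})

# use only the cross-correlation part (b = 0), so the empty edge sets are fine
best = nearest(history, query, threshold=0.8,
               distance=partial(distance_cc, a=1.0, b=0.0))
print(best)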
def np_pearson_cor(x, y)
Computes the column-wise Pearson correlation matrix between x and y, clipping the result to [-1, 1] to guard against floating-point precision issues.
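A small illustration on random data: with y's second column the negation of x's, the diagonal of the correlation matrix comes out as 1 and -1:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 2))
y = np.column_stack([x[:, 0], -x[:, 1]])
print(np_pearson_cor(x, y))  # diagonal approximately [1, -1]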