Source code for utool.oldalg

# -*- coding: utf-8 -*-
# TODO Licence:
#
# TODO:  move library intensive functions to vtool
from __future__ import absolute_import, division, print_function, unicode_literals
import operator
import six
from six.moves import zip, range, reduce
from utool import util_type
from utool import util_inject
from utool import util_decor
try:
    import numpy as np
    HAVE_NUMPY = True
except ImportError:
    HAVE_NUMPY = False
    # TODO remove numpy
    pass
try:
    import scipy.spatial.distance as spdist
    HAVE_SCIPY = True
except ImportError:
    HAVE_SCIPY = False
print, rrr, profile = util_inject.inject2(__name__, '[alg]')


PHI = 1.61803398875
PHI_A = (1 / PHI)
PHI_B = 1 - PHI_A


[docs]def bayesnet(): """ References: https://class.coursera.org/pgm-003/lecture/17 http://www.cs.ubc.ca/~murphyk/Bayes/bnintro.html http://www3.cs.stonybrook.edu/~sael/teaching/cse537/Slides/chapter14d_BP.pdf http://www.cse.unsw.edu.au/~cs9417ml/Bayes/Pages/PearlPropagation.html https://github.com/pgmpy/pgmpy.git http://pgmpy.readthedocs.org/en/latest/ http://nipy.bic.berkeley.edu:5000/download/11 """ # import operator as op # # Enumerate all possible events # varcard_list = list(map(op.attrgetter('variable_card'), cpd_list)) # _esdat = list(ut.iprod(*map(range, varcard_list))) # _escol = list(map(op.attrgetter('variable'), cpd_list)) # event_space = pd.DataFrame(_esdat, columns=_escol) # # Custom compression of event space to inspect a specific graph # def compress_space_flags(event_space, var1, var2, var3, cmp12_): # """ # var1, var2, cmp_ = 'Lj', 'Lk', op.eq # """ # import vtool as vt # data = event_space # other_cols = ut.setdiff_ordered(data.columns.tolist(), [var1, var2, var3]) # case_flags12 = cmp12_(data[var1], data[var2]).values # # case_flags23 = cmp23_(data[var2], data[var3]).values # # case_flags = np.logical_and(case_flags12, case_flags23) # case_flags = case_flags12 # case_flags = case_flags.astype(np.int64) # subspace = np.hstack((case_flags[:, None], data[other_cols].values)) # sel_ = vt.unique_row_indexes(subspace) # mask = vt.index_to_boolmask(sel_, len(data)) # flags = np.logical_and(mask, case_flags) # return flags # # Build special cases # case_same = event_space.loc[compress_space_flags(event_space, 'Li', 'Lj', 'Lk', op.eq)] # case_diff = event_space.loc[compress_space_flags(event_space, 'Li', 'Lj', 'Lk', op.ne)] # special_cases = [ # case_same, # case_diff, # ] from pgmpy.factors import TabularCPD from pgmpy.models import BayesianModel import pandas as pd from pgmpy.inference import BeliefPropagation # NOQA from pgmpy.inference import VariableElimination # NOQA name_nice = ['n1', 'n2', 'n3'] score_nice = ['low', 'high'] match_nice = ['diff', 'same'] num_names = len(name_nice) num_scores = len(score_nice) nid_basis = list(range(num_names)) score_basis = list(range(num_scores)) semtype2_nice = { 'score': score_nice, 'name': name_nice, 'match': match_nice, } var2_cpd = { } globals()['semtype2_nice'] = semtype2_nice globals()['var2_cpd'] = var2_cpd name_combo = np.array(list(ut.iprod(nid_basis, nid_basis))) combo_is_same = name_combo.T[0] == name_combo.T[1] def get_expected_scores_prob(level1, level2): part1 = combo_is_same * level1 part2 = (1 - combo_is_same) * (1 - (level2)) expected_scores_level = part1 + part2 return expected_scores_level # def make_cpd(): def name_cpd(aid): from pgmpy.factors import TabularCPD cpd = TabularCPD( variable='N' + aid, variable_card=num_names, values=[[1.0 / num_names] * num_names]) cpd.semtype = 'name' return cpd name_cpds = [name_cpd('i'), name_cpd('j'), name_cpd('k')] var2_cpd.update(dict(zip([cpd.variable for cpd in name_cpds], name_cpds))) if True: num_same_diff = 2 samediff_measure = np.array([ # get_expected_scores_prob(.12, .2), # get_expected_scores_prob(.88, .8), get_expected_scores_prob(0, 0), get_expected_scores_prob(1, 1), ]) samediff_vals = (samediff_measure / samediff_measure.sum(axis=0)).tolist() def samediff_cpd(aid1, aid2): cpd = TabularCPD( variable='A' + aid1 + aid2, variable_card=num_same_diff, values=samediff_vals, evidence=['N' + aid1, 'N' + aid2], # [::-1], evidence_card=[num_names, num_names]) # [::-1]) cpd.semtype = 'match' return cpd samediff_cpds = [samediff_cpd('i', 'j'), samediff_cpd('j', 'k'), samediff_cpd('k', 'i')] var2_cpd.update(dict(zip([cpd.variable for cpd in samediff_cpds], samediff_cpds))) if True: def score_cpd(aid1, aid2): semtype = 'score' evidence = ['A' + aid1 + aid2, 'N' + aid1, 'N' + aid2] evidence_cpds = [var2_cpd[key] for key in evidence] evidence_nice = [semtype2_nice[cpd.semtype] for cpd in evidence_cpds] evidence_card = list(map(len, evidence_nice)) evidence_states = list(ut.iprod(*evidence_nice)) variable_basis = semtype2_nice[semtype] variable_values = [] for mystate in variable_basis: row = [] for state in evidence_states: if state[0] == state[1]: if state[2] == 'same': val = .2 if mystate == 'low' else .8 else: val = 1 # val = .5 if mystate == 'low' else .5 elif state[0] != state[1]: if state[2] == 'same': val = .5 if mystate == 'low' else .5 else: val = 1 # val = .9 if mystate == 'low' else .1 row.append(val) variable_values.append(row) cpd = TabularCPD( variable='S' + aid1 + aid2, variable_card=len(variable_basis), values=variable_values, evidence=evidence, # [::-1], evidence_card=evidence_card) # [::-1]) cpd.semtype = semtype return cpd else: score_values = [ [.8, .1], [.2, .9], ] def score_cpd(aid1, aid2): cpd = TabularCPD( variable='S' + aid1 + aid2, variable_card=num_scores, values=score_values, evidence=['A' + aid1 + aid2], # [::-1], evidence_card=[num_same_diff]) # [::-1]) cpd.semtype = 'score' return cpd score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')] cpd_list = name_cpds + score_cpds + samediff_cpds else: score_measure = np.array([get_expected_scores_prob(level1, level2) for level1, level2 in zip(np.linspace(.1, .9, num_scores), np.linspace(.2, .8, num_scores))]) score_values = (score_measure / score_measure.sum(axis=0)).tolist() def score_cpd(aid1, aid2): cpd = TabularCPD( variable='S' + aid1 + aid2, variable_card=num_scores, values=score_values, evidence=['N' + aid1, 'N' + aid2], evidence_card=[num_names, num_names]) cpd.semtype = 'score' return cpd score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')] cpd_list = name_cpds + score_cpds pass input_graph = [] for cpd in cpd_list: if cpd.evidence is not None: for evar in cpd.evidence: input_graph.append((evar, cpd.variable)) name_model = BayesianModel(input_graph) name_model.add_cpds(*cpd_list) var2_cpd.update(dict(zip([cpd.variable for cpd in cpd_list], cpd_list))) globals()['var2_cpd'] = var2_cpd varnames = [cpd.variable for cpd in cpd_list] # --- PRINT CPDS --- cpd = score_cpds[0] def print_cpd(cpd): print('CPT: %r' % (cpd,)) index = semtype2_nice[cpd.semtype] if cpd.evidence is None: columns = ['None'] else: basis_lists = [semtype2_nice[var2_cpd[ename].semtype] for ename in cpd.evidence] columns = [','.join(x) for x in ut.iprod(*basis_lists)] data = cpd.get_cpd() print(pd.DataFrame(data, index=index, columns=columns)) for cpd in name_model.get_cpds(): print('----') print(cpd._str('phi')) print_cpd(cpd) # --- INFERENCE --- Ni = name_cpds[0] event_space_combos = {} event_space_combos[Ni.variable] = 0 # Set ni to always be Fred for cpd in cpd_list: if cpd.semtype == 'score': event_space_combos[cpd.variable] = list(range(cpd.variable_card)) evidence_dict = ut.all_dict_combinations(event_space_combos) # Query about name of annotation k given different event space params def pretty_evidence(evidence): return [key + '=' + str(semtype2_nice[var2_cpd[key].semtype][val]) for key, val in evidence.items()] def print_factor(factor): row_cards = factor.cardinality row_vars = factor.variables values = factor.values.reshape(np.prod(row_cards), 1).flatten() # col_cards = 1 # col_vars = [''] basis_lists = list(zip(*list(ut.iprod(*[range(c) for c in row_cards])))) nice_basis_lists = [] for varname, basis in zip(row_vars, basis_lists): cpd = var2_cpd[varname] _nice_basis = ut.take(semtype2_nice[cpd.semtype], basis) nice_basis = ['%s=%s' % (varname, val) for val in _nice_basis] nice_basis_lists.append(nice_basis) row_lbls = [', '.join(sorted(x)) for x in zip(*nice_basis_lists)] print(ut.repr3(dict(zip(row_lbls, values)), precision=3, align=True, key_order_metric='-val')) # name_belief = BeliefPropagation(name_model) name_belief = VariableElimination(name_model) import pgmpy import six # NOQA def try_query(evidence): print('--------') query_vars = ut.setdiff_ordered(varnames, list(evidence.keys())) evidence_str = ', '.join(pretty_evidence(evidence)) probs = name_belief.query(query_vars, evidence) factor_list = probs.values() joint_factor = pgmpy.factors.factor_product(*factor_list) print('P(' + ', '.join(query_vars) + ' | ' + evidence_str + ')') # print(six.text_type(joint_factor)) factor = joint_factor # NOQA # print_factor(factor) # import utool as ut print(ut.hz_str([(f._str(phi_or_p='phi')) for f in factor_list])) for evidence in evidence_dict: try_query(evidence) evidence = {'Aij': 1, 'Ajk': 1, 'Aki': 1, 'Ni': 0} try_query(evidence) evidence = {'Aij': 0, 'Ajk': 0, 'Aki': 0, 'Ni': 0} try_query(evidence) globals()['score_nice'] = score_nice globals()['name_nice'] = name_nice globals()['score_basis'] = score_basis globals()['nid_basis'] = nid_basis print('Independencies') print(name_model.get_independencies()) print(name_model.local_independencies([Ni.variable])) # name_belief = BeliefPropagation(name_model) # # name_belief = VariableElimination(name_model) # for case in special_cases: # test_data = case.drop('Lk', axis=1) # test_data = test_data.reset_index(drop=True) # print('----') # for i in range(test_data.shape[0]): # evidence = test_data.loc[i].to_dict() # probs = name_belief.query(['Lk'], evidence) # factor = probs['Lk'] # probs = factor.values # evidence_ = evidence.copy() # evidence_['Li'] = name_nice[evidence['Li']] # evidence_['Lj'] = name_nice[evidence['Lj']] # evidence_['Sij'] = score_nice[evidence['Sij']] # evidence_['Sjk'] = score_nice[evidence['Sjk']] # nice2_prob = ut.odict(zip(name_nice, probs.tolist())) # ut.print_python_code('P(Lk | {evidence}) = {cpt}'.format( # evidence=(ut.repr2(evidence_, explicit=True, nobraces=True, strvals=True)), # cpt=ut.repr3(nice2_prob, precision=3, align=True, key_order_metric='-val') # )) # for case in special_cases: # test_data = case.drop('Lk', axis=1) # test_data = test_data.drop('Lj', axis=1) # test_data = test_data.reset_index(drop=True) # print('----') # for i in range(test_data.shape[0]): # evidence = test_data.loc[i].to_dict() # query_vars = ['Lk', 'Lj'] # probs = name_belief.query(query_vars, evidence) # for queryvar in query_vars: # factor = probs[queryvar] # print(factor._str('phi')) # probs = factor.values # evidence_ = evidence.copy() # evidence_['Li'] = name_nice[evidence['Li']] # evidence_['Sij'] = score_nice[evidence['Sij']] # evidence_['Sjk'] = score_nice[evidence['Sjk']] # nice2_prob = ut.odict(zip([queryvar + '=' + x for x in name_nice], probs.tolist())) # ut.print_python_code('P({queryvar} | {evidence}) = {cpt}'.format( # query_var=query_var, # evidence=(ut.repr2(evidence_, explicit=True, nobraces=True, strvals=True)), # cpt=ut.repr3(nice2_prob, precision=3, align=True, key_order_metric='-val') # )) # _ draw model import plottool as pt import networkx as netx fig = pt.figure() # NOQA fig.clf() ax = pt.gca() netx_nodes = [(node, {}) for node in name_model.nodes()] netx_edges = [(etup[0], etup[1], {}) for etup in name_model.edges()] netx_graph = netx.DiGraph() netx_graph.add_nodes_from(netx_nodes) netx_graph.add_edges_from(netx_edges) # pos = netx.graphviz_layout(netx_graph) pos = netx.pydot_layout(netx_graph, prog='dot') netx.draw(netx_graph, pos=pos, ax=ax, with_labels=True) pt.plt.savefig('foo.png') ut.startfile('foo.png')
[docs]def bayesnet_examples(): from pgmpy.factors import TabularCPD from pgmpy.models import BayesianModel import pandas as pd student_model = BayesianModel([('D', 'G'), ('I', 'G'), ('G', 'L'), ('I', 'S')]) # we can generate some random data. raw_data = np.random.randint(low=0, high=2, size=(1000, 5)) data = pd.DataFrame(raw_data, columns=['D', 'I', 'G', 'L', 'S']) data_train = data[: int(data.shape[0] * 0.75)] student_model.fit(data_train) student_model.get_cpds() data_test = data[int(0.75 * data.shape[0]): data.shape[0]] data_test.drop('D', axis=1, inplace=True) student_model.predict(data_test) grade_cpd = TabularCPD( variable='G', variable_card=3, values=[[0.3, 0.05, 0.9, 0.5], [0.4, 0.25, 0.08, 0.3], [0.3, 0.7, 0.02, 0.2]], evidence=['I', 'D'], evidence_card=[2, 2]) difficulty_cpd = TabularCPD( variable='D', variable_card=2, values=[[0.6, 0.4]]) intel_cpd = TabularCPD( variable='I', variable_card=2, values=[[0.7, 0.3]]) letter_cpd = TabularCPD( variable='L', variable_card=2, values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]], evidence=['G'], evidence_card=[3]) sat_cpd = TabularCPD( variable='S', variable_card=2, values=[[0.95, 0.2], [0.05, 0.8]], evidence=['I'], evidence_card=[2]) student_model.add_cpds(grade_cpd, difficulty_cpd, intel_cpd, letter_cpd, sat_cpd)