Module cognet.cognet
import numpy as np
import pandas as pd
import random
from quasinet.qnet import Qnet, qdistance, load_qnet, qdistance_matrix
from quasinet.qsampling import qsample, targeted_qsample
#from mpi4py.futures import MPIPoolExecutor
import sys
import subprocess
from scipy.stats import entropy
import multiprocessing as mp
import time
from cognet.util import embed_to_pca
import pkgutil
import os
class cognet:
"""Aggregate related Qnet functions
"""
def __init__(self):
"""Init
"""
self.year = None
self.n_jobs = 28
self.qnet = None
self.steps = 120
self.num_qsamples = None
self.all_samples = None
self.samples = None
self.samples_as_strings = None
self.features = None
self.cols = None
self.immutable_vars = None
self.mutable_vars = None
self.poles = None
self.polar_features = None
self.polar_indices = None
self.poles_dict = {}
self.d0 = None
self.qdistance_matrix_file = None
self.dissonance_file = None
self.s_null = None
        self.D_null = None
        self.pL = None
        self.pR = None
self.mask_prob = 0.5
self.variation_weight = None
self.polar_matrix = None
def load_from_model(self,
model,
data_obj,
key,
im_vars=None,
m_vars=None):
"""load parameters from model object
Args:
model (Class): model obj for loading parameters
data_obj (class): instance of dataformatter class
key (str): 'all', 'train', or 'test', corresponding to sample type
im_vars (list[str], optional): Not implemented yet. Defaults to None.
m_vars (list[str], optional): Not implemented yet. Defaults to None.
"""
if model is not None:
self.qnet = model.myQnet
# self.cols = np.array(model.features)
featurenames, samples = data_obj.format_samples(key)
samples = pd.DataFrame(samples)
self.cols = featurenames
self.features = pd.DataFrame(columns=np.array(featurenames))
if any(x is not None for x in [model.immutable_vars, model.mutable_vars]):
if model.immutable_vars is not None:
self.immutable_vars = model.immutable_vars
self.mutable_vars = [x for x in self.features if x not in self.immutable_vars]
elif model.mutable_vars is not None:
self.mutable_vars = model.mutable_vars
self.immutable_vars = [x for x in self.features if x not in self.mutable_vars]
else:
self.mutable_vars = self.features
self.samples = pd.DataFrame(samples)
self.samples.columns = np.array(featurenames)
self.all_samples = self.samples
self.samples_as_strings = self.samples[featurenames].fillna('').values.astype(str)[:]
self.s_null=['']*len(self.samples_as_strings[0])
self.D_null=self.qnet.predict_distributions(self.s_null)
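            # variation weight per feature: entropy of the null (unconditioned)
            # distribution, with base = number of categories so weights lie in [0, 1]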
variation_weight = []
for d in self.D_null:
v=[]
for val in d.values():
v=np.append(v,val)
variation_weight.append(entropy(v,base=len(v)))
self.variation_weight = variation_weight
def load_from_dataformatter(self,
data_obj,
key):
"""read in either train or test data, specified by key, from data obj
Args:
data_obj (class): instance of dataformatter class
key (str): 'all', 'train', or 'test', corresponding to sample type
"""
featurenames, samples = data_obj.format_samples(key)
if any(x is not None for x in [self.features, self.samples]):
print("replacing original features/samples with dataformatter data")
self.cols = featurenames
self.features = pd.DataFrame(columns=self.cols)
        self.samples = pd.DataFrame(samples, columns=self.cols)
self.all_samples = self.samples
self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
self.s_null=['']*len(self.samples_as_strings[0])
return featurenames, samples
def load_data(self,
year,
features_by_year,
samples,
qnet):
'''load cols, features, samples, and qnet.
Args:
year (str): to identify cols/features.
features_by_year (str): file containing all features by year of the dataset.
samples (str): file of samples for that year.
            qnet (str): Qnet file location.
'''
self.qnet = load_qnet(qnet)
self.year = year
self.cols = np.array((pd.read_csv(features_by_year,
keep_default_na=True,
index_col=0).set_index(
'year')).loc[int(year)].apply(
eval).values[0])
self.features = pd.DataFrame(columns=self.cols)
self.mutable_vars = [x for x in self.cols]
self.samples=pd.read_csv(samples)
self.samples = pd.concat([self.samples,self.features], axis=0)
self.all_samples = self.samples
self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
self.s_null=['']*len(self.samples_as_strings[0])
self.D_null=self.qnet.predict_distributions(self.s_null)
variation_weight = []
for d in self.D_null:
v=[]
for val in d.values():
v=np.append(v,val)
variation_weight.append(entropy(v,base=len(v)))
self.variation_weight = variation_weight
def set_immutable_vars(self,
IMMUTABLE_FILE):
        '''partition the features into immutable and mutable sets
Args:
IMMUTABLE_FILE (str): file containing the immutable features/vars
'''
if self.cols is None:
raise ValueError("load_data first!")
self.immutable_vars = pd.read_csv(IMMUTABLE_FILE,index_col=0).transpose()
self.mutable_vars = None
self.mutable_vars = [x for x in self.cols
if x.upper() not in self.immutable_vars.columns]
def set_nsamples(self,
num_samples):
'''select a subset of the samples
Args:
num_samples (int): Set num of samples to subset
'''
self.samples = self.all_samples
if all(x is not None for x in [num_samples, self.samples]):
if num_samples > len(self.samples.index):
string = 'The number of selected samples ({}) ' + \
'is greater than the number of samples ({})!'
string = string.format(num_samples, len(self.samples.index))
raise ValueError(string)
if num_samples == len(self.samples.index):
string = 'The number of selected samples ({}) ' + \
'is equal to the number of samples ({})!'
string = string.format(num_samples, len(self.samples.index))
print(string)
self.samples = self.samples.sample(num_samples)
self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
elif self.samples is None:
raise ValueError("load_data first!")
def __variation_weight(self,
index):
"""
"""
d_=self.D_null[index]
v=[]
for val in d_.values():
v=np.append(v,val)
return entropy(v,base=len(v))
def getBaseFrequency(self,
sample):
'''get frequency of the variables
helper func for qsampling
Args:
            sample (list[str]): sample vector; currently unused, since the base
                frequencies are derived from mutable_vars and the variation weights
'''
MUTABLE=pd.DataFrame(np.zeros(len(self.cols)),index=self.cols).transpose()
for m in self.mutable_vars:
MUTABLE[m]=1.0
mutable_x=MUTABLE.values[0]
base_frequency=mutable_x/mutable_x.sum()
        # weight each mutable variable by the normalized entropy of its null distribution
for i in range(len(base_frequency)):
if base_frequency[i]>0.0:
base_frequency[i]= self.variation_weight[i]*base_frequency[i]
return base_frequency/base_frequency.sum()
def qsampling(self,
sample,
steps,
immutable=False):
        '''perturb the sample based on the qnet distributions and the number of steps
Args:
sample (1d array-like): sample vector, must have the same num of features as the qnet
steps (int): number of steps to qsample
immutable (bool): are there variables that are immutable?
'''
if all(x is not None for x in [self.mutable_vars, sample]):
            if immutable:
                return qsample(sample, self.qnet, steps,
                               self.getBaseFrequency(sample))
else:
return qsample(sample,self.qnet,steps)
elif self.mutable_vars is None:
raise ValueError("load_data first!")
def set_poles(self,
POLEFILE,
pole_1,
pole_2,
steps=0,
mutable=False):
'''set the poles and samples such that the samples contain features in poles
Args:
            POLEFILE (str): file containing pole samples and features
            pole_1 (str): column name of the first pole to use
            pole_2 (str): column name of the second pole to use
            steps (int): number of steps to qsample. Defaults to 0.
            mutable (bool): whether to set the pole features as the only mutable vars. Defaults to False.
'''
invalid_count = 0
if all(x is not None for x in [self.samples, self.qnet]):
poles = pd.read_csv(POLEFILE, index_col=0)
self.poles=poles.transpose()
self.polar_features = pd.concat([self.poles, self.features], axis=0)
poles_dict = {}
for column in poles:
p_ = self.polar_features.loc[column][self.cols].fillna('').values.astype(str)[:]
poles_dict[column] = self.qsampling(p_,steps)
self.poles_dict = poles_dict
self.pL = self.poles_dict[pole_1]
self.pR = self.poles_dict[pole_2]
# self.pL = list(poles_dict.values())[0]
# self.pR = list(poles_dict.values())[1]
self.d0 = qdistance(self.pL, self.pR, self.qnet, self.qnet)
cols = [x for x in self.poles.columns if x in self.samples.columns]
self.samples=self.samples[cols]
for x in self.poles.columns:
if x not in self.samples.columns:
invalid_count += 1
self.samples[x]=np.nan
self.samples = pd.concat([self.samples,self.features], axis=0)
self.all_samples = self.samples
self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
if mutable:
self.mutable_vars=[x for x in self.cols if x in self.poles.columns]
elif self.samples is None:
raise ValueError("load_data first!")
print("{} pole features not found in sample features".format(invalid_count))
def distance(self,
sample1,
sample2,
nsteps1=0,
nsteps2=0):
"""qsamples each sample set num of steps, then takes qdistance
Args:
sample1 (list[str]): sample vector 1, must have the same num of features as the qnet
sample2 (list[str]): sample vector 2, must have the same num of features as the qnet
nsteps1 (int, optional): number of steps to qsample for sample1
nsteps2 (int, optional): number of steps to qsample for sample2
Returns:
float: qdistance
"""
if self.qnet is None:
raise ValueError("load qnet first!")
        sample1 = qsample(sample1, self.qnet, nsteps1)
        sample2 = qsample(sample2, self.qnet, nsteps2)
return qdistance(sample1, sample2, self.qnet, self.qnet)
def __distfunc(self,
x,
y):
'''Compute distance between two samples
Args:
x (list[str]): first sample
y (list[str]): second sample
'''
d=qdistance(x,y,self.qnet,self.qnet)
return d
def polarDistance(self,
i,
return_dict=None):
"""return the distance from a sample to the poles
Args:
i (int): index of sample to take
return_dict (dict): dict used for multiple sample function
Returns:
            list[float]: distances from the sample to each pole
"""
samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
p = samples_as_strings[i]
distances = []
for index, row in self.polar_features[self.cols].iterrows():
row = row.fillna('').values.astype(str)[:]
distances.append(self.distance(p, np.array(row)))
if return_dict is not None:
return_dict[i] = distances
return distances
def polarDistance_multiple(self,
outfile):
"""return the distance from all samples to the poles
Args:
outfile (str): desired output filename and path
"""
if all(x is not None for x in [self.samples, self.cols,
self.polar_features]):
manager = mp.Manager()
return_dict = manager.dict()
processes = []
for i in range(len(self.samples)):
p = mp.Process(target=self.polarDistance, args=(i, return_dict))
processes.append(p)
[x.start() for x in processes]
[x.join() for x in processes]
pole_names = []
for index, row in self.polar_features[self.cols].iterrows():
pole_names.append(index)
            # order results by sample index; manager dict iteration order is not guaranteed
            result = [return_dict[i] for i in sorted(return_dict.keys())]
            pd.DataFrame(result, columns=pole_names).to_csv(outfile)
else:
raise ValueError("load data first!")
return return_dict
def distfunc_line(self,
i,
return_dict=None):
        '''compute one row of the pairwise sample distance matrix
        Args:
            i (int): row index
            return_dict (dict, optional): dict used to collect results
        Returns:
numpy.ndarray(float)
'''
if all(x is not None for x in [self.samples, self.features]):
w = self.samples.index.size
line = np.zeros(w)
y = self.samples_as_strings[i]
for j in range(w):
# only compute half of the distance matrix
if j > i:
x = self.samples_as_strings[j]
line[j] = self.__distfunc(x, y)
else:
raise ValueError("load_data first!")
if return_dict is not None:
return_dict[i] = line
return line
def distfunc_multiples(self,
outfile):
"""compute distance matrix for all samples in the dataset
Args:
outfile (str): desired output filename and path
"""
if all(x is not None for x in [self.samples, self.features]):
manager = mp.Manager()
return_dict = manager.dict()
processes = []
for i in range(len(self.samples)):
p = mp.Process(target=self.distfunc_line, args=(i, return_dict))
processes.append(p)
[x.start() for x in processes]
[x.join() for x in processes]
            # order rows by sample index; manager dict iteration order is not guaranteed
            result = [return_dict[i] for i in sorted(return_dict.keys())]
            result = np.array(result)
            # mirror the computed upper triangle into the lower triangle
            result = pd.DataFrame(np.maximum(result, result.transpose()))
            result.to_csv(outfile)
else:
raise ValueError("load data first!")
return return_dict
def polar_separation(self,
nsteps=0):
"""returns the distance between poles as a qdistance matrix
Args:
            nsteps (int, optional): number of steps to qsample each pole. Defaults to 0.
"""
polar_arraydata = self.polar_features[self.cols].fillna('').values.astype(str)[:]
samples_ = []
for vector in polar_arraydata:
bp = self.getBaseFrequency(vector)
sample = qsample(vector, self.qnet, nsteps, baseline_prob=bp)
samples_.append(sample)
samples_ = np.array(samples_)
self.polar_matrix = qdistance_matrix(samples_, samples_, self.qnet, self.qnet)
return self.polar_matrix
def embed(self,
infile,
name_pref,
out_dir,
pca_model=False,
EMBED_BINARY=None):
'''
embed data
Args:
infile (str): input file to be embedded
name_pref (str): preferred name for output file
out_dir (str): output dir for results
pca_model (bool): whether or not to generate PCA model
EMBED_BINARY (os.path.abspath): path to embed binary
'''
        if self.year is not None:
yr = self.year
PREF = name_pref
FILE = infile
            if EMBED_BINARY is None:
                # pkgutil.get_data returns the file *contents*, not a path;
                # resolve the embed binary packaged with cognet on disk instead
                EMBED = os.path.join(os.path.dirname(__file__), "bin", "__embed__.so")
            else:
                EMBED = EMBED_BINARY
DATAFILE = out_dir + 'data_' +yr
EFILE = out_dir + PREF + '_E_' +yr
DFILE = out_dir + PREF + '_D_' +yr
pd.read_csv(FILE, index_col=0).to_csv(DATAFILE,sep=' ',header=None,index=None)
STR=EMBED+' -f '+DATAFILE+' -E '+EFILE+' -D '+DFILE
subprocess.call(STR,shell=True)
if pca_model:
embed_to_pca(EFILE, EFILE+'_PCA')
elif self.year is None:
raise ValueError("load_data first!")
def __calc_d0(self,
pole_1,
pole_2):
"""calculate distance between two poles
Args:
pole_1 (list[str]): a polar vector, must have same number of features as qnet
pole_2 (list[str]): a polar vector, must have same number of features as qnet
"""
self.pL = self.poles_dict[pole_1]
self.pR = self.poles_dict[pole_2]
self.d0 = qdistance(self.pL, self.pR, self.qnet, self.qnet)
def ideology(self,
i,
return_dict=None,
pole_1=None,
pole_2=None):
"""return ideology index (left-leaning or right-leaning) for a singular sample
Args:
i (int): index of sample
            pole_1 (str, optional): name of the first pole for the base distance. Defaults to None.
            pole_2 (str, optional): name of the second pole for the base distance. Defaults to None.
return_dict (dict, optional): dict containing results
"""
        if pole_1 is not None and pole_2 is not None:
self.__calc_d0(pole_1, pole_2)
p = self.samples_as_strings[i]
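        # signed ideology index: negative leans toward pole_1 (pL), positive
        # toward pole_2 (pR), normalized by the inter-pole distance d0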
dR = qdistance(self.pR, p, self.qnet, self.qnet)
dL = qdistance(self.pL, p, self.qnet, self.qnet)
ideology_index = (dR-dL)/self.d0
if return_dict is not None:
return_dict[i] = [ideology_index, dR, dL, self.d0]
return [ideology_index, dR, dL, self.d0]
def dispersion(self,
i,
return_dict=None):
"""qsamples a sample n times and takes distance matrix
to determine max and std of distances between qsamples
Args:
i (int): index of sample
return_dict (dict): dict containing results
Returns:
list[float]: std and max of the distances btwn qsamples
"""
p = self.samples_as_strings[i]
Qset = [qsample(p, self.qnet, self.steps) for j in np.arange(self.num_qsamples)]
Qset = np.array(Qset)
matrix = (qdistance_matrix(Qset, Qset, self.qnet, self.qnet))
Q = matrix.max()
Qsd = matrix.std()
if return_dict is not None:
return_dict[i] = [Qsd, Q]
return [Qsd, Q]
def compute_DLI_samples(self,
type,
outfile,
num_qsamples=40,
steps=120,
n_jobs=28,
                            pole_1=None,
                            pole_2=None):
"""compute and save ideology index or dispersion for all samples
Args:
num_qsamples (int): number of qsamples to compute
outfile (str): output file for results
type (str): whether to calc dispersion or ideology
steps (int): number of steps to qsample
n_jobs (int, optional): sets the number of jobs for parallelization. Defaults to 28.
            pole_1 (str, optional): name of the first pole for the base distance. Defaults to None.
            pole_2 (str, optional): name of the second pole for the base distance. Defaults to None.
Raises:
ValueError: set poles if poles are not set
ValueError: load data if samples or features are not present
"""
if all(x is not None for x in [self.samples, self.features,
self.pL, self.pR]):
self.num_qsamples = num_qsamples
self.steps = steps
            if pole_1 is not None and pole_2 is not None:
                self.__calc_d0(pole_1, pole_2)
manager = mp.Manager()
return_dict = manager.dict()
processes = []
if type == 'ideology':
for i in range(len(self.samples)):
p = mp.Process(target=self.ideology, args=(i, return_dict))
processes.append(p)
columns=['ideology', 'dR', 'dL', 'd0']
elif type == 'dispersion':
for i in range(len(self.samples)):
p = mp.Process(target=self.dispersion, args=(i, return_dict))
processes.append(p)
columns=['Qsd', 'Qmax']
else:
raise ValueError("Type must be either dispersion or ideology!")
[x.start() for x in processes]
[x.join() for x in processes]
            # order results by sample index; manager dict iteration order is not guaranteed
            result = [return_dict[i] for i in sorted(return_dict.keys())]
            result = pd.DataFrame(result, columns=columns)
            result.to_csv(outfile)
elif self.pL is None or self.pR is None:
raise ValueError("set_poles first!")
else:
raise ValueError("load_data first!")
return result
def compute_polar_indices(self,
                              num_samples=None,
                              polar_comp=False,
                              POLEFILE=None,
                              pole_1=None,
                              pole_2=None,
                              steps=5):
'''set up polar indices for dissonance func
Args:
num_samples (int): subset of samples to take
polar_comp (bool): whether or not to set poles
            POLEFILE (str, optional): file containing pole samples and features
            pole_1 (str, optional): name of the first pole, required if polar_comp is True
            pole_2 (str, optional): name of the second pole, required if polar_comp is True
steps (int): number of steps to qsample
'''
if all(x is not None for x in [self.samples, self.features, self.poles]):
if num_samples is not None:
self.set_nsamples(num_samples)
# read sample data
if polar_comp:
                self.set_poles(POLEFILE, pole_1, pole_2, steps=steps)
polar_features = pd.concat([self.poles, self.features], axis=0)
self.polar_indices=np.where(polar_features[self.cols].fillna('XXXX').values[0]!='XXXX')[0]
elif self.poles is None:
raise ValueError("set_poles first!")
else:
raise ValueError("load_data first!")
def dissonance(self,
sample_index,
return_dict=None,
MISSING_VAL=0.0):
        '''compute dissonance for a single sample index, helper function for dissonance_matrix
Args:
sample_index (int): index of the sample to compute dissonance
return_dict (dict): dict containing results
MISSING_VAL (float): default dissonance value
'''
if all(x is not None for x in [self.samples, self.features]):
s = self.samples_as_strings[sample_index]
if self.polar_indices is None:
self.polar_indices = range(len(s))
Ds=self.qnet.predict_distributions(s)
diss=np.ones(len(Ds))*MISSING_VAL
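            # dissonance of feature i: 1 - P(observed value)/max(P) under the qnet's
            # predicted distribution; values the qnet has never seen get dissonance 1.0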
for i in self.polar_indices:
if s[i] != '':
if s[i] in Ds[i].keys():
diss[i]=1-Ds[i][s[i]]/np.max(
list(Ds[i].values()))
else:
diss[i]=1.0
if return_dict is not None:
return_dict[sample_index] = diss[self.polar_indices]
return diss[self.polar_indices]
else:
raise ValueError("load_data first!")
def dissonance_matrix(self,
output_file='/example_results/DISSONANCE_matrix.csv',
n_jobs=28):
'''get the dissonance for all samples
Args:
output_file (str): directory and/or file for output
            n_jobs (int): number of jobs for parallelization (currently unused)
Returns:
pandas.DataFrame
'''
manager = mp.Manager()
return_dict = manager.dict()
processes = []
for i in range(len(self.samples)):
p = mp.Process(target=self.dissonance, args=(i, return_dict))
processes.append(p)
[x.start() for x in processes]
[x.join() for x in processes]
        # order results by sample index; manager dict iteration order is not guaranteed
        result = [return_dict[i] for i in sorted(return_dict.keys())]
        if self.polar_indices is not None and self.poles is not None:
            polar_features = pd.concat([self.poles, self.features], axis=0)
            cols = polar_features[self.cols].dropna(axis=1).columns
        else:
            cols = self.cols
        pd.DataFrame(result, columns=cols).to_csv(output_file)
self.dissonance_file = output_file
return pd.DataFrame(return_dict.copy())
def __choose_one(self,
X):
'''returns a random element of X
Args:
X (1D array-like): vector from which random element is to be chosen
'''
X=list(X)
if len(X)>0:
return X[np.random.randint(len(X))]
return None
def getMaskedSample(self,
s,
mask_prob=0.5,
allow_all_mutable=False):
'''inputs a sample and randomly mask elements of the sample
Args:
s (list[str]): vector of sample, must have the same num of features as the qnet
mask_prob (float): float btwn 0 and 1, prob to mask element of sample
allow_all_mutable (bool): whether or not all variables are mutable
'''
if self.samples is not None:
MUTABLE=pd.DataFrame(np.zeros(len(self.cols)),index=self.cols).transpose()
            WITHVAL=[x for x in np.array(self.cols)[np.where(s)[0]] if x in self.mutable_vars]
MASKrand=[x for x in WITHVAL if random.random() < mask_prob ]
for m in MASKrand:
MUTABLE[m]=1.0
mutable_x=MUTABLE.values[0]
base_frequency=mutable_x/mutable_x.sum()
s1=s.copy()
for i in range(len(base_frequency)):
if base_frequency[i]>0.0001:
s1[i]=''
s_rand=np.copy(s)
rnd_match_prob=[]
max_match_prob=[]
D=self.qnet.predict_distributions(s)
for i in MASKrand:
s_rand[np.where(
self.cols==i)[0][0]]=self.__choose_one(
self.D_null[np.where(self.cols==i)[0][0]].keys())
rnd_match_prob=np.append(rnd_match_prob,1/len(
self.D_null[np.where(self.cols==i)[0][0]].keys()))
max_match_prob=np.append(
max_match_prob,np.max(
list(D[np.where(
self.cols==i)[0][0]].values())))
if allow_all_mutable:
                for m in self.mutable_vars:
MUTABLE[m]=1.0
mutable_x=MUTABLE.values[0]
base_frequency=mutable_x/mutable_x.sum()
return s1,base_frequency,MASKrand,np.where(
base_frequency)[0],np.mean(rnd_match_prob),np.mean(max_match_prob),s_rand
else:
raise ValueError("load_data first!")
def randomMaskReconstruction(self,
index=None,
return_dict=None,
sample=None):
"""reconstruct the masked sample by qsampling and comparing to original
set self.mask_prob and self.steps if needed
Args:
return_dict (dict): dict containing results. Defaults to None.
sample (list[str], optional): sample vector, must have the same num of features as the qnet. Defaults to None.
index (int): index of sample to take. Defaults to None.
Raises:
ValueError: Neither sample or index were given
ValueError: Both sample and index were given
Returns:
            tuple: reconstruction error (%), random match probability, and max match
                probability; when called directly it also returns the original,
                qsampled, and random samples plus the mask
"""
if all(x is None for x in [sample, index]):
raise ValueError("Must input either sample or index!")
elif all(x is not None for x in [sample, index]):
raise ValueError("Must input either sample or index not both!")
elif sample is not None:
            s = pd.DataFrame([sample]).fillna('').values.astype(str)[0]
elif index is not None:
s=self.samples_as_strings[index]
        s1, bp, mask_, maskindex, rmatch_u, rmatch, s_rand = self.getMaskedSample(
            s, mask_prob=self.mask_prob)
        if np.isnan(bp).any():
            if return_dict is not None:
                return_dict[index] = np.nan, np.nan, np.nan
            return np.nan, np.nan, np.nan
        qs = qsample(s1, self.qnet, self.steps, bp)
        dqestim = qdistance(s, qs, self.qnet, self.qnet)
        dactual = qdistance(s, s1, self.qnet, self.qnet)
        # comparison frame of the original (s), qsampled (q), and random (r) values
        # over the masked features; assumes the examples_results/CMPF_2018/ directory exists
        cmpf = pd.DataFrame([s, qs, s_rand], columns=self.cols, index=['s', 'q', 'r'])[mask_].transpose()
        cmpf.index.name = 'gssvar'
        cmpf.to_csv('examples_results/CMPF_2018/CMPF-' + str(index) + '.csv')
        if return_dict is not None:
            return_dict[index] = (1 - (dqestim / dactual)) * 100, rmatch_u, rmatch
        return (1 - (dqestim / dactual)) * 100, rmatch_u, rmatch, s, qs, s_rand, mask_
def randomMaskReconstruction_multiple(self,
out_file):
'''runs and saves the results of the predicted masked sample
Args:
            out_file (str): directory and/or file for output
'''
manager = mp.Manager()
return_dict = manager.dict()
processes = []
for i in range(len(self.samples)):
p = mp.Process(target=self.randomMaskReconstruction, args=(i, return_dict))
processes.append(p)
[x.start() for x in processes]
[x.join() for x in processes]
        result = [x for x in return_dict.values() if isinstance(x, tuple)]
        result = pd.DataFrame(result, columns=['rederr', 'r_prob', 'rand_err'])
        result.rederr = result.rederr.astype(float)
        result.to_csv(out_file)
return result.rederr.mean(), result.rand_err.mean()
def dmat_filewriter(self,
pyfile,
QNETPATH,
MPI_SETUP_FILE="mpi_setup.sh",
MPI_RUN_FILE="mpi_run.sh",
MPI_LAUNCHER_FILE="mpi_launcher.sh",
YEARS='2016',
NODES=4,
T=12,
num_samples=None,
OUTFILE='tmp_distmatrix.csv'):
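        """write a standalone python script plus MPI setup/run shell scripts
        that compute the full sample-to-sample qdistance matrix on a cluster
        Args:
            pyfile (str): name of the python script to generate
            QNETPATH (str): path to the saved qnet
            MPI_SETUP_FILE (str, optional): name of the generated MPI setup script
            MPI_RUN_FILE (str, optional): name of the generated MPI run script
            MPI_LAUNCHER_FILE (str, optional): launcher script name (currently unused)
            YEARS (str, optional): year tag(s) passed to the run script
            NODES (int, optional): number of nodes to request
            T (int, optional): time to request
            num_samples (int, optional): subset of samples to use
            OUTFILE (str, optional): output file for the distance matrix
        """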
if all(x is not None for x in [self.poles_dict,self.features,
self.qnet, self.cols]):
if num_samples is not None:
self.set_nsamples(num_samples)
            tmp_path = "mpi_tmp/"
            # create the working directory before writing into it
            if not os.path.exists(tmp_path):
                os.makedirs(tmp_path)
            pd.DataFrame(self.samples_as_strings).to_csv(
                tmp_path + "tmp_samples_as_strings.csv", header=None, index=None)
            w = self.samples.index.size
with open(tmp_path+pyfile, 'w+') as f:
f.writelines(["from mpi4py.futures import MPIPoolExecutor\n",
"import numpy as np\n",
"import pandas as pd\n",
"from quasinet.qnet import Qnet, qdistance, load_qnet, qdistance_matrix\n",
"from quasinet.qsampling import qsample, targeted_qsample\n\n",
"qnet=load_qnet(\'{}\')\n".format(QNETPATH)])
f.writelines(["w = {}\n".format(w),
"h = w\n",
"p_all = pd.read_csv(\"tmp_samples_as_strings.csv\")\n\n"])
f.writelines(["def distfunc(x,y):\n",
"\td=qdistance(x,y,qnet,qnet)\n",
"\treturn d\n\n"])
f.writelines(["def dfunc_line(k):\n",
"\tline = np.zeros(w)\n",
"\ty = np.array(p_all.iloc[k])\n",
"\tfor j in range(w):\n",
"\t\tif j > k:\n",
"\t\t\tx = np.array(p_all.iloc[j])\n",
"\t\t\tline[j] = distfunc(x, y)\n",
"\treturn line\n\n"])
f.writelines(["if __name__ == '__main__':\n",
"\twith MPIPoolExecutor() as executor:\n",
"\t\tresult = executor.map(dfunc_line, range(h))\n",
"\t\tpd.DataFrame(result).to_csv(\'{}\',index=None,header=None)".format(OUTFILE)])
with open(tmp_path+MPI_SETUP_FILE, 'w+') as ms:
ms.writelines(["#!/bin/bash\n",
"YEAR=$1\n\n",
"if [ $# -gt 1 ] ; then\n",
"\tNODES=$2\n",
"else\n",
"\tNODES=3\n",
"fi\n",
"if [ $# -gt 2 ] ; then\n",
"\tNUM=$3\n",
"else\n",
"\tNUM='all'\n",
"fi\n",
"if [ $# -gt 3 ] ; then\n",
"\tPROG=$4\n",
"else\n",
"\tPROG=$(tty)\n",
"fi\n\n",
"NUMPROC=`expr 28 \* $NODES`\n",
"echo \"module load midway2\" >> $PROG\n",
"echo \"module unload python\" >> $PROG\n",
"echo \"module unload openmpi\" >> $PROG\n",
"echo \"module load python/anaconda-2020.02\" >> $PROG\n",
"echo \"module load mpi4py\" >> $PROG\n",
"echo \"date; mpiexec -n \"$NUMPROC\" python3 -m mpi4py.futures {}; date\" >> $PROG\n".format(pyfile),
])
with open(tmp_path+MPI_RUN_FILE, 'w+') as mr:
mr.writelines(["#!/bin/bash\n",
"YEARS=\'{}\'\n".format(YEARS),
"# nodes requested\n",
"NODES={}\n".format(NODES),
"# time requested\n",
"T={}\n".format(T),
"NUM=\'all\'\n",
"LAUNCH=\'../mpi_launcher.sh\'\n\n",
"for yr in `echo $YEARS`\n",
"do\n",
"\techo $yr\n",
"\t./{} $yr $NODES $NUM tmp_\"$yr\"\n".format(MPI_SETUP_FILE),
"\t$LAUNCH -P tmp_\"$yr\" -F -T $T -N \"$NODES\" -C 28 -p broadwl -J ACRDALL_\"$yr\" -M 56\n",
"done\n",
"rm tmp*\n"])
else:
raise ValueError("load data first!")
print("running")
Classes

class cognet

Aggregate related Qnet functions.
Methods
def compute_DLI_samples(self, type, outfile, num_qsamples=40, steps=120, n_jobs=28, pole_1=None, pole_2=None)

Compute and save the ideology index or dispersion for all samples.

Args
    num_qsamples : int
        number of qsamples to compute
    outfile : str
        output file for results
    type : str
        whether to compute dispersion or ideology
    steps : int
        number of steps to qsample
    n_jobs : int, optional
        number of jobs for parallelization. Defaults to 28.
    pole_1 : str, optional
        name of the first pole for the base distance. Defaults to None.
    pole_2 : str, optional
        name of the second pole for the base distance. Defaults to None.

Raises
    ValueError
        if the poles have not been set
    ValueError
        if samples or features have not been loaded
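A usage sketch, assuming data and a qnet were loaded first; file names and pole names are placeholders:

c.set_poles('poles.csv', 'POLE_L', 'POLE_R')
ideology = c.compute_DLI_samples('ideology', 'ideology_index.csv')
dispersion = c.compute_DLI_samples('dispersion', 'dispersion.csv', num_qsamples=40, steps=120)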
def compute_polar_indices(self, num_samples=None, polar_comp=False, POLEFILE=None, steps=5)
-
set up polar indices for the dissonance function
Args
    num_samples (int): subset of samples to take
    polar_comp (bool): whether or not to set poles
    POLEFILE (str, optional): file containing pole samples and features. Defaults to None.
    steps (int): number of steps to qsample
Expand source code
def compute_polar_indices(self, num_samples=None, polar_comp=False, POLEFILE=None, steps=5):
    '''set up polar indices for the dissonance function

    Args:
        num_samples (int): subset of samples to take
        polar_comp (bool): whether or not to set poles
        POLEFILE (str, optional): file containing pole samples and features
        steps (int): number of steps to qsample
    '''
    if all(x is not None for x in [self.samples, self.features, self.poles]):
        if num_samples is not None:
            self.set_nsamples(num_samples)
        if polar_comp:
            # note: set_poles expects (POLEFILE, pole_1, pole_2, steps);
            # this call passes (self.qnet, steps, POLEFILE), which does not
            # match that signature and needs the pole column names to fix
            self.set_poles(self.qnet, steps, POLEFILE)
        polar_features = pd.concat([self.poles, self.features], axis=0)
        self.polar_indices = np.where(
            polar_features[self.cols].fillna('XXXX').values[0] != 'XXXX')[0]
    elif self.poles is None:
        raise ValueError("set_poles first!")
    else:
        raise ValueError("load_data first!")
def dispersion(self, i, return_dict=None)
-
qsamples a sample num_qsamples times, then takes the pairwise distance matrix to determine the max and std of distances between the qsamples
Args
    i (int): index of sample
    return_dict (dict): dict containing results
Returns
    list[float]: std and max of the distances between qsamples
Expand source code
def dispersion(self, i, return_dict=None): """qsamples a sample n times and takes distance matrix to determine max and std of distances between qsamples Args: i (int): index of sample return_dict (dict): dict containing results Returns: list[float]: std and max of the distances btwn qsamples """ p = self.samples_as_strings[i] Qset = [qsample(p, self.qnet, self.steps) for j in np.arange(self.num_qsamples)] Qset = np.array(Qset) matrix = (qdistance_matrix(Qset, Qset, self.qnet, self.qnet)) Q = matrix.max() Qsd = matrix.std() if return_dict is not None: return_dict[i] = [Qsd, Q] return [Qsd, Q]
def dissonance(self, sample_index, return_dict=None, MISSING_VAL=0.0)
-
compute dissonance for a single sample_index; helper function for dissonance_matrix
Args
    sample_index (int): index of the sample to compute dissonance for
    return_dict (dict): dict containing results
    MISSING_VAL (float): default dissonance value
Expand source code
def dissonance(self, sample_index, return_dict=None, MISSING_VAL=0.0):
    '''compute dissonance for a single sample_index; helper function for dissonance_matrix

    Args:
        sample_index (int): index of the sample to compute dissonance for
        return_dict (dict): dict containing results
        MISSING_VAL (float): default dissonance value
    '''
    if all(x is not None for x in [self.samples, self.features]):
        s = self.samples_as_strings[sample_index]
        if self.polar_indices is None:
            self.polar_indices = range(len(s))
        Ds = self.qnet.predict_distributions(s)
        diss = np.ones(len(Ds)) * MISSING_VAL
        for i in self.polar_indices:
            if s[i] != '':
                if s[i] in Ds[i].keys():
                    diss[i] = 1 - Ds[i][s[i]] / np.max(list(Ds[i].values()))
                else:
                    # observed value never predicted: maximal dissonance
                    diss[i] = 1.0
        if return_dict is not None:
            return_dict[sample_index] = diss[self.polar_indices]
        return diss[self.polar_indices]
    else:
        raise ValueError("load_data first!")
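The per-feature rule above is diss_i = 1 - P(observed)/max(P); a self-contained sketch with an assumed predicted distribution for one feature:

# assumed predicted distribution for one feature, shaped like the per-index
# dicts returned by quasinet's predict_distributions
D_i = {'agree': 0.6, 'neutral': 0.3, 'disagree': 0.1}
observed = 'neutral'
diss_i = 1 - D_i[observed] / max(D_i.values())  # 1 - 0.3/0.6 = 0.5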
def dissonance_matrix(self, output_file='/example_results/DISSONANCE_matrix.csv', n_jobs=28)
-
get the dissonance for all samples
Args
    output_file (str): directory and/or file for output
    n_jobs (int): number of jobs for parallelization
Returns
    pandas.DataFrame
Expand source code
def dissonance_matrix(self, output_file='/example_results/DISSONANCE_matrix.csv', n_jobs=28):
    '''get the dissonance for all samples

    Args:
        output_file (str): directory and/or file for output
        n_jobs (int): number of jobs for parallelization

    Returns:
        pandas.DataFrame
    '''
    manager = mp.Manager()
    return_dict = manager.dict()
    processes = []
    for i in range(len(self.samples)):
        p = mp.Process(target=self.dissonance, args=(i, return_dict))
        processes.append(p)
    [x.start() for x in processes]
    [x.join() for x in processes]
    result = [x for x in return_dict.values()]
    if self.polar_indices is not None:
        polar_features = pd.concat([self.poles, self.features], axis=0)
        cols = polar_features[self.cols].dropna(axis=1).columns
    else:
        cols = self.cols
    pd.DataFrame(result, columns=cols).to_csv(output_file)
    self.dissonance_file = output_file
    return pd.DataFrame(return_dict.copy())
def distance(self, sample1, sample2, nsteps1=0, nsteps2=0)
-
qsamples each sample a set number of steps, then takes the qdistance between them
Args
    sample1 (list[str]): sample vector 1, must have the same number of features as the qnet
    sample2 (list[str]): sample vector 2, must have the same number of features as the qnet
    nsteps1 (int, optional): number of steps to qsample for sample1
    nsteps2 (int, optional): number of steps to qsample for sample2
Returns
    float: qdistance
Expand source code
def distance(self, sample1, sample2, nsteps1=0, nsteps2=0): """qsamples each sample set num of steps, then takes qdistance Args: sample1 (list[str]): sample vector 1, must have the same num of features as the qnet sample2 (list[str]): sample vector 2, must have the same num of features as the qnet nsteps1 (int, optional): number of steps to qsample for sample1 nsteps2 (int, optional): number of steps to qsample for sample2 Returns: float: qdistance """ if self.qnet is None: raise ValueError("load qnet first!") bp1 = self.getBaseFrequency(sample1) bp2 = self.getBaseFrequency(sample2) sample1 = qsample(sample1, self.qnet, nsteps1)#, baseline_prob=bp1) sample2 = qsample(sample2, self.qnet, nsteps2)#, baseline_prob=bp2) return qdistance(sample1, sample2, self.qnet, self.qnet)
def distfunc_line(self, i, return_dict=None)
-
compute the distances for one row, i.e. one sample against all later samples
Args
    i (int): row index
    return_dict (dict, optional): dict containing results
Returns
    numpy.ndarray(float)
Expand source code
def distfunc_line(self, i, return_dict=None):
    '''compute the distances for one row, i.e. one sample against all later samples

    Args:
        i (int): row index
        return_dict (dict, optional): dict containing results

    Returns:
        numpy.ndarray(float)
    '''
    if all(x is not None for x in [self.samples, self.features]):
        w = self.samples.index.size
        line = np.zeros(w)
        y = self.samples_as_strings[i]
        for j in range(w):
            # only compute the upper triangle of the distance matrix
            if j > i:
                x = self.samples_as_strings[j]
                line[j] = self.__distfunc(x, y)
    else:
        raise ValueError("load_data first!")
    if return_dict is not None:
        return_dict[i] = line
    return line
def distfunc_multiples(self, outfile)
-
compute the distance matrix for all samples in the dataset
Args
    outfile (str): desired output filename and path
Expand source code
def distfunc_multiples(self, outfile): """compute distance matrix for all samples in the dataset Args: outfile (str): desired output filename and path """ if all(x is not None for x in [self.samples, self.features]): manager = mp.Manager() return_dict = manager.dict() processes = [] for i in range(len(self.samples)): p = mp.Process(target=self.distfunc_line, args=(i, return_dict)) processes.append(p) [x.start() for x in processes] [x.join() for x in processes] result=[x for x in return_dict.values()] columns = [i for i in range(len(self.samples))] result=pd.DataFrame(result,columns=columns, index=columns).sort_index(ascending=False) result = result.to_numpy() result = pd.DataFrame(np.maximum(result, result.transpose())) result.to_csv(outfile) else: raise ValueError("load data first!") return return_dict
def dmat_filewriter(self, pyfile, QNETPATH, MPI_SETUP_FILE='mpi_setup.sh', MPI_RUN_FILE='mpi_run.sh', MPI_LAUNCHER_FILE='mpi_launcher.sh', YEARS='2016', NODES=4, T=12, num_samples=None, OUTFILE='tmp_distmatrix.csv')
-
write a standalone MPI worker script (pyfile) plus setup and run shell scripts under mpi_tmp/ for computing the full sample distance matrix on a cluster
Expand source code
def dmat_filewriter(self, pyfile, QNETPATH, MPI_SETUP_FILE="mpi_setup.sh", MPI_RUN_FILE="mpi_run.sh", MPI_LAUNCHER_FILE="mpi_launcher.sh", YEARS='2016', NODES=4, T=12, num_samples=None, OUTFILE='tmp_distmatrix.csv'): if all(x is not None for x in [self.poles_dict,self.features, self.qnet, self.cols]): if num_samples is not None: self.set_nsamples(num_samples) tmp_path = "mpi_tmp/" pd.DataFrame(self.samples_as_strings).to_csv(tmp_path+"tmp_samples_as_strings.csv", header=None, index=None) w = self.samples.index.size if not os.path.exists(tmp_path): os.makedirs(tmp_path) with open(tmp_path+pyfile, 'w+') as f: f.writelines(["from mpi4py.futures import MPIPoolExecutor\n", "import numpy as np\n", "import pandas as pd\n", "from quasinet.qnet import Qnet, qdistance, load_qnet, qdistance_matrix\n", "from quasinet.qsampling import qsample, targeted_qsample\n\n", "qnet=load_qnet(\'{}\')\n".format(QNETPATH)]) f.writelines(["w = {}\n".format(w), "h = w\n", "p_all = pd.read_csv(\"tmp_samples_as_strings.csv\")\n\n"]) f.writelines(["def distfunc(x,y):\n", "\td=qdistance(x,y,qnet,qnet)\n", "\treturn d\n\n"]) f.writelines(["def dfunc_line(k):\n", "\tline = np.zeros(w)\n", "\ty = np.array(p_all.iloc[k])\n", "\tfor j in range(w):\n", "\t\tif j > k:\n", "\t\t\tx = np.array(p_all.iloc[j])\n", "\t\t\tline[j] = distfunc(x, y)\n", "\treturn line\n\n"]) f.writelines(["if __name__ == '__main__':\n", "\twith MPIPoolExecutor() as executor:\n", "\t\tresult = executor.map(dfunc_line, range(h))\n", "\t\tpd.DataFrame(result).to_csv(\'{}\',index=None,header=None)".format(OUTFILE)]) # with open(MPI_LAUNCHER_FILE, 'wx') as ml: # ml.writelines(["#!/bin/bash\n\n", # "PROG=\"\"\n", # "RSTR=`cat /dev/urandom | tr -dc \'a-zA-Z0-9\' | fold -w 20 | head -n 1 | cut -c 1-6`\n", # "JOBN=IXC\"$RSTR\"\n", # "TIME=10\n", # "NCOR=28\n", # "MEMR=10\n", # "NODE=1\n", # "EXECUTE=0\n", # "DEPEND=""\n", # "CDIR=`pwd`\n", # "DRY_RUN=0\n", # "ANYDEPEND=""\n", # "PART='sandyb'\n" # ]) with open(tmp_path+MPI_SETUP_FILE, 'w+') as ms: ms.writelines(["#!/bin/bash\n", "YEAR=$1\n\n", "if [ $# -gt 1 ] ; then\n", "\tNODES=$2\n", "else\n", "\tNODES=3\n", "fi\n", "if [ $# -gt 2 ] ; then\n", "\tNUM=$3\n", "else\n", "\tNUM='all'\n", "fi\n", "if [ $# -gt 3 ] ; then\n", "\tPROG=$4\n", "else\n", "\tPROG=$(tty)\n", "fi\n\n", "NUMPROC=`expr 28 \* $NODES`\n", "echo \"module load midway2\" >> $PROG\n", "echo \"module unload python\" >> $PROG\n", "echo \"module unload openmpi\" >> $PROG\n", "echo \"module load python/anaconda-2020.02\" >> $PROG\n", "echo \"module load mpi4py\" >> $PROG\n", "echo \"date; mpiexec -n \"$NUMPROC\" python3 -m mpi4py.futures {}; date\" >> $PROG\n".format(pyfile), ]) with open(tmp_path+MPI_RUN_FILE, 'w+') as mr: mr.writelines(["#!/bin/bash\n", "YEARS=\'{}\'\n".format(YEARS), "# nodes requested\n", "NODES={}\n".format(NODES), "# time requested\n", "T={}\n".format(T), "NUM=\'all\'\n", "LAUNCH=\'../mpi_launcher.sh\'\n\n", "for yr in `echo $YEARS`\n", "do\n", "\techo $yr\n", "\t./{} $yr $NODES $NUM tmp_\"$yr\"\n".format(MPI_SETUP_FILE), "\t$LAUNCH -P tmp_\"$yr\" -F -T $T -N \"$NODES\" -C 28 -p broadwl -J ACRDALL_\"$yr\" -M 56\n", "done\n", "rm tmp*\n"]) else: raise ValueError("load data first!") print("running")
def embed(self, infile, name_pref, out_dir, pca_model=False, EMBED_BINARY=None)
-
embed data
Args
    infile (str): input file to be embedded
    name_pref (str): preferred name for output file
    out_dir (str): output dir for results
    pca_model (bool): whether or not to generate a PCA model
    EMBED_BINARY (os.path.abspath): path to embed binary
Expand source code
def embed(self, infile, name_pref, out_dir, pca_model=False, EMBED_BINARY=None):
    '''embed data

    Args:
        infile (str): input file to be embedded
        name_pref (str): preferred name for output file
        out_dir (str): output dir for results
        pca_model (bool): whether or not to generate a PCA model
        EMBED_BINARY (os.path.abspath): path to embed binary
    '''
    if all(x is not None for x in [self.year]):
        yr = self.year
        PREF = name_pref
        FILE = infile
        if EMBED_BINARY is None:
            # note: pkgutil.get_data returns the file's bytes, not a path;
            # the shell command below needs a filesystem path to the binary
            EMBED = pkgutil.get_data("cognet.bin", "__embed__.so")
        else:
            EMBED = EMBED_BINARY
        DATAFILE = out_dir + 'data_' + yr
        EFILE = out_dir + PREF + '_E_' + yr
        DFILE = out_dir + PREF + '_D_' + yr
        pd.read_csv(FILE, index_col=0).to_csv(DATAFILE, sep=' ', header=None, index=None)
        STR = EMBED + ' -f ' + DATAFILE + ' -E ' + EFILE + ' -D ' + DFILE
        subprocess.call(STR, shell=True)
        if pca_model:
            embed_to_pca(EFILE, EFILE + '_PCA')
    elif self.year is None:
        raise ValueError("load_data first!")
def getBaseFrequency(self, sample)
-
get the base frequency of the variables; helper function for qsampling
Args
    sample (list[str]): sample vector, must have the same number of features as the qnet
Expand source code
def getBaseFrequency(self, sample):
    '''get the base frequency of the variables; helper function for qsampling

    Args:
        sample (list[str]): sample vector, must have the same number of features as the qnet
    '''
    MUTABLE = pd.DataFrame(np.zeros(len(self.cols)), index=self.cols).transpose()
    for m in self.mutable_vars:
        MUTABLE[m] = 1.0
    mutable_x = MUTABLE.values[0]
    base_frequency = mutable_x / mutable_x.sum()
    # weight each mutable column by its variation weight
    # (entropy of the column's null distribution)
    for i in range(len(base_frequency)):
        if base_frequency[i] > 0.0:
            base_frequency[i] = self.variation_weight[i] * base_frequency[i]
    return base_frequency / base_frequency.sum()
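Numerically, getBaseFrequency builds a uniform mask over the mutable columns, scales it by each column's variation weight, and renormalizes; a toy sketch with assumed weights:

import numpy as np

mutable_mask = np.array([1.0, 0.0, 1.0, 1.0])      # columns 0, 2, 3 assumed mutable
variation_weight = np.array([0.9, 0.5, 0.3, 0.6])  # assumed null-distribution entropies
base = mutable_mask / mutable_mask.sum()           # uniform over mutable columns
base = variation_weight * base                     # damp low-variation columns
base = base / base.sum()                           # final qsampling probabilities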
def getMaskedSample(self, s, mask_prob=0.5, allow_all_mutable=False)
-
take a sample and randomly mask elements of the sample
Args
    s (list[str]): sample vector, must have the same number of features as the qnet
    mask_prob (float): probability, between 0 and 1, of masking an element of the sample
    allow_all_mutable (bool): whether or not all variables are mutable
Expand source code
def getMaskedSample(self, s, mask_prob=0.5, allow_all_mutable=False):
    '''take a sample and randomly mask elements of the sample

    Args:
        s (list[str]): sample vector, must have the same number of features as the qnet
        mask_prob (float): probability, between 0 and 1, of masking an element of the sample
        allow_all_mutable (bool): whether or not all variables are mutable
    '''
    if self.samples is not None:
        MUTABLE = pd.DataFrame(np.zeros(len(self.cols)), index=self.cols).transpose()
        # mutable columns that actually have a value in this sample
        WITHVAL = [x for x in self.cols[np.where(s)[0]] if x in self.mutable_vars]
        # mask each such column with probability mask_prob
        MASKrand = [x for x in WITHVAL if random.random() < mask_prob]
        for m in MASKrand:
            MUTABLE[m] = 1.0
        mutable_x = MUTABLE.values[0]
        base_frequency = mutable_x / mutable_x.sum()
        # if np.isnan(base_frequency).any():
        #     return np.nan, np.nan, np.nan
        #     return self.getMaskedSample(s)
        s1 = s.copy()
        for i in range(len(base_frequency)):
            if base_frequency[i] > 0.0001:
                s1[i] = ''
        s_rand = np.copy(s)
        rnd_match_prob = []
        max_match_prob = []
        D = self.qnet.predict_distributions(s)
        for i in MASKrand:
            s_rand[np.where(self.cols == i)[0][0]] = self.__choose_one(
                self.D_null[np.where(self.cols == i)[0][0]].keys())
            rnd_match_prob = np.append(rnd_match_prob, 1 / len(
                self.D_null[np.where(self.cols == i)[0][0]].keys()))
            max_match_prob = np.append(max_match_prob, np.max(
                list(D[np.where(self.cols == i)[0][0]].values())))
        if allow_all_mutable:
            # fix: the original iterated an undefined name `mutable_vars`
            for m in self.mutable_vars:
                MUTABLE[m] = 1.0
            mutable_x = MUTABLE.values[0]
            base_frequency = mutable_x / mutable_x.sum()
        return s1, base_frequency, MASKrand, np.where(base_frequency)[0], \
            np.mean(rnd_match_prob), np.mean(max_match_prob), s_rand
    else:
        raise ValueError("load_data first!")
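The core of the masking step is simple: every mutable feature that has a value is blanked with probability mask_prob. A standalone sketch of just that rule, with invented names:

import random

s = ['yes', '', 'no', 'maybe']  # assumed sample; '' is already missing
mutable_idx = {0, 2, 3}         # indices assumed mutable
mask_prob = 0.5
masked = ['' if i in mutable_idx and v != '' and random.random() < mask_prob
          else v
          for i, v in enumerate(s)]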
def ideology(self, i, return_dict=None, pole_1=None, pole_2=None)
-
return the ideology index (left- or right-leaning) for a single sample
Args
    i (int): index of sample
    pole_1 (int, optional): index of the first pole used as the base distance. Defaults to None.
    pole_2 (int, optional): index of the second pole used as the base distance. Defaults to None.
    return_dict (dict, optional): dict containing results
Expand source code
def ideology(self, i, return_dict=None, pole_1=None, pole_2=None): """return ideology index (left-leaning or right-leaning) for a singular sample Args: i (int): index of sample pole_1 (int): index of Pole One to calc as base distance. Defaults to 0. pole_2 (int): index of Pole Two to calc as base distance. Defaults to 1. return_dict (dict, optional): dict containing results """ if pole_1 is not None or pole_2 is not None: self.__calc_d0(pole_1, pole_2) p = self.samples_as_strings[i] dR = qdistance(self.pR, p, self.qnet, self.qnet) dL = qdistance(self.pL, p, self.qnet, self.qnet) ideology_index = (dR-dL)/self.d0 if return_dict is not None: return_dict[i] = [ideology_index, dR, dL, self.d0] return [ideology_index, dR, dL, self.d0]
def load_data(self, year, features_by_year, samples, qnet)
-
load cols, features, samples, and qnet.
Args
    year (str): to identify cols/features.
    features_by_year (str): file containing all features by year of the dataset.
    samples (str): file of samples for that year.
    qnet (str): Qnet file location.
Expand source code
def load_data(self, year, features_by_year, samples, qnet):
    '''load cols, features, samples, and qnet.

    Args:
        year (str): to identify cols/features.
        features_by_year (str): file containing all features by year of the dataset.
        samples (str): file of samples for that year.
        qnet (str): Qnet file location.
    '''
    self.qnet = load_qnet(qnet)
    self.year = year
    # each row of features_by_year holds a string-encoded list of feature names
    self.cols = np.array((pd.read_csv(features_by_year,
                                      keep_default_na=True,
                                      index_col=0).set_index('year')).loc[int(year)].apply(eval).values[0])
    self.features = pd.DataFrame(columns=self.cols)
    self.mutable_vars = [x for x in self.cols]
    self.samples = pd.read_csv(samples)
    self.samples = pd.concat([self.samples, self.features], axis=0)
    self.all_samples = self.samples
    self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
    self.s_null = [''] * len(self.samples_as_strings[0])
    self.D_null = self.qnet.predict_distributions(self.s_null)
    # variation weight per feature: entropy of its null distribution
    variation_weight = []
    for d in self.D_null:
        v = []
        for val in d.values():
            v = np.append(v, val)
        variation_weight.append(entropy(v, base=len(v)))
    self.variation_weight = variation_weight
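load_data expects features_by_year to be a CSV with a year column and one column whose cells are string-encoded lists of feature names (hence the eval call above); a hypothetical way to build such a file:

import pandas as pd

# hypothetical features-by-year table matching the eval() parsing in load_data
df = pd.DataFrame({'year': [2016, 2018],
                   'features': [str(['q1', 'q2', 'q3']),
                                str(['q1', 'q2', 'q4'])]})
df.to_csv('features_by_year.csv')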
def load_from_dataformatter(self, data_obj, key)
-
read in either train or test data, specified by key, from a dataformatter object
Args
    data_obj (class): instance of dataformatter class
    key (str): 'all', 'train', or 'test', corresponding to sample type
Expand source code
def load_from_dataformatter(self, data_obj, key): """read in either train or test data, specified by key, from data obj Args: data_obj (class): instance of dataformatter class key (str): 'all', 'train', or 'test', corresponding to sample type """ featurenames, samples = data_obj.format_samples(key) if any(x is not None for x in [self.features, self.samples]): print("replacing original features/samples with dataformatter data") self.cols = featurenames self.features = pd.DataFrame(columns=self.cols) self.samples = pd.DataFrame(samples,columns=self.features) self.all_samples = self.samples self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:] self.s_null=['']*len(self.samples_as_strings[0]) return featurenames, samples
def load_from_model(self, model, data_obj, key, im_vars=None, m_vars=None)
-
load parameters from model object
Args
    model (Class): model obj for loading parameters
    data_obj (class): instance of dataformatter class
    key (str): 'all', 'train', or 'test', corresponding to sample type
    im_vars (list[str], optional): Not implemented yet. Defaults to None.
    m_vars (list[str], optional): Not implemented yet. Defaults to None.
Expand source code
def load_from_model(self, model, data_obj, key, im_vars=None, m_vars=None): """load parameters from model object Args: model (Class): model obj for loading parameters data_obj (class): instance of dataformatter class key (str): 'all', 'train', or 'test', corresponding to sample type im_vars (list[str], optional): Not implemented yet. Defaults to None. m_vars (list[str], optional): Not implemented yet. Defaults to None. """ if model is not None: self.qnet = model.myQnet # self.cols = np.array(model.features) featurenames, samples = data_obj.format_samples(key) samples = pd.DataFrame(samples) self.cols = featurenames self.features = pd.DataFrame(columns=np.array(featurenames)) if any(x is not None for x in [model.immutable_vars, model.mutable_vars]): if model.immutable_vars is not None: self.immutable_vars = model.immutable_vars self.mutable_vars = [x for x in self.features if x not in self.immutable_vars] elif model.mutable_vars is not None: self.mutable_vars = model.mutable_vars self.immutable_vars = [x for x in self.features if x not in self.mutable_vars] else: self.mutable_vars = self.features self.samples = pd.DataFrame(samples) self.samples.columns = np.array(featurenames) self.all_samples = self.samples self.samples_as_strings = self.samples[featurenames].fillna('').values.astype(str)[:] self.s_null=['']*len(self.samples_as_strings[0]) self.D_null=self.qnet.predict_distributions(self.s_null) variation_weight = [] for d in self.D_null: v=[] for val in d.values(): v=np.append(v,val) variation_weight.append(entropy(v,base=len(v))) self.variation_weight = variation_weight
def polarDistance(self, i, return_dict=None)
-
return the distance from a sample to each of the poles
Args
    i (int): index of the sample to take
    return_dict (dict): dict used by the multiple-sample function
Returns
    list[float]: distances from the sample to each pole
Expand source code
def polarDistance(self, i, return_dict=None): """return the distance from a sample to the poles Args: i (int): index of sample to take return_dict (dict): dict used for multiple sample function Returns: [type]: [description] """ samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:] p = samples_as_strings[i] distances = [] for index, row in self.polar_features[self.cols].iterrows(): row = row.fillna('').values.astype(str)[:] distances.append(self.distance(p, np.array(row))) if return_dict is not None: return_dict[i] = distances return distances
def polarDistance_multiple(self, outfile)
-
return the distance from all samples to the poles
Args
    outfile (str): desired output filename and path
Expand source code
def polarDistance_multiple(self, outfile): """return the distance from all samples to the poles Args: outfile (str): desired output filename and path """ if all(x is not None for x in [self.samples, self.cols, self.polar_features]): manager = mp.Manager() return_dict = manager.dict() processes = [] for i in range(len(self.samples)): p = mp.Process(target=self.polarDistance, args=(i, return_dict)) processes.append(p) [x.start() for x in processes] [x.join() for x in processes] pole_names = [] for index, row in self.polar_features[self.cols].iterrows(): pole_names.append(index) result=[x for x in return_dict.values()] result=pd.DataFrame(result,columns=pole_names).to_csv(outfile) else: raise ValueError("load data first!") return return_dict
def polar_separation(self, nsteps=0)
-
returns the distance between poles as a qdistance matrix
Args
    nsteps (int, optional): number of qsample steps applied to each pole before computing distances. Defaults to 0.
Expand source code
def polar_separation(self, nsteps=0): """returns the distance between poles as a qdistance matrix Args: nsteps (int, optional): [description]. Defaults to 0. """ polar_arraydata = self.polar_features[self.cols].fillna('').values.astype(str)[:] samples_ = [] for vector in polar_arraydata: bp = self.getBaseFrequency(vector) sample = qsample(vector, self.qnet, nsteps, baseline_prob=bp) samples_.append(sample) samples_ = np.array(samples_) self.polar_matrix = qdistance_matrix(samples_, samples_, self.qnet, self.qnet) return self.polar_matrix
def qsampling(self, sample, steps, immutable=False)
-
perturb the sample based on the qnet distributions and number of steps
Args
    sample (1d array-like): sample vector, must have the same number of features as the qnet
    steps (int): number of steps to qsample
    immutable (bool): whether some variables are immutable
Expand source code
def qsampling(self, sample, steps, immutable=False):
    '''perturb the sample based on the qnet distributions and number of steps

    Args:
        sample (1d array-like): sample vector, must have the same number of features as the qnet
        steps (int): number of steps to qsample
        immutable (bool): whether some variables are immutable
    '''
    if all(x is not None for x in [self.mutable_vars, sample]):
        if immutable:
            return qsample(sample, self.qnet, steps, self.getBaseFrequency(self.samples))
        else:
            return qsample(sample, self.qnet, steps)
    elif self.mutable_vars is None:
        raise ValueError("load_data first!")
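A short sketch, again on the assumed cg instance:

s = cg.samples_as_strings[0]                         # first loaded sample
free = cg.qsampling(s, steps=120)                    # unconstrained perturbation
pinned = cg.qsampling(s, steps=120, immutable=True)  # respects immutable vars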
def randomMaskReconstruction(self, index=None, return_dict=None, sample=None)
-
reconstruct a randomly masked sample by qsampling and compare it to the original; set self.mask_prob and self.steps beforehand if needed
Args
    index (int, optional): index of the sample to take. Defaults to None.
    return_dict (dict, optional): dict containing results. Defaults to None.
    sample (list[str], optional): sample vector, must have the same number of features as the qnet. Defaults to None.
Raises
    ValueError: if neither sample nor index is given
    ValueError: if both sample and index are given
Returns
    tuple: reconstruction error (percent), random and max match probabilities; direct calls also return s, qs, s_rand, and the mask
Expand source code
def randomMaskReconstruction(self, index=None, return_dict=None, sample=None): """reconstruct the masked sample by qsampling and comparing to original set self.mask_prob and self.steps if needed Args: return_dict (dict): dict containing results. Defaults to None. sample (list[str], optional): sample vector, must have the same num of features as the qnet. Defaults to None. index (int): index of sample to take. Defaults to None. Raises: ValueError: Neither sample or index were given ValueError: Both sample and index were given Returns: [type]: [description] """ if all(x is None for x in [sample, index]): raise ValueError("Must input either sample or index!") elif all(x is not None for x in [sample, index]): raise ValueError("Must input either sample or index not both!") elif sample is not None: s=np.array(pd.DataFrame(sample).fillna('').values.astype(str)[:]) elif index is not None: s=self.samples_as_strings[index] s1,bp,mask_,maskindex,rmatch_u,rmatch,s_rand=self.getMaskedSample(s, mask_prob=self.mask_prob) if np.isnan(bp).any(): return_dict[index] = np.nan,np.nan,np.nan return np.nan,np.nan,np.nan qs=qsample(s1,self.qnet,self.steps,bp) dqestim=qdistance(s,qs,self.qnet,self.qnet) dactual=qdistance(s,s1,self.qnet,self.qnet) qdistance_time_end = time.time() cmpf=pd.DataFrame([s,qs,s_rand],columns=self.cols,index=['s','q','r'])[mask_].transpose() cmpf.index.name='gssvar' cmpf.to_csv('examples_results/CMPF_2018/CMPF-'+str(index)+'.csv') return_dict[index] = (1 - (dqestim/dactual))*100,rmatch_u,rmatch return (1 - (dqestim/dactual))*100,rmatch_u,rmatch,s,qs,s_rand,mask_
def randomMaskReconstruction_multiple(self, out_file)
-
runs the masked-sample reconstruction for all samples and saves the results
Args
    out_file (str): directory and/or file for output
Returns
    tuple: mean reconstruction error and mean random error
Expand source code
def randomMaskReconstruction_multiple(self, out_file):
    '''runs the masked-sample reconstruction for all samples and saves the results

    Args:
        out_file (str): directory and/or file for output

    Returns:
        tuple: mean reconstruction error and mean random error
    '''
    manager = mp.Manager()
    return_dict = manager.dict()
    processes = []
    for i in range(len(self.samples)):
        p = mp.Process(target=self.randomMaskReconstruction, args=(i, return_dict))
        processes.append(p)
    [x.start() for x in processes]
    [x.join() for x in processes]
    # collect per-sample result tuples from the shared dict
    result = [x for x in return_dict.values() if isinstance(x, tuple)]
    result = pd.DataFrame(result, columns=['rederr', 'r_prob', 'rand_err'])
    result.rederr = result.rederr.astype(float)
    result.to_csv(out_file)
    return result.rederr.mean(), result.rand_err.mean()
def set_immutable_vars(self, IMMUTABLE_FILE)
-
split the vars into immutable and mutable sets; only needed if immutable vars must be set
Args
    IMMUTABLE_FILE (str): file containing the immutable features/vars
Expand source code
def set_immutable_vars(self, IMMUTABLE_FILE):
    '''split the vars into immutable and mutable sets

    Args:
        IMMUTABLE_FILE (str): file containing the immutable features/vars
    '''
    if self.cols is None:
        raise ValueError("load_data first!")
    self.immutable_vars = pd.read_csv(IMMUTABLE_FILE, index_col=0).transpose()
    self.mutable_vars = [x for x in self.cols
                         if x.upper() not in self.immutable_vars.columns]
def set_nsamples(self, num_samples)
-
select a random subset of the samples
Args
    num_samples (int): number of samples to keep
Expand source code
def set_nsamples(self, num_samples):
    '''select a random subset of the samples

    Args:
        num_samples (int): number of samples to keep
    '''
    self.samples = self.all_samples
    if all(x is not None for x in [num_samples, self.samples]):
        if num_samples > len(self.samples.index):
            string = 'The number of selected samples ({}) ' + \
                     'is greater than the number of samples ({})!'
            string = string.format(num_samples, len(self.samples.index))
            raise ValueError(string)
        if num_samples == len(self.samples.index):
            string = 'The number of selected samples ({}) ' + \
                     'is equal to the number of samples ({})!'
            string = string.format(num_samples, len(self.samples.index))
            print(string)
        self.samples = self.samples.sample(num_samples)
        self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
    elif self.samples is None:
        raise ValueError("load_data first!")
def set_poles(self, POLEFILE, pole_1, pole_2, steps=0, mutable=False)
-
set the poles and restrict the samples to features present in the poles
Args
    POLEFILE (str): file containing pole samples and features
    pole_1 (str): column name of the first pole to use
    pole_2 (str): column name of the second pole to use
    steps (int): number of steps to qsample
    mutable (bool): whether or not to set poles as the only mutable_vars
Expand source code
def set_poles(self, POLEFILE, pole_1, pole_2, steps=0, mutable=False):
    '''set the poles and restrict the samples to features present in the poles

    Args:
        POLEFILE (str): file containing pole samples and features
        pole_1 (str): column name of the first pole to use
        pole_2 (str): column name of the second pole to use
        steps (int): number of steps to qsample
        mutable (bool): whether or not to set poles as the only mutable_vars
    '''
    invalid_count = 0
    if all(x is not None for x in [self.samples, self.qnet]):
        poles = pd.read_csv(POLEFILE, index_col=0)
        self.poles = poles.transpose()
        self.polar_features = pd.concat([self.poles, self.features], axis=0)
        poles_dict = {}
        for column in poles:
            p_ = self.polar_features.loc[column][self.cols].fillna('').values.astype(str)[:]
            poles_dict[column] = self.qsampling(p_, steps)
        self.poles_dict = poles_dict
        self.pL = self.poles_dict[pole_1]
        self.pR = self.poles_dict[pole_2]
        self.d0 = qdistance(self.pL, self.pR, self.qnet, self.qnet)
        # keep only sample columns that also appear among the pole features
        cols = [x for x in self.poles.columns if x in self.samples.columns]
        self.samples = self.samples[cols]
        for x in self.poles.columns:
            if x not in self.samples.columns:
                invalid_count += 1
                self.samples[x] = np.nan
        self.samples = pd.concat([self.samples, self.features], axis=0)
        self.all_samples = self.samples
        self.samples_as_strings = self.samples[self.cols].fillna('').values.astype(str)[:]
        if mutable:
            self.mutable_vars = [x for x in self.cols if x in self.poles.columns]
    elif self.samples is None:
        raise ValueError("load_data first!")
    print("{} pole features not found in sample features".format(invalid_count))
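set_poles reads POLEFILE with index_col=0 and transposes it, so the file should have one column per pole and one row per feature; a hypothetical example:

import pandas as pd

# hypothetical poles.csv: pole names as columns, feature names as the index
poles = pd.DataFrame({'L': ['agree', 'yes'],
                      'R': ['disagree', 'no']},
                     index=['q1', 'q2'])
poles.to_csv('poles.csv')
# cg.set_poles('poles.csv', 'L', 'R')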