# Source code for pyblip.create_groups

""" Functions for creating candidate groups from posterior samples"""
import copy
import numpy as np
# Tree methods from scipy
import scipy.cluster.hierarchy as hierarchy
import scipy.spatial.distance as ssd

MIN_PEP = 1e-15 # for numerical stability

class CandidateGroup():
    """
    A single candidate group as an input to BLiP.
    Discovering a group asserts there is at least one signal in the group.

    Parameters
    ----------
    group : list or set
        Set of locations in the group.
    pep : float
        Posterior error probability (1 - PIP) for the group.
    data : dict
        Miscellaneous other attributes to associate with the group.
    """
    def __init__(self, group, pep, data=None):
        # Always store the group as a set of locations.
        self.group = set(group)
        self.pep = pep
        self.data = dict() if data is None else data

    def __str__(self):
        return f"CandidateGroup(group={self.group}, pep={self.pep}, data={self.data})"

    def __repr__(self):
        return self.__str__()

    def to_dict(self):
        """
        Converts ``CandidateGroup`` into a dictionary.
        """
        out = {'group': list(self.group), 'pep': self.pep}
        # Sets are not JSON-serializable, so convert any set-valued
        # metadata to lists; everything else is passed through as-is.
        for key, val in self.data.items():
            out[key] = list(val) if isinstance(val, set) else val
        return out
def all_cand_groups(
    samples,
    X=None,
    q=0,
    max_pep=1,
    max_size=25,
    prenarrow=True,
    prefilter_thresholds=(0, 0.01, 0.02, 0.03, 0.05, 0.1, 0.2),
):
    """
    Creates many candidate groups by prefiltering locations at various
    thresholds and then creating sequential and hierarchical candidate groups.

    Parameters
    ----------
    samples : np.ndarray
        An ``(N, p)``-shaped array of posterior samples,
        where a nonzero value indicates the presence of a signal.
    X : np.array
        The n x p design matrix. Defaults to ``None``. If provided, adds
        hierarchical groups based on a correlation cluster of X.
    q : float
        The nominal level at which to control the error rate.
    max_pep : float
        The maximum posterior error probability allowed in a candidate group.
        Default is 1.
    max_size : float
        Maximum size of a group. Default is 25.
    prenarrow : bool
        If true, "prenarrows" the candidate groups as described in the paper.
        Defaults to True.
    prefilter_thresholds : sequence of float
        Thresholds at which to prefilter the locations by marginal PIP.
        (A tuple default is used to avoid a shared mutable default argument.)

    Returns
    -------
    cand_groups : list
        A list of ``CandidateGroup`` objects.
    """
    samples = samples != 0
    marg_pips = np.mean(samples, axis=0)
    # Pre-filter features at several marginal-PIP thresholds; each threshold
    # yields its own sequential + hierarchical candidate groups.
    all_groups = set()
    all_cgs = []
    for thresh in prefilter_thresholds:
        rel_features = np.where(marg_pips > thresh)[0]
        if len(rel_features) == 0:
            continue
        # Sequential (contiguous) groups among the surviving features
        cgs = sequential_groups(
            samples[:, rel_features],
            q=q,
            max_pep=max_pep,
            max_size=max_size,
            prenarrow=prenarrow,
        )
        # Distance matrices for hierarchical clustering: one from the
        # posterior samples and (optionally) one from corr(X).
        dms = [_samples_dist_matrix(samples[:, rel_features])]
        if X is not None:
            dms.append(np.abs(1 - np.corrcoef(X[:, rel_features].T)))
        cgs.extend(hierarchical_groups(
            samples[:, rel_features],
            dist_matrix=dms,
            max_pep=max_pep,
            max_size=max_size,
            filter_sequential=True,
        ))
        # Map group indices back to the original feature indices and
        # deduplicate across thresholds.
        groups = []
        for cg in cgs:
            group = tuple(sorted(rel_features[list(cg.group)].tolist()))
            if group not in all_groups:
                cg.group = set(group)
                all_cgs.append(cg)
                groups.append(group)
        all_groups = all_groups.union(groups)
    return all_cgs
def susie_groups(
    alphas,
    X,
    q,
    max_pep=1,
    max_size=25,
    prenarrow=False
):
    """
    Creates candidate groups based on a SuSiE fit.

    Parameters
    ----------
    alphas : np.array
        An ``(L, p)``-shaped matrix of alphas from a SuSiE object, where
        ``L`` is the number of SuSiE iterations and ``p`` is the number
        of covariates.
    X : np.array
        The n x p design matrix. Defaults to ``None``. If provided, adds
        hierarchical groups based on a correlation cluster of X.
    q : float
        The level at which to control the error rate.
    max_pep : float
        The maximum posterior error probability allowed in a candidate group.
        Default is 1.
    max_size : float
        Maximum size of a group. Default is 25.
    prenarrow : bool
        If true, "prenarrows" the candidate groups as described in the paper.
        Defaults to False.

    Returns
    -------
    cand_groups : list
        A list of ``CandidateGroup`` objects.
    """
    L, p = alphas.shape
    # NOTE(review): seeds the global RNG; no randomness appears below —
    # confirm whether this is still needed.
    np.random.seed(1)
    # Sequential groups computed directly from the alpha matrix
    cand_groups = sequential_groups(
        susie_alphas=alphas,
        q=q,
        max_pep=max_pep,
        max_size=max_size,
        prenarrow=prenarrow,
    )
    # Hierarchical groups from the correlation structure of X, if provided
    if X is None:
        extra_groups = []
    else:
        extra_groups = _dist_matrices_to_groups(np.abs(1 - np.corrcoef(X.T)))
    # Groups suggested by SuSiE itself: for each iteration whose total alpha
    # mass is at least 1 - q, take the smallest prefix of features (sorted by
    # decreasing alpha) whose cumulative mass reaches 1 - q.
    for j in range(L):
        if np.sum(alphas[j]) >= 1 - q:
            order = np.argsort(-1 * alphas[j])
            k = np.min(np.where(np.cumsum(alphas[j, order]) >= 1 - q))
            extra_groups.append(order[0:(k + 1)].tolist())
    # Score the deduplicated extra groups and keep those with low PEP
    for g in _dedup_list_of_lists(extra_groups):
        if len(g) > max_size:
            continue
        # Contiguous groups are already produced by sequential_groups
        if np.max(g) - np.min(g) == len(g) - 1:
            continue
        # PEP of the group: product over iterations of (1 - alpha mass in g)
        iter_peps = 1 - alphas[:, g].sum(axis=1)
        iter_peps[iter_peps < MIN_PEP] = MIN_PEP  # for numerical stability
        pep = np.exp(np.log(iter_peps).sum())
        if pep < max_pep:
            cand_groups.append(CandidateGroup(
                group=set(g), pep=pep
            ))
    return cand_groups
def sequential_groups(
    samples=None,
    susie_alphas=None,
    q=0,
    max_pep=1,
    max_size=25,
    prenarrow=False
):
    """
    Calculates all sequential candidate groups below max_size.

    Parameters
    ----------
    samples : np.ndarray
        An ``(N, p)``-shaped array of posterior samples,
        where a nonzero value indicates the presence of a signal.
    susie_alphas : np.ndarray
        As an alternative to posterior samples, users may specify an
        L x p matrix of alphas from a SuSiE object. However, calling
        ``susie_groups`` is recommended instead.
    q : float
        The nominal level at which to control the error rate.
    max_pep : float
        The maximum posterior error probability allowed in a candidate group.
        Default is 1.
    max_size : float
        Maximum size of a group. Default is 25.
    prenarrow : bool
        If true, "prenarrows" the candidate groups
        as described in the paper. Defaults to False.

    Returns
    -------
    cand_groups : list
        A list of ``CandidateGroup`` objects.

    Raises
    ------
    ValueError
        If neither ``samples`` nor ``susie_alphas`` is specified.
    """
    if samples is not None:
        samples = samples != 0 # make boolean
        N, p = samples.shape
        max_size = min(max_size, p)
        # cum_incs[i, j] = number of signals among the first j locations
        # in sample i (column 0 is a zero pad so differences are easy).
        cum_incs = np.zeros((N, p+1))
        cum_incs[:, 1:(p+1)] = np.cumsum(samples, axis=1)
        # Compute successive groups of size m+1: all_PEPs[m][j] is the PEP
        # of the contiguous group {j, ..., j+m} (fraction of samples with
        # zero signals in that window).
        all_PEPs = {}
        for m in range(max_size):
            cum_diffs = cum_incs[:, (m+1):(p+1)] - cum_incs[:, :int(p-m)]
            all_PEPs[m] = np.mean(cum_diffs == 0, axis=0)
    elif susie_alphas is not None:
        L, p = susie_alphas.shape
        max_size = min(max_size, p)
        # Same construction using cumulative alpha mass per SuSiE iteration.
        cumalphas = np.zeros((L, p + 1))
        cumalphas[:, 1:(p+1)] = np.cumsum(susie_alphas, axis=1)
        # PEP of a window is the product over iterations of
        # (1 - alpha mass in the window), computed in log-space.
        all_PEPs = {}
        for m in range(max_size):
            cumdiffs = 1 - (cumalphas[:, (m+1):(p+1)] - cumalphas[:, :int(p-m)])
            cumdiffs[cumdiffs < MIN_PEP] = MIN_PEP  # numerical stability
            all_PEPs[m] = np.exp(np.log(cumdiffs).sum(axis=0))
    else:
        raise ValueError("Either samples or susie_alphas must be specified.")

    # active_inds[m] holds the starting indices of the size-(m+1) windows
    # whose PEP is below max_pep; the index is the first (smallest)
    # variable in the group.
    active_inds = {}
    for m in range(max_size):
        active_inds[m] = np.where(all_PEPs[m] < max_pep)[0]

    # Prenarrowing: iteratively maintain a set elim_inds so that when
    # considering the groups of size m+1, elim_inds contains all starting
    # indices that are redundant (already covered by a smaller group with
    # PEP < q / 2).
    if prenarrow:
        elim_inds = set(np.where(all_PEPs[0] < q / 2)[0].tolist())
        for m in range(1, max_size):
            # If index j is eliminated for level m-1, indexes j and j-1
            # are eliminated for level m (both windows contain the
            # eliminated smaller window).
            elim_inds = elim_inds.union(set([x-1 for x in elim_inds]))
            # Drop eliminated starting indices from active_inds[m]
            update = set(active_inds[m].tolist()) - elim_inds
            active_inds[m] = np.array(list(update))
            # At this level, account for groups with PEP < q / 2
            elim_inds = elim_inds.union(set(
                np.where(all_PEPs[m] < q / 2)[0].tolist()
            ))

    # Step 3: create candidate groups from the surviving windows
    cand_groups = []
    for m in range(max_size):
        for ind in active_inds[m]:
            group = set(list(range(ind, ind+m+1)))
            pep = all_PEPs[m][ind]
            cand_groups.append(CandidateGroup(
                group=group, pep=pep
            ))
    return cand_groups
def hierarchical_groups(
    samples,
    dist_matrix=None,
    max_pep=1,
    max_size=25,
    filter_sequential=False,
    **kwargs
):
    """
    Creates candidate groups by hierarchical clustering.

    Parameters
    ----------
    samples : np.ndarray
        An ``(N, p)``-shaped array of posterior samples,
        where a nonzero value indicates the presence of a signal.
    dist_matrix : np.ndarray or list
        A square numpy array corresponding to distances between locations.
        Can also be a list of different distance matrices. The default is
        to use a correlation matrix based on ``samples``.
    max_pep : float
        The maximum posterior error probability allowed in a candidate group.
        Default is 1.
    max_size : float
        Maximum size of a group. Default is 25.
    filter_sequential : bool
        If True, does not calculate PEPs for sequential (contiguous)
        groups of variables to avoid duplicates.

    Returns
    -------
    cand_groups : list
        A list of ``CandidateGroup`` objects.
    """
    p = samples.shape[1]
    samples = samples != 0
    # Trivial case: with a single feature there is exactly one group.
    if p == 1:
        return [CandidateGroup(group=set([0]), pep=1 - samples.mean())]
    # Default distance matrix is estimated from the posterior samples.
    if dist_matrix is None:
        dist_matrix = _samples_dist_matrix(samples)
    # Build candidate groups from the hierarchical clustering tree(s).
    cand_groups = []
    for group in _dist_matrices_to_groups(dist_matrix, **kwargs):
        gsize = len(group)
        if gsize > max_size:
            continue
        # Optionally skip contiguous groups (handled by sequential_groups)
        if filter_sequential and np.max(group) - np.min(group) == gsize - 1:
            continue
        # PEP = fraction of samples with no signal anywhere in the group
        pep = 1 - np.any(samples[:, group], axis=1).mean()
        if pep < max_pep:
            cand_groups.append(
                CandidateGroup(group=set(group), pep=pep)
            )
    return cand_groups
def _extract_groups(root, p): """ Extracts the set of all groups from a scipy hierarchical clustering tree. """ output = [] queue = [] queue.append(root) while len(queue) > 0: node = queue.pop(0) if node.left is not None: queue.append(node.left) if node.right is not None: queue.append(node.right) output.append(node.pre_order()) return output def _dedup_list_of_lists(x): return list(set(tuple(sorted(i)) for i in x)) def _dist_matrices_to_groups( dist_matrices, cluster_funcs=None, ): """ Creates groups based on corr_matrix using single, average, and hierarchical clustering. """ if isinstance(dist_matrices, np.ndarray): dist_matrices = [dist_matrices] p = dist_matrices[0].shape[0] all_groups = [] if cluster_funcs is None: cluster_funcs = [hierarchy.single, hierarchy.average, hierarchy.complete] for dist_matrix in dist_matrices: # prevent numerical errors dist_matrix -= np.diag(np.diag(dist_matrix)) #lower_inds = np.tril_indices(p, -1) #dist_matrix[lower_inds] = dist_matrix.T[lower_inds] dist_matrix = (dist_matrix.T + dist_matrix) / 2 # turn into scipy format condensed_dist_matrix = ssd.squareform(dist_matrix) # Run hierarchical clustering all_groups = [] for cluster_func in cluster_funcs: link = cluster_func(condensed_dist_matrix) groups = _extract_groups(hierarchy.to_tree(link), p=p) all_groups.extend(groups) # deduplicate all_groups = _dedup_list_of_lists(all_groups) return all_groups def _samples_dist_matrix(samples): p = samples.shape[1] precorr = np.concatenate( [ samples, np.zeros((1, p)), np.ones((1, p)) ], axis=0 ) corr_matrix = np.corrcoef(precorr.T) dist_matrix = corr_matrix + 1 return dist_matrix def _prefilter(cand_groups, max_pep): """ Returns the subset of cand_groups with a pep below max_pep. """ return [ x for x in cand_groups if x.pep < max_pep ] def _elim_redundant_features(cand_groups): """ After prefiltering groups, some features/locations may not appear in any candidate groups. 
When this happens, this function reindexes the locations to improve the efficiency of the BLiP solver. Parameters ---------- cand_groups : list A list of CandidateGroup objects. Returns ------- cand_groups : list A list of CandidateGroup objects, but with a "blip-group" attribute that reindexes the features to avoid redundancy. nrel : int The number relevant features. """ # Step 1: find relevant features active_features = set([j for x in cand_groups for j in x.group]) # Step 2: change feature inds to save computation nrel = len(active_features) orig2new = {} for i, j in enumerate(list(active_features)): orig2new[j] = i for cand_group in cand_groups: blip_group = [orig2new[x] for x in cand_group.group] cand_group.data['blip-group'] = set(blip_group) # return return cand_groups, nrel