Source code for pyblip.create_groups_cts

""" 
Creates candidate groups when signals could appear anywhere in a
continuous d-dimensional space. 
"""
import time
import copy
import numpy as np
import networkx as nx
import warnings

from .utilities import elapsed
from .create_groups import CandidateGroup
from .ecc import edge_clique_cover
from collections import Counter

TOL = 1e-10

def normalize_locs(locs):
    """
    Parameters
    ----------
    locs : np.array
        A (N, num_disc, d)-dimensional array. Here, N is the number of
        samples from the posterior, d is the number of dimensions of the
        space, and each point corresponds to a signal in a particular
        posterior sample.

    Returns
    -------
    locs : np.array
        locs, but normalized such that all values lie in [0,1].
        NAs are converted to -1.
    shifts : np.array
        ``d``-dimensional array corresponding to the shifts applied
        in the normalization.
    scales : np.array
        ``d``-dimensional array corresponding to the scales in the
        normalization.
    """
    min_val = np.nanmin(np.nanmin(locs, axis=0), axis=0)
    max_val = np.nanmax(np.nanmax(locs, axis=0), axis=0)
    shifts = min_val
    scales = max_val - min_val
    norm_locs = (locs - shifts) / scales
    return norm_locs, shifts, scales
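# Usage sketch (hypothetical data, not part of the module): draw synthetic
# posterior samples in an arbitrary rectangle and normalize them to [0,1]^d
# before calling ``grid_peps``.
#
# >>> import numpy as np
# >>> from pyblip import create_groups_cts
# >>> locs = np.random.uniform(-5, 5, size=(100, 3, 2))  # (N, num_disc, d)
# >>> norm_locs, shifts, scales = create_groups_cts.normalize_locs(locs)
# >>> assert np.nanmin(norm_locs) >= 0 and np.nanmax(norm_locs) <= 1
# >>> locs_recovered = norm_locs * scales + shifts  # inverts the normalization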
def _additional_circle_centers(
    samples,
    centers,
    center_list,
    radius,
    gsize
):
    N, d = samples.shape
    radius2 = np.power(radius, 2)
    # Check upper/lower/left/right
    for j in range(d):
        for offset in [-1, 1]:
            centers_new = centers.copy()
            centers_new[:, j] += offset / gsize
            # Check if points are in the offset centers
            included = np.power(samples - centers_new, 2).sum(axis=1) <= radius2
            included = included & (np.max(centers_new, axis=1) < 1 + TOL)
            included = included & (np.min(centers_new, axis=1) > -TOL)
            center_list.extend([
                tuple(list(c)) for c in centers_new[included]
            ])
    return center_list


def _find_centers(
    samples,
    gsize,
    shape
):
    # Find central boxes containing samples
    N, d = samples.shape
    # lower-left corner (comments in 2d for intuition)
    corners = np.floor(samples * gsize)
    corners = corners.astype(float) / gsize
    # Adjust to make centers and check for unexpected shapes
    centers = corners + 1 / (2 * gsize)
    center_list = [tuple(list(c)) for c in centers]
    if shape == 'circle':
        # Radius of Euclidean balls
        if d != 2:
            raise NotImplementedError(
                f"shape='circle' is only implemented for 2-d data, but detected {d} dimensions"
            )
        radius = np.sqrt(d) / (2 * gsize)
    elif shape == 'square':
        radius = 1 / (2 * gsize)
    else:
        raise ValueError(f"Unrecognized shape={shape}, must be one of 'square', 'circle'")
    # Add extra centers for circles only
    if shape == 'circle' and N > 0:
        center_list = _additional_circle_centers(
            samples, centers, center_list, radius, gsize
        )
    return [tuple(np.around(center, 10)) + (radius,) for center in center_list]
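# Illustration of ``_find_centers`` on made-up points: with gsize=4 and
# shape='square', each sample maps to the center of the grid cell containing
# it, with the cell's half-width (1/8) appended as the radius.
#
# >>> samples = np.array([[0.1, 0.2], [0.6, 0.9]])
# >>> _find_centers(samples, gsize=4, shape='square')
# # -> [(0.125, 0.125, 0.125), (0.625, 0.875, 0.125)]
#
# With shape='circle', the radius becomes sqrt(2)/8 and
# ``_additional_circle_centers`` also adds adjacent cell centers whose
# circumscribing balls contain a sample.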
def grid_peps(
    locs,
    grid_sizes,
    count_signals=False,
    extra_centers=None,
    max_pep=0.25,
    log_interval=None,
    time0=None,
    shape='square'
):
    """
    Parameters
    ----------
    locs : np.array
        A (N, num_disc, d)-dimensional array. Here, N is the number of
        samples from the posterior, d is the number of dimensions of the
        space, and each point corresponds to a signal in a particular
        posterior sample.
    grid_sizes : list or np.ndarray
        List of grid sizes to split up the locations.
    count_signals : bool
        If True, the number of signals in each group is counted.
    extra_centers : np.ndarray
        A (ncenters, d)-dimensional array. At each resolution, a candidate
        group will be computed with centers at these locations.
    max_pep : float
        The maximum posterior error probability (PEP) for candidate groups;
        groups with a larger PEP are filtered out. Default: 0.25.
    log_interval : int
        If provided, prints progress every ``log_interval`` posterior
        samples. Default: None (no logging).
    time0 : float
        The initial time the analysis started, useful for logging.
    shape : string
        One of ``square`` or ``circle``.

    Returns
    -------
    filtered_peps : dict
        Maps ``(*center, radius)`` keys to estimated PEPs (or, when
        ``count_signals=True``, to dictionaries of signal-count
        probabilities), keeping only keys whose PEP is at most ``max_pep``.
    """
    if time0 is None:
        time0 = time.time()
    if np.nanmin(locs) < -TOL or np.nanmax(locs) > 1 + TOL:
        raise ValueError(
            "locs are not normalized: apply create_groups_cts.normalize_locs first."
        )
    # Create PIPs
    pips = dict()
    N = locs.shape[0]
    d = locs.shape[2]
    if extra_centers is not None:
        n_extra_cent = extra_centers.shape[0]
    for j in range(N):
        # Ignore dummy discoveries
        active = ~np.any(np.isnan(locs[j]), axis=1)
        samples = locs[j, active]
        # Loop through grid sizes and find centers
        all_centers = []
        for gsize in grid_sizes:
            all_centers.extend(_find_centers(
                samples=samples, gsize=gsize, shape=shape
            ))
        # Repeat for manually added centers
        all_extra_centers = []
        if extra_centers is not None and n_extra_cent > 0 and samples.shape[0] > 0:
            dists = extra_centers.reshape(n_extra_cent, 1, d) - samples.reshape(1, -1, d)
            if shape == 'square':
                dists = np.abs(dists).max(axis=2)
            else:
                dists = np.sqrt(np.power(dists, 2).sum(axis=2))
            min_dists = dists.min(axis=1)
            for gsize in grid_sizes:
                radius = 1 / (2 * gsize)
                if shape == 'circle':
                    radius = np.sqrt(d) * radius
                for nc in np.where(min_dists <= radius)[0]:
                    key = tuple(extra_centers[nc]) + (radius,)
                    # Just add this once if we are looking for the presence of a signal
                    if not count_signals:
                        all_extra_centers.append(key)
                    # else count num. signals in this region
                    else:
                        nsignals = np.sum(dists[nc] <= radius)
                        all_extra_centers.extend([key for _ in range(nsignals)])

        # Update PIPs
        final_centers = all_centers + all_extra_centers
        if not count_signals:
            final_centers = set(final_centers)
            for key in final_centers:
                if key not in pips:
                    pips[key] = 1 / N
                else:
                    pips[key] += 1 / N
        else:
            counter = Counter(final_centers)
            for key in counter:
                count = counter[key]
                if key not in pips:
                    pips[key] = {count: 1 / N, 'pip': 1 / N}
                else:
                    if count not in pips[key]:
                        pips[key][count] = 1 / N
                    else:
                        pips[key][count] += 1 / N
                    pips[key]['pip'] += 1 / N
        if log_interval is not None:
            if j % log_interval == 0:
                print(f"Computing PEPs: finished with {j+1} / {N} posterior samples at {elapsed(time0)}.")

    # Filter out groups whose PEP exceeds max_pep
    filtered_peps = {}
    for key in pips.keys():
        if not count_signals:
            pep = 1 - pips[key]
            if pep <= max_pep:
                filtered_peps[key] = pep
        else:
            pep = 1 - pips[key]['pip']
            if pep <= max_pep:
                filtered_peps[key] = pips[key]
    return filtered_peps
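# Usage sketch for ``grid_peps`` (hypothetical values): given normalized
# locations, estimate PEPs for grid cells at several resolutions. Keys of
# the returned dict are ``(center_1, ..., center_d, radius)`` tuples.
#
# >>> peps = grid_peps(norm_locs, grid_sizes=[4, 8, 16], max_pep=0.25)
# >>> key, pep = next(iter(peps.items()))
# >>> key  # e.g. (0.125, 0.875, 0.125), with pep e.g. 0.02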
# def _postprocess_peps_count_signals(
#     filtered_peps,
#     max_pep=0.5,
#     cutoff=0.95,
# ):
#     """
#     Given the output of grid_peps with count_signals=True,
#     postprocesses the result. This is a helper function for
#     grid_peps_to_cand_groups.
#     """
#     output = dict()
#     for key in filtered_peps.keys():
#         nsignals = [x for x in filtered_peps[key].keys() if x != 'pip']
#         props = [filtered_peps[key][n] for n in nsignals]
#         inds = np.argsort(props)
def grid_peps_to_cand_groups(
    filtered_peps,
    time0=None,
    max_blip_size=1000,
    verbose=False,
    shape='square',
    max_pep=1,
    min_pep=0.001,
):
    """
    Turns the output of the ``grid_peps`` function into a list of lists of
    CandidateGroups. Each sub-list corresponds to a set of completely
    disconnected CandidateGroups which can be fed to BLiP separately
    (this saves computation).

    Parameters
    ----------
    filtered_peps : dict
        An output of the ``grid_peps`` function.
    time0 : float
        The initial time the analysis started, useful for logging.
    max_blip_size : int
        Maximum size of a problem that can be fed into BLiP.
    verbose : bool
        If True, will report progress over time. Default: False.
    shape : string
        One of ``square`` or ``circle``.
    max_pep : float
        The maximum PEP for candidate groups. Default: 1.
    min_pep : float
        Once we achieve a PEP of this level for candidate groups,
        we do not search to find PEPs any lower. Default: 0.001.

    Returns
    -------
    all_cand_groups : list
        A list of lists of CandidateGroup objects; each sub-list is a
        connected component which can be run through BLiP separately.
    merged_components : list
        A list of lists of indices into the sorted keys of
        ``filtered_peps``, one per sub-list of ``all_cand_groups``.
    """
    if time0 is None:
        time0 = time.time()

    # Step 0: determine whether filtered_peps counts the number of signals or not
    ngroups = len(filtered_peps)
    if ngroups == 0:
        return [], []
    keys = sorted(filtered_peps.keys())
    if isinstance(filtered_peps[keys[0]], dict):
        count_signals = True
        peps_arr = 1 - np.array([filtered_peps[k]['pip'] for k in keys])
    else:
        count_signals = False
        peps_arr = np.array([filtered_peps[k] for k in keys])

    # Step 1: compute adjacency matrix
    if ngroups > 50000:
        warnings.warn(f"Computing adjacency matrix may be too inefficient for {ngroups} candidate groups.")
    d = len(keys[0]) - 1  # dimensionality of problem
    if verbose:
        print(f"Constructing constraint matrix with ngroups={ngroups} at {elapsed(time0)}")
    constraints = np.ones((ngroups, ngroups)).astype(bool)
    centers = np.zeros((d, ngroups))
    centers[:] = np.nan
    radii = np.array([float(k[-1]) for k in keys])
    for j in range(d):
        # Extract centers of groups
        centers[j] = np.array([k[j] for k in keys])
        if centers[j].max() > 1 or centers[j].min() < 0:
            print(f"centers max = {centers[j].max()}, centers min = {centers[j].min()}")
            raise ValueError("centers must be between 0 and 1, but this is not true")

    # Find overlaps and add to constraints
    radii = radii.astype(np.float32)
    centers = centers.astype(np.float32)
    deltas = radii.reshape(-1, 1) + radii.reshape(1, -1)
    if shape == 'square':
        for j in range(d):
            constraints = constraints & (
                np.abs(centers[j].reshape(-1, 1) - centers[j].reshape(1, -1)) < deltas
            )
    elif shape == 'circle':
        dists = np.sqrt(np.power(
            centers.reshape(d, ngroups, 1) - centers.reshape(d, 1, ngroups), 2
        ).sum(axis=0))
        constraints = dists < deltas
    else:
        raise ValueError(f"Unrecognized shape={shape}, must be one of 'square', 'circle'")

    # Step 2: split problem into connected components
    if verbose:
        print(f"Isolating connected components at {elapsed(time0)}. Graph has {np.sum(constraints)} edges.")
    G = nx.Graph(constraints)
    components = list(nx.algorithms.components.connected_components(G))
    merged_components = [[]]
    for c in components:
        if len(merged_components[-1]) + len(c) > max_blip_size:
            merged_components.append([])
        merged_components[-1].extend(list(c))
    del G  # save memory

    # Step 3: construct cand_groups for BLiP
    all_cand_groups = []
    for compnum, component in enumerate(merged_components):
        component_cand_groups = []
        component_groups = [[] for _ in component]
        subG = nx.Graph(constraints[component, :][:, component])
        # Construct groups for BLiP using as few locations as possible.
        # This is related to the problem of listing the maximal cliques
        # of subG.
        if verbose:
            print(f"Finding list of cliques for component {compnum} / {len(merged_components)} with dim {len(component)} at {elapsed(time0)}")
        # cliques = nx.algorithms.clique.find_cliques(subG)
        cliques = edge_clique_cover(subG)
        for cliquenum, clique in enumerate(cliques):
            for j in clique:
                component_groups[j].append(cliquenum)
        if verbose:
            print(f"Finished finding cliques, constructing cand_groups at {elapsed(time0)}")
        for ii, j in enumerate(component):
            group = set(component_groups[ii])
            data_dict = dict(radius=radii[j])
            for k in range(d):
                data_dict[f'dim{k}'] = centers[k, j]
            data_dict['center'] = centers[:, j].tolist()
            # Add many candidate groups with varying confidence regions
            # for the number of signals
            if count_signals:
                key = keys[j]
                nsignals = np.array([
                    x for x in filtered_peps[key].keys() if x != 'pip'
                ])
                props = np.array([filtered_peps[key][n] for n in nsignals])
                inds = np.argsort(-1 * props)
                nsignals = nsignals[inds]
                cumprops = np.cumsum(props[inds])
                for ell in range(len(nsignals)):
                    if cumprops[ell] >= 1 - max_pep:
                        nsignal_ci = set(nsignals[0:(ell + 1)].tolist())
                        data_dict['nsignals'] = nsignal_ci
                        component_cand_groups.append(
                            CandidateGroup(
                                group=group,
                                pep=max(0, 1 - cumprops[ell]),
                                data=copy.deepcopy(data_dict)
                            )
                        )
                        if cumprops[ell] >= 1 - min_pep:
                            break
            else:
                component_cand_groups.append(
                    CandidateGroup(
                        group=group,
                        pep=peps_arr[j],
                        data=data_dict
                    )
                )
        all_cand_groups.append(component_cand_groups)

    # Return
    return all_cand_groups, merged_components