import time
import warnings
import copy
import numpy as np
import cvxpy as cp
from . import create_groups, create_groups_cts, weight_fns
# Find default solver
from cvxpy.reductions.solvers.defines import INSTALLED_SOLVERS
if 'GUROBI' in INSTALLED_SOLVERS:
    DEFAULT_SOLVER = 'GUROBI'
elif 'CBC' in INSTALLED_SOLVERS:
    DEFAULT_SOLVER = 'CBC'
else:
    DEFAULT_SOLVER = 'ECOS'
# Solver for binarization (very small-scale mixed-integer LP)
# GLPK is a good solver in general, but it has known
# bugs in very small problems like this one. See
# https://github.com/cvxpy/cvxpy/issues/1112
if 'CBC' in INSTALLED_SOLVERS:
    BIN_SOLVER = 'CBC'
else:
    BIN_SOLVER = None  # default
WEIGHT_FNS = {
    'inverse_size': weight_fns.inverse_size,
    'log_inverse_size': weight_fns.log_inverse_size,
}
ERROR_OPTIONS = ['fdr', 'local_fdr', 'fwer', 'pfer']
BINARY_TOL = 1e-3
DEFAULT_GRID_SIZES = np.around(np.logspace(np.log10(50), np.log10(4), 25))


def BLiP(
    samples=None,
    cand_groups=None,
    weight_fn='inverse_size',
    error='fdr',
    q=0.1,
    max_pep=0.5,
    deterministic=True,
    verbose=True,
    perturb=True,
    max_iters=100,
    search_method='binary',
    return_problem_status=False,
    **kwargs
):
"""
Given samples from a posterior or a list of ``CandidateGroup``
objects, performs resolution-adaptive signal detection
to maximize power while controlling (e.g.) the FDR.
Note: when working with image data or a continuous set
of locations, consider using ``BLiP_cts``.
Parameters
----------
samples : np.array
An ``(N, p)``-shaped array of posterior samples,
where a nonzero value indicates the presence of a signal.
Defaults to `None`.
cand_groups : list
A list of ``CandidateGroup`` objects. Defaults to `None`.
Note either ``cand_groups`` or ``samples`` must be provided.
weight_fn : string or function
A function mapping ``CandidateGroup`` objects to nonnegative weights.
This defines the power function. Also can be 'inverse_size',
'log_inverse_size', or 'prespecified'. Defaults to 'inverse_size'.
error : string
The type of error rate to control. Must be one of
'fdr', 'local_fdr', 'fwer', 'pfer'.
q : float
The level at which to control the error rate.
max_pep : float
BLiP automatically filters out candidate groups with
a posterior error probability (PEP) above ``max_pep``.
Default: 0.5.
deterministic : bool
If True, guarantees a deterministic solution. Defaults to True.
verbose : bool
If True, gives occasional progress reports.
search_method : string
For FWER control, how to find the optimal parameter
for the LP. Either "none" or "binary." Defaults to "binary".
max_iters : int
Maximum number of binary-search iterations for FWER problem.
Defaults to 100.
perturb : bool
If True, will perturb the weight function. Defaults to True.
return_problem_status : False
If True, will return extra information about the problem.
Returns
-------
detections : list
A list of ``CandidateGroup`` objects, which asserts that each
group contains at least one signal.
problem_status : dict
A dict containing information about the BLiP optimization problem.
Only returned if ``return_problem_status=True``.
Examples
--------
Here we fit BLiP to perform resolution-adaptive variable selection
for a Gaussian linear model: ::
import numpy as np
import scipy.linalg
## Synthetic data generating process with AR1 design matrix
n, p, nsignals, rho = 100, 500, 20, 0.95
c = np.cumsum(np.zeros(p) + np.log(rho)) - np.log(rho)
cov = scipy.linalg.toeplitz(np.exp(c))
X = np.dot(np.random.randn(n, p), np.linalg.cholesky(cov).T)
# Sparse coefficients for linear model
beta = np.zeros(p)
signal_locations = np.random.choice(np.arange(p), nsignals)
beta[signal_locations] = np.random.randn(nsignals)
y = np.dot(X, beta) + np.random.randn(n)
## Fit linear spike and slab model
import pyblip
lm = pyblip.linear.LinearSpikeSlab(X=X, y=y)
lm.sample(N=1000, chains=10, bsize=2)
## Run BLiP
detections = pyblip.blip.BLiP(
samples=lm.betas,
q=0.1,
error='fdr'
)
"""
    # Parse arguments
    time0 = time.time()
    error = str(error).lower()
    if error not in ERROR_OPTIONS:
        raise ValueError(f"error type {error} must be one of {ERROR_OPTIONS}")
    if cand_groups is None and samples is None:
        raise ValueError("At least one of cand_groups and samples must be provided.")
    if error in ['fwer', 'pfer', 'local_fdr']:
        max_pep = min(max_pep, q)  # this is optimal for all error rates except 'fdr'
    solver = kwargs.get("solver", DEFAULT_SOLVER)
    if solver == 'ECOS' and verbose:
        warnings.warn("Using ECOS solver, which will be slightly slower. Consider installing CBC.")
    # Create cand_groups if necessary
    if cand_groups is None:
        cand_groups = create_groups.sequential_groups(
            samples=samples,
            q=q,
            max_pep=max_pep,
            prenarrow=True,
        )
        cand_groups.extend(
            create_groups.hierarchical_groups(
                samples=samples,
                max_pep=max_pep,
                filter_sequential=True
            )
        )
    # Prefilter the candidate groups
    cand_groups = create_groups._prefilter(cand_groups, max_pep=max_pep)
    # Edge case where nothing is below max_pep
    if len(cand_groups) == 0:
        if return_problem_status:
            return [], dict(
                nlocs=0,
                ngroups=0,
                backtracking_iter=0,
                ngroups_nonint=0,
                lp_bound=0,
                deterministic=deterministic,
            )
        return []
    # Construct a re-indexing which does not include redundant features
    cand_groups, nrel = create_groups._elim_redundant_features(cand_groups)
    # Weights for each candidate group
    ngroups = len(cand_groups)
    if weight_fn == 'prespecified':
        weights = np.array([x.data['weight'] for x in cand_groups])
    else:
        if 'weight' in cand_groups[0].data:
            warnings.warn(
                "Cand groups have a 'weight' attribute, do you mean to set weight_fn='prespecified'?"
            )
        if isinstance(weight_fn, str):
            weight_fn = weight_fn.lower()
            if weight_fn not in WEIGHT_FNS:
                raise ValueError(f"Unrecognized weight_fn={weight_fn}, must be one of {list(WEIGHT_FNS.keys())}")
            weight_fn = WEIGHT_FNS[weight_fn]
        # Get weights
        weights = np.array([weight_fn(x) for x in cand_groups])
    # Perturb to ensure a unique solution
    orig_weights = weights.copy()
    if perturb:
        weights = np.array([
            w * (1 + 0.0001 * np.random.uniform()) for w in weights
        ])
    # Extract peps
    peps = np.array([x.pep for x in cand_groups])
    # Constraints to ensure selected groups are disjoint
    if verbose:
        print(f"BLiP problem has {ngroups} groups in contention, with {nrel} active features/locations")
    A = np.zeros((ngroups, nrel), dtype=bool)
    for gj, cand_group in enumerate(cand_groups):
        for feature in cand_group.data['blip-group']:
            A[gj, feature] = 1
    # Assemble constraints, variables
    x = cp.Variable(ngroups)
    x.value = np.zeros(ngroups)
    b = np.ones(nrel)
    v_param = cp.Parameter()
    v_param.value = q
    v_var = cp.Variable(pos=True)  # for FDR only
    # We perform a binary search for FWER to find the optimal v.
    binary_search = (
        samples is not None and error == 'fwer' and search_method == 'binary'
    )
    # Construct problem
    constraints = [
        x >= 0,
        x <= 1,
        A.T @ x <= b
    ]
    # No extra error constraint is needed for local_fdr: prefiltering
    # at max_pep <= q already enforces it.
    if error in ['pfer', 'fwer']:
        constraints += [
            x @ peps <= v_param
        ]
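    # (For FDR below: x @ peps bounds the expected number of false discoveries
    # and cp.sum(x) counts discoveries, so x @ peps <= v_var together with
    # cp.sum(x) >= v_var / q enforces an FDR of at most q.)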
    # Extra constraints for fdr
    elif error == 'fdr':
        constraints += [
            x @ peps <= v_var,
            cp.sum(x) >= v_var / q,
            v_var >= 0,
            v_var <= q * nrel + 1
        ]
    # Create problem
    objective = cp.Maximize(((1-peps) * weights) @ x)
    problem = cp.Problem(objective=objective, constraints=constraints)
    # Solve problem once if we do not need to search over v
    if not binary_search:
        problem.solve(solver=solver)
        selections = x.value
    # Solve FWER through binary search
    elif binary_search:
        v_upper = q * nrel + 1  # upper bound in search (min value not controlling FWER)
        v_lower = 0  # lower bound in search (max value controlling FWER)
        samples = samples != 0  # convert to boolean signal indicators
        for niter in range(max_iters):
            # Change parametrized constraint
            v_current = (v_upper + v_lower) / 2
            v_param.value = v_current
            # Solve
            problem.solve(solver=solver, warm_start=True)
            selections = x.value
            # Round selections---could do something smarter, TODO
            selections = np.around(selections)
            # Compute exact FWER
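            # A selected group is a false discovery in a posterior sample if it
            # contains no signals in that sample; fwer estimates the probability
            # of making at least one false discovery.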
            false_disc = np.zeros(samples.shape[0]).astype(bool)
            for gj in np.where(selections > BINARY_TOL)[0]:
                group = list(cand_groups[gj].group)
                false_disc = false_disc | np.all(samples[:, group] == 0, axis=1)
            fwer = false_disc.mean()
            if fwer > q:
                v_upper = v_current
            else:
                v_lower = v_current
            # Possibly break
            if v_upper - v_lower < 1e-4:
                break
        # Solve with v_lower for the final solution
        v_param.value = v_lower
        problem.solve(solver=solver, warm_start=True)
        if problem.status == 'infeasible':
            selections = np.zeros(ngroups)
        else:
            selections = x.value
    # TODO: could do something smarter for the FWER binary search
    if error == 'fwer' and binary_search:
        selections = np.around(selections)
    for cand_group, sprob, weight in zip(cand_groups, selections, orig_weights):
        cand_group.data['sprob'] = sprob
        cand_group.data['weight'] = weight
    # Diagnostics to (optionally) return
    problem_status = dict(
        ngroups=ngroups,
        nlocs=nrel,
        lp_bound=np.dot(selections, (1-peps) * weights),
        backtracking_iter=0,
        deterministic=deterministic,
    )
    return binarize_selections(
        cand_groups=cand_groups,
        q=q,
        error=error,
        deterministic=deterministic,
        problem_status=problem_status,
        return_problem_status=return_problem_status,
        verbose=verbose,
    )


def BLiP_cts(
    locs,
    grid_sizes=DEFAULT_GRID_SIZES,
    weight_fn=weight_fns.inverse_radius_weight,
    max_pep=0.25,
    max_blip_size=1500,
    rescale=True,
    **kwargs
):
"""
BLiP when the set of locations is continuous, e.g., when
working with image data.
Parameters
----------
locs : np.ndarray
A (N, num_disc, d)-dimensional array, where ``N`` is the
number of posterior samples, ``d`` is the dimension.
Each point represents to a signal in a posterior sample.
NANs are ignored (useful when the number of signals changes
between posterior iterations.)
grid_sizes : list or np.ndarray
List of grid sizes to split up the locations.
rescale : bool
If True, normalizes locs to ensure they are all between 0 and 1.
If ``rescale`` is False and this is not true, the function
will error.
kwargs : dict
Additional arguments to pass to the underlying BLiP call.
Returns
-------
detections : list
A list of ``CandidateGroup`` objects, which asserts that each
group contains at least one signal.
problem_status : dict
A dict containing information about the BLiP optimization problem.
Only returned if ``return_problem_status=True``.
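
    Examples
    --------
    A minimal sketch on synthetic data; the shapes and values below are
    purely illustrative: ::

        import numpy as np
        import pyblip
        # 500 posterior samples, at most 5 signals each, in 2 dimensions
        locs = np.random.uniform(size=(500, 5, 2))
        # NaN-out some entries, as when the number of signals varies per sample
        locs[np.random.uniform(size=(500, 5)) < 0.5] = np.nan
        detections = pyblip.blip.BLiP_cts(locs, error='fdr', q=0.1)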
"""
    # Normalize locations
    d = locs.shape[2]  # dimensionality
    if rescale:
        norm_locs, shifts, scales = create_groups_cts.normalize_locs(locs)
    else:
        minval = locs[~np.isnan(locs)].min()
        maxval = locs[~np.isnan(locs)].max()
        if minval < 0 or maxval > 1:
            raise ValueError("rescale=False but locs are not between 0 and 1")
        norm_locs = locs.copy()
        shifts = np.zeros(d)
        scales = np.ones(d)
    # 1. Calculate filtered PEPs
    peps = create_groups_cts.grid_peps(
        locs=norm_locs, grid_sizes=grid_sizes, max_pep=max_pep
    )
    # 2. Calculate nodes, components, and so on
    all_cand_groups, _ = create_groups_cts.grid_peps_to_cand_groups(
        peps, max_blip_size=max_blip_size
    )
    # 3. Run BLiP on each component
    all_rej = []
    for cand_groups in all_cand_groups:
        rej = BLiP(
            cand_groups=cand_groups,
            weight_fn=weight_fn,
            max_pep=max_pep,
            **kwargs,
        )
        all_rej.extend(rej)
    # 4. Renormalize locations. Radii are lengths, so they are
    # scaled but not shifted.
    for cand_group in all_rej:
        center = np.zeros(d)
        radii = cand_group.data.pop('radius') * scales
        for k in range(d):
            center[k] = cand_group.data.pop(f'dim{k}') * scales[k] + shifts[k]
        cand_group.data['center'] = center
        cand_group.data['radii'] = radii
    return all_rej


def binarize_selections(
    cand_groups,
    q,
    error,
    deterministic,
    problem_status=None,
    return_problem_status=False,
    tol=1e-3,
    nsample=10,
    verbose=False,
):
"""
Parameters
----------
cand_groups : list
list of candidate groups.
q : float
Level at which to control the error rate.
error : string
The error to control: one of 'fdr', 'fwer', 'pfer', 'local_fdr'
deterministic : bool
If True, will not use a randomized solution.
nsample : int
Number of samples for randomized version.

    Returns
    -------
    detections : list
        List of candidate groups which have been detected.
    output = []
    if problem_status is None:
        problem_status = dict(backtracking_iter=0)
    if error in ['local_fdr', 'pfer', 'fwer']:
        cand_groups = create_groups._prefilter(cand_groups, max_pep=q)
    # Account for integer solutions
    nontriv_cand_groups = []
    for cand_group in cand_groups:
        if cand_group.data['sprob'] < tol:
            continue
        elif cand_group.data['sprob'] > 1 - tol:
            output.append(cand_group)
        else:
            nontriv_cand_groups.append(cand_group)
    # The easy cases...
    ngroups = len(nontriv_cand_groups)
    problem_status['ngroups_nonint'] = ngroups
    if ngroups == 0 or ngroups == 1:
        # if ngroups == 1:
        #     if not deterministic and np.random.uniform() < nontriv_cand_groups[0].data['sprob']:
        #         output.append(nontriv_cand_groups[0])
        if return_problem_status:
            return output, problem_status
        return output
    # Constraints to ensure selected groups are disjoint
    nontriv_cand_groups, nrel = create_groups._elim_redundant_features(nontriv_cand_groups)
    A = np.zeros((ngroups, nrel), dtype=bool)
    for gj, cand_group in enumerate(nontriv_cand_groups):
        for feature in cand_group.data['blip-group']:
            A[gj, feature] = 1
    if verbose:
        msg = f"LP had {ngroups} non-integer solutions across {nrel} locations."
        msg += f" Binarizing using deterministic={deterministic}."
        print(msg)
    # Sampling method
    if not deterministic:
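        # Randomized rounding: visit features in decreasing order of marginal
        # selection probability; at each feature, sample at most one remaining
        # group containing it, so each group's selection probability matches
        # its LP value ``sprob``. Repeat nsample times and keep the best.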
        sprobs = np.array([cg.data['sprob'] for cg in nontriv_cand_groups])
        # Sort features in order of marginal probability of selection
        marg_probs = np.zeros(nrel)
        for j in range(nrel):
            marg_probs[j] = sprobs[A[:, j] == 1].sum()
        inds = np.argsort(-1 * marg_probs)
        # Initialize
        expected_powers = np.zeros(nsample)
        all_outputs = []
        for ii in range(nsample):
            eliminated_groups = np.zeros(ngroups).astype(bool)
            selected_groups = []
            # Loop through features and sample
            for feature in inds:
                if np.all(eliminated_groups):
                    break
                # Subset of available groups which contain the feature
                available_flags = (A[:, feature] == 1) & (~eliminated_groups)
                if np.any(available_flags):
                    # Scale up conditional probabilities
                    prev_elim = (A[:, feature] == 1) & (eliminated_groups)
                    scale = 1 - sprobs[prev_elim].sum()
                    new_probs = sprobs[available_flags] / scale
                    # Select nothing with some probability
                    if np.random.uniform() <= 1 - new_probs.sum():
                        eliminated_groups[A[:, feature] == 1] = True
                        continue
                    # Else, select one of the groups containing the feature
                    selected_group = np.where(
                        np.random.multinomial(1, new_probs / new_probs.sum()) != 0
                    )[0][0]
                    selected_group = np.where(available_flags)[0][selected_group]
                    selected_groups.append(selected_group)
                    # Eliminate all groups that overlap the selected group
                    group_features = np.where(A[selected_group] == 1)[0]
                    new_elim_groups = np.sum(A[:, group_features], axis=1) != 0
                    eliminated_groups[new_elim_groups] = True
            output_ii = copy.deepcopy(output)
            output_ii.extend([nontriv_cand_groups[sg] for sg in selected_groups])
            # For local FDR, no backtracking is required
            if error != 'local_fdr':
                # Iteratively eliminate the highest-PEP discovery to ensure FDR or PFER control
                if error == 'fdr':
                    haterror = np.mean([x.pep for x in output_ii])
                else:
                    haterror = np.sum([x.pep for x in output_ii])
                output_ii = sorted(output_ii, key=lambda x: x.pep)
                while haterror > q:
                    output_ii = output_ii[0:-1]
                    if len(output_ii) == 0:
                        break
                    elif error == 'fdr':
                        haterror = np.mean([x.pep for x in output_ii])
                    else:
                        haterror = np.sum([x.pep for x in output_ii])
            # Record the expected power of this rounding
            expected_powers[ii] = np.sum([
                (1 - x.pep) * x.data['weight'] for x in output_ii
            ])
            all_outputs.append(output_ii)
        output = all_outputs[np.argmax(expected_powers)]
    else:
        # Construct integer linear program
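        # Deterministic rounding: re-solve the selection problem as a small
        # ILP over only the non-integral groups, with the error budget reduced
        # to account for the discoveries already fixed in ``output``.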
        peps = np.array([cand_group.pep for cand_group in nontriv_cand_groups])
        weights = np.array([
            cand_group.data['weight'] for cand_group in nontriv_cand_groups
        ])
        # Assemble constraints, variables
        x = cp.Variable(ngroups, boolean=True)
        x.value = np.zeros(ngroups)
        b = np.ones(nrel)
        # Account for discoveries already made
        ndisc_out = len(output)
        v_output = sum([cg.pep for cg in output])
        # Construct problem
        constraints = [
            A.T @ x <= b
        ]
        objective = cp.Maximize(((1-peps) * weights) @ x)
        # PFER / FWER specific constraints
        # Note: when binary_search=True for FWER, all solutions
        # are rounded before entering this function.
        if error in ['pfer', 'fwer']:
            v_opt = q
            v_new = v_opt - v_output
            constraints += [
                x @ peps <= v_new
            ]
        # No backtracking required for PFER, FWER, or local FDR
        if error in ['pfer', 'fwer', 'local_fdr']:
            problem = cp.Problem(objective=objective, constraints=constraints)
            problem.solve(solver=BIN_SOLVER)
        # FDR may require backtracking
        elif error == 'fdr':
            # Sort current output by PEP so backtracking removes the worst group
            output = sorted(output, key=lambda x: x.pep)
            # Iteratively try to solve the problem, backtracking if infeasible
            # (backtracking is extremely rare)
            while len(output) >= 0:
                v_var = cp.Variable(pos=True)
                constraints_fdr = constraints + [
                    x @ peps <= v_var,
                    cp.sum(x) >= (v_var + v_output) / q - ndisc_out
                ]
                problem = cp.Problem(objective=objective, constraints=constraints_fdr)
                try:
                    problem.solve(solver='GLPK_MI')
                except cp.error.SolverError:
                    problem.solve(solver=BIN_SOLVER)
                if problem.status != 'infeasible':
                    break
                else:
                    # This should never be triggered (it is mathematically impossible)
                    if len(output) == 0:
                        raise RuntimeError(
                            "Backtracking for FDR control failed"
                        )
                    # Backtrack by removing the last (highest-PEP) group
                    problem_status['backtracking_iter'] += 1
                    if verbose:
                        print(f"Starting backtracking iter={problem_status['backtracking_iter']}")
                    v_output -= output[-1].pep
                    ndisc_out -= 1
                    output = output[0:-1]
        if x.value is None:
            print(f"status={problem.status}, nontriv_cand_groups={nontriv_cand_groups}")
        for binprob, cand_group in zip(x.value, nontriv_cand_groups):
            if binprob > 1 - tol:
                output.append(cand_group)
    if return_problem_status:
        return output, problem_status
    return output