Package InversionTest :: Module MonteCarloSampler

Source Code for Module InversionTest.MonteCarloSampler

import itertools
from math import ceil, floor
from random import shuffle
from InversionTest.Constants import (GREATER_THAN_HYPOTHESIS, LESS_THAN_HYPOTHESIS,
                                     TWO_SIDED_HYPOTHESIS, TEST_HYPOTHESES)

class MonteCarloEstimator(object):
    """ A Monte Carlo Estimator """

    def __init__(self, maxSamples=10000):
        """
        Initialize the estimator
        @param maxSamples: Maximum number of samples to run for an estimate
        @type maxSamples: int
        """
        self._maxSamples = maxSamples
        self._samples = 0
        self._estimate = None
        self._hasEstimate = False

    def _startEstimate(self):
        """ Start the Monte Carlo estimate """
        self._hasEstimate = False
        self._estimate = None
        self._samples = 0

    def _updateSupport(self, statistic):
        """
        Update support based on the latest statistic value
        @param statistic: Value of the statistic for the most recent sample
        @type statistic: object
        """
        pass

    def _finalizeEstimate(self):
        """ Finalize the estimate """
        self._hasEstimate = True

    def getStatisticFunction(self):
        """
        Get the function that calculates a statistic from a sample
        @return: Function in the form f(sample): return statistic
        @rtype: callable
        """
        raise NotImplementedError

    def getSampleFunction(self):
        """
        Return the sample function, which generates random samples
        @return: Sample generating function, in the form f(): return sample
        @rtype: callable
        """
        raise NotImplementedError

    def addSamples(self, n, finalize=True):
        """
        Add a maximum of n samples to the estimate
        @param n: Maximum number of samples to add
        @type n: int
        @param finalize: If True, finalize the estimate value after adding samples
        @type finalize: bool
        """
        calcStatistic = self.getStatisticFunction()
        getSample = self.getSampleFunction()
        updateSupport = self._updateSupport
        for i in xrange(n):
            sample = getSample()
            statistic = calcStatistic(sample)
            updateSupport(statistic)
            self._samples += 1
        if finalize:
            self._finalizeEstimate()

    def getEstimate(self, update=False):
        """
        Get the value of the Monte Carlo estimate.
        If no estimate is available, calculate one.
        @param update: If True, force re-calculation of the estimate.
        @type update: bool
        @return: Monte Carlo estimate
        @rtype: object
        """
        if not self._hasEstimate or update:
            self.updateEstimate()
        return self._estimate

    def getNumSamples(self):
        """
        Return the number of samples used in the estimate so far
        @return: Number of samples drawn for the Monte Carlo estimate
        @rtype: int
        """
        return self._samples

    def updateEstimate(self):
        """ Calculate and store the Monte Carlo estimate """
        self._startEstimate()
        self.addSamples(self._maxSamples, False)
        self._finalizeEstimate()

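The base class above only defines the sampling loop; subclasses supply the sample generator, the statistic, and the bookkeeping hooks. As a minimal sketch of how those hooks fit together (DiceSumEstimator is an editor's illustration, not part of this module), the following subclass estimates the probability that two dice sum to more than 7:

# Illustrative sketch only; DiceSumEstimator is not part of InversionTest.
from random import randint

class DiceSumEstimator(MonteCarloEstimator):
    """ Estimates the probability that the sum of two dice exceeds 7 """

    def __init__(self, maxSamples=10000):
        super(DiceSumEstimator, self).__init__(maxSamples)
        self._hits = 0

    def _startEstimate(self):
        super(DiceSumEstimator, self)._startEstimate()
        self._hits = 0

    def getSampleFunction(self):
        # Each sample is a pair of independent die rolls
        return lambda: (randint(1, 6), randint(1, 6))

    def getStatisticFunction(self):
        # Indicator statistic: 1 if the event occurred, else 0
        return lambda sample: int(sample[0] + sample[1] > 7)

    def _updateSupport(self, statistic):
        self._hits += statistic

    def _finalizeEstimate(self):
        super(DiceSumEstimator, self)._finalizeEstimate()
        if self._samples > 0:
            self._estimate = self._hits / float(self._samples)

estimator = DiceSumEstimator(maxSamples=5000)
print(estimator.getEstimate())   # roughly 0.417 (the exact value is 15/36)
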
class MCHypothesisTest(MonteCarloEstimator):
    """ Monte Carlo Bernoulli Hypothesis Test """
    VERBOSE = True

    def __init__(self, sample1, sample2, testStatistic,
                 alternative=TWO_SIDED_HYPOTHESIS,
                 maxSamples=10000):
        """
        Initialize the estimator
        @param sample1: First sample of data points
        @type sample1: list of object
        @param sample2: Second sample of data points
        @type sample2: list of object
        @param testStatistic: Statistic function to run on each pair of samples,
            in the form f(sample1, sample2): return statistic
        @type testStatistic: callable
        @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
            LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
        @type alternative: str
        @param maxSamples: Maximum number of samples to run for an estimate
        @type maxSamples: int
        """
        super(MCHypothesisTest, self).__init__(maxSamples)
        if len(sample1) == 0 or len(sample2) == 0:
            raise ValueError("MC Hypothesis Error: Both samples must have at least one element")
        if alternative not in TEST_HYPOTHESES:
            raise ValueError("MC Hypothesis Error: alternative was not a valid test hypothesis, got '%s', need %s"%(alternative, TEST_HYPOTHESES))
        if maxSamples <= 0:
            raise ValueError("MC Hypothesis Error: maxSamples was less than 1")
        elif maxSamples < 1000:
            if self.VERBOSE:
                print "Warning: Maximum samples for Monte Carlo was very low, got: %s"%(maxSamples,)

        self._sample1 = sample1
        self._sample2 = sample2
        self._testStatistic = testStatistic
        self._alternative = alternative
        self._count = 0
        self._tieCount = 0  # Count ties for the two-tailed test only
        self._combinedSample = list(sample1) + list(sample2)

    def _startEstimate(self):
        """ Start the Monte Carlo estimate """
        super(MCHypothesisTest, self)._startEstimate()
        self._count = 0
        self._tieCount = 0

    def _updateSupport(self, statistic):
        """
        Update the count of samples where the alternate hypothesis was supported
        @param statistic: 1 if the permuted sample supports the alternate hypothesis, else 0
            (for the two-sided test: 1, 0, or -1 relative to the observed statistic)
        @type statistic: int
        """
        if self._alternative == TWO_SIDED_HYPOTHESIS:
            if statistic > 0:
                self._count += 1
            elif statistic == 0:
                self._tieCount += 1
        else:
            self._count += statistic

    def _finalizeEstimate(self):
        """
        Finalize the estimate, which is the number of supporting samples divided by the total samples
        """
        super(MCHypothesisTest, self)._finalizeEstimate()
        if self._samples == 0:
            self._estimate = None
        else:
            if self._alternative == TWO_SIDED_HYPOTHESIS:
                cValue = self._count/float(self._samples)
                tValue = self._tieCount/float(self._samples)
                val = 2.0*min(cValue + tValue, 1 - cValue)
            else:
                val = self._count/float(self._samples)
            self._estimate = min(1.0, val)

    def getSampleFunction(self):
        """
        Return the sample function, which generates random samples
        @return: Sample generating function, in the form f(): return sample
        @rtype: callable
        """
        return self._sampleFunction

    def _sampleFunction(self):
        """
        Sample function for the estimator. This shuffles the combined
        sample of sample1 + sample2, returning a random permutation.
        @return: Random permutation of the combined sample set
        @rtype: list of object
        """
        shuffle(self._combinedSample)
        return self._combinedSample

    def getStatisticFunction(self):
        """
        Get the statistic function for the estimator.
        This function calculates an indicator variable based on a sample
        as generated by getSampleFunction.
        @return: Statistic function in the form f(sample): return int
        @rtype: callable
        """
        alternative = self._alternative
        testStatistic = self._testStatistic
        vObs = testStatistic(self._sample1, self._sample2)
        n1 = len(self._sample1)
        if alternative == GREATER_THAN_HYPOTHESIS:
            def anIndicatorFunction(sample):
                return int(testStatistic(sample[:n1], sample[n1:]) <= vObs)
        elif alternative == LESS_THAN_HYPOTHESIS:
            def anIndicatorFunction(sample):
                return int(testStatistic(sample[:n1], sample[n1:]) >= vObs)
        else:
            def anIndicatorFunction(sample):
                val = testStatistic(sample[:n1], sample[n1:])
                if val > vObs:
                    return 1
                elif val < vObs:
                    return -1
                else:
                    return 0
        return anIndicatorFunction

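MCHypothesisTest accepts any two-sample statistic of the form f(sample1, sample2). A hedged usage sketch follows; medianDifference and the data below are illustrative only, not part of the module:

# Illustrative sketch only; medianDifference and the data are made up.
from random import gauss

def medianDifference(x, y):
    """ Crude median difference: middle element of sorted y minus middle element of sorted x """
    xs, ys = sorted(x), sorted(y)
    return ys[len(ys)//2] - xs[len(xs)//2]

groupA = [gauss(0.0, 1.0) for _ in range(30)]
groupB = [gauss(0.5, 1.0) for _ in range(30)]
test = MCHypothesisTest(groupA, groupB, medianDifference,
                        alternative=TWO_SIDED_HYPOTHESIS,
                        maxSamples=5000)
print(test.getEstimate())     # approximate two-sided permutation p-value
print(test.getNumSamples())   # number of permutations drawn (5000 here)
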
class MCMeanDifferenceTest(MCHypothesisTest):
    """
    Hypothesis test for the difference between means
    Terminates after a fixed number of samples
    """

    def __init__(self, sample1, sample2,
                 alternative=TWO_SIDED_HYPOTHESIS,
                 maxSamples=10000):
        """
        Initialize the estimator
        @param sample1: First sample of data points
        @type sample1: list of object
        @param sample2: Second sample of data points
        @type sample2: list of object
        @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
            LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
        @type alternative: str
        @param maxSamples: Maximum number of samples to run for an estimate
        @type maxSamples: int
        """
        super(MCMeanDifferenceTest, self).__init__(sample1, sample2,
                                                   meanDifference,
                                                   alternative,
                                                   maxSamples)

class MCWilcoxonRankTest(MCHypothesisTest):
    """
    Wilcoxon rank-sum hypothesis test for a difference in location
    Terminates after a fixed number of samples
    """

    def __init__(self, sample1, sample2,
                 alternative=TWO_SIDED_HYPOTHESIS,
                 maxSamples=10000):
        """
        Initialize the estimator
        Before initializing, convert data points to their ranks
        @param sample1: First sample of data points
        @type sample1: list of object
        @param sample2: Second sample of data points
        @type sample2: list of object
        @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
            LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
        @type alternative: str
        @param maxSamples: Maximum number of samples to run for an estimate
        @type maxSamples: int
        """
        sample1, sample2 = convertSamplesToRanks(sample1, sample2)
        super(MCWilcoxonRankTest, self).__init__(sample1, sample2,
                                                 wilcoxonRankSum,
                                                 alternative,
                                                 maxSamples)

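Both fixed-length tests share the same constructor. A usage sketch on made-up data (the values are illustrative only, and the p-values vary slightly from run to run):

# Illustrative sketch only; the data values are made up.
treatment = [12.1, 14.3, 15.0, 13.7, 16.2, 14.8]
control = [11.0, 12.5, 11.8, 13.1, 12.2, 11.4]

meanTest = MCMeanDifferenceTest(control, treatment,
                                alternative=TWO_SIDED_HYPOTHESIS,
                                maxSamples=20000)
rankTest = MCWilcoxonRankTest(control, treatment,
                              alternative=TWO_SIDED_HYPOTHESIS,
                              maxSamples=20000)
print(meanTest.getEstimate())   # permutation p-value based on the mean difference
print(rankTest.getEstimate())   # permutation p-value based on the rank sums
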
class MCBStoppingRuleTest(MCHypothesisTest):
    """
    Hypothesis test based on the MCB stopping rule (Kim 2010)
    Terminates when it hits a bound or when the maximum number of
    samples has been reached. Estimates are within approximately
    0.001 at the bounds for alpha=0.95 or alpha=0.99.

    Upper Bound (ceiling of the smaller of):
        r_1 = pValue*(maxSamples+1)
        c1*((m*p*(1-p))**0.5) + n*p

    Lower Bound (floor of the larger of):
        n - r_0, where r_0 = (1-pValue)*(maxSamples+1)
        c2*((m*p*(1-p))**0.5) + n*p

    For clarity in the code, c1 is referred to as upperBuffer and c2
    is referred to as lowerBuffer. The default upper and lower
    buffer sizes are taken from Kim (2010)
    """
    UPPER_BOUND_NAME = 'Upper'
    LOWER_BOUND_NAME = 'Lower'
    INNER_BOUND_NAME = 'Inner'
    DEFAULT_P_VALUE = 0.05
    DEFAULT_UPPER_BUFFER = 2.241
    DEFAULT_LOWER_BUFFER = -DEFAULT_UPPER_BUFFER

    def __init__(self, sample1, sample2, testStatistic,
                 alternative=TWO_SIDED_HYPOTHESIS,
                 maxSamples=10000, pValue=DEFAULT_P_VALUE,
                 upperBuffer=DEFAULT_UPPER_BUFFER,
                 lowerBuffer=DEFAULT_LOWER_BUFFER):
        """
        Initialize the estimator
        @param sample1: First sample of data points
        @type sample1: list of object
        @param sample2: Second sample of data points
        @type sample2: list of object
        @param testStatistic: Statistic function to run on each pair of samples,
            in the form f(sample1, sample2): return statistic
        @type testStatistic: callable
        @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
            LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
        @type alternative: str
        @param maxSamples: Maximum number of samples to run for an estimate
        @type maxSamples: int
        @param pValue: P-value (alpha) for the test, in (0,1) range
        @type pValue: float
        @param upperBuffer: Buffer width for the upper bound (c1). Should be positive.
            Higher values require more positives to terminate early.
        @type upperBuffer: float
        @param lowerBuffer: Buffer width for the lower bound (c2). Should be negative.
            Higher absolute values require more negatives to terminate early.
        @type lowerBuffer: float
        """
        super(MCBStoppingRuleTest, self).__init__(sample1, sample2,
                                                  testStatistic,
                                                  alternative,
                                                  maxSamples)
        if alternative == TWO_SIDED_HYPOTHESIS:
            # Store the p-value as the upper-tail probability
            pValue = max(pValue, 1.0-pValue)
        self._pValue = pValue
        self._upperBuffer = upperBuffer
        self._lowerBuffer = lowerBuffer
        self._boundHit = None

    def _updateBoundHit(self):
        """ Update which bound has been hit, if any """
        value = self._count
        tieCount = self._tieCount
        p = self._pValue
        m = self._maxSamples
        n = self._samples
        self._boundHit = self._calculateBoundHit(value, tieCount, p, m, n,
                                                 self._alternative)

    def _calculateBoundHit(self, value, tieCount, p, m, n, alternative):
        """
        Calculate which bound has been hit, if any
        @param value: The number of "hits" exceeding the observed value
        @type value: int
        @param tieCount: The number of ties equal to the observed value
        @type tieCount: int
        @param p: The p-value being tested, in [0, 1]
        @type p: float
        @param m: Max samples
        @type m: int
        @param n: Number of samples
        @type n: int
        @param alternative: Alternate hypothesis
        @type alternative: str
        @return: Bound hit
        @rtype: str or None
        """
        if alternative == TWO_SIDED_HYPOTHESIS:
            # Split the p-value across the two tails
            p = 1 - (1-p)/2.0
            highValue = max(value+tieCount, n-value)
            lowValue = min(value+tieCount, n-value)
            uBound = self.getUpperBound(p, m, n)
            lBound = self.getLowerBound(p, m, n)
            uBound2 = self.getUpperBound(1.0-p, m, n)
            if uBound is not None and highValue >= uBound:
                return self.UPPER_BOUND_NAME
            elif (lBound is not None and uBound2 is not None and
                  highValue <= lBound and
                  lowValue > uBound2):
                return self.INNER_BOUND_NAME
            else:
                return None
        else:
            uBound = self.getUpperBound(p, m, n)
            lBound = self.getLowerBound(p, m, n)
            if uBound is not None and value >= uBound:
                return self.UPPER_BOUND_NAME
            elif lBound is not None and value <= lBound:
                return self.LOWER_BOUND_NAME
            else:
                return None

    def _startEstimate(self):
        """ Start the Monte Carlo estimate """
        super(MCBStoppingRuleTest, self)._startEstimate()
        self._boundHit = None

    def addSamples(self, n, finalize=True):
        """
        Add a maximum of n samples to the estimate. If any
        termination bound is hit, stop sampling and return.
        @param n: Maximum number of samples to add
        @type n: int
        @param finalize: If True, finalize the estimate value after adding samples
        @type finalize: bool
        """
        calcStatistic = self.getStatisticFunction()
        getSample = self.getSampleFunction()
        updateSupport = self._updateSupport
        for i in xrange(n):
            sample = getSample()
            statistic = calcStatistic(sample)
            updateSupport(statistic)
            self._samples += 1
            if self.isBoundHit(update=True):
                break
        if finalize:
            self._finalizeEstimate()

    def isBoundHit(self, update=True):
        """
        Check if the upper or lower bound has been hit.
        @param update: If True, update the stored value before returning
        @type update: bool
        @return: True if either bound hit, else False
        @rtype: bool
        """
        if update:
            self._updateBoundHit()
        return self._boundHit is not None

    def getBoundHit(self, update=True):
        """
        Get the boundary that has been crossed, if any.
        @param update: If True, update the stored value before returning
        @type update: bool
        @return: UPPER_BOUND_NAME if the upper bound was crossed,
            LOWER_BOUND_NAME if the lower bound was crossed,
            INNER_BOUND_NAME for the two-sided inner region,
            or None if no bound was crossed
        @rtype: str or None
        """
        if update:
            self._updateBoundHit()
        return self._boundHit

    def getLowerBound(self, p, m, n):
        """
        Return the active lower bound for the hit count
        @param p: Probability value for the test, in [0,1]
        @type p: float
        @param m: Maximum samples for the test
        @type m: int
        @param n: Number of samples so far in the test
        @type n: int
        @return: Lower bound value, or None if no lower bound is active
        @rtype: int or None
        """
        bound = self._lowerBuffer*((m*p*(1-p))**0.5) + n*p
        r0 = (1-p)*(m+1)
        bound = max(floor(bound), floor(n-r0))
        if bound >= 0:
            return bound
        else:
            return None

    def getUpperBound(self, p, m, n):
        """
        Return the active upper bound for the hit count
        @param p: Probability value for the test, in [0,1]
        @type p: float
        @param m: Maximum samples for the test
        @type m: int
        @param n: Number of samples so far in the test
        @type n: int
        @return: Upper bound value, or None if no upper bound is active
        @rtype: int or None
        """
        bound = self._upperBuffer*((m*p*(1-p))**0.5) + n*p
        r1 = p*(m+1)
        bound = min(ceil(bound), ceil(r1))
        if bound >= 0:
            return bound
        else:
            return None

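To get a feel for the stopping rule, the sketch below plugs the default buffers into the two bound formulas for alpha=0.05 and maxSamples=10000, evaluated after 1000 permutations. This is an editor's arithmetic illustration of the formulas above, not output from the class:

# Illustrative arithmetic sketch of the bound formulas (not module code).
from math import ceil, floor

p, m, n = 0.05, 10000, 1000        # alpha, max samples, samples drawn so far
c1, c2 = 2.241, -2.241             # default upperBuffer and lowerBuffer

upper = min(ceil(c1*((m*p*(1-p))**0.5) + n*p), ceil(p*(m+1)))            # -> 99
lower = max(floor(c2*((m*p*(1-p))**0.5) + n*p), floor(n - (1-p)*(m+1)))  # -> 1

# Roughly speaking, a one-sided test can stop after these 1000 permutations once
# the hit count reaches 99 (the estimated p-value is clearly above alpha) or falls
# to 1 or below (clearly below alpha); otherwise sampling continues toward maxSamples.
print(upper)   # -> 99
print(lower)   # -> 1
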
class MCBMeanDifferenceTest(MCBStoppingRuleTest):
    """ Hypothesis test for the difference between means (e.g., like a t-Test) """
    DEFAULT_P_VALUE = MCBStoppingRuleTest.DEFAULT_P_VALUE
    DEFAULT_UPPER_BUFFER = MCBStoppingRuleTest.DEFAULT_UPPER_BUFFER
    DEFAULT_LOWER_BUFFER = MCBStoppingRuleTest.DEFAULT_LOWER_BUFFER

    def __init__(self, sample1, sample2,
                 alternative=TWO_SIDED_HYPOTHESIS,
                 maxSamples=10000, pValue=DEFAULT_P_VALUE,
                 upperBuffer=DEFAULT_UPPER_BUFFER,
                 lowerBuffer=DEFAULT_LOWER_BUFFER):
        """
        Initialize the estimator
        @param sample1: First sample of data points
        @type sample1: list of object
        @param sample2: Second sample of data points
        @type sample2: list of object
        @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
            LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
        @type alternative: str
        @param maxSamples: Maximum number of samples to run for an estimate
        @type maxSamples: int
        @param pValue: P-value (alpha) for the test, in (0,1) range
        @type pValue: float
        @param upperBuffer: Buffer width for upper bound (c1). Should be positive.
            Higher values require more positives to terminate early.
        @type upperBuffer: float
        @param lowerBuffer: Buffer width for the lower bound (c2). Should be negative.
            Higher absolute values require more negatives to terminate early.
        @type lowerBuffer: float
        """
        super(MCBMeanDifferenceTest, self).__init__(sample1, sample2,
                                                    meanDifference,
                                                    alternative, maxSamples,
                                                    pValue, upperBuffer, lowerBuffer)

class MCBWilcoxonRankTest(MCBStoppingRuleTest):
    """ Wilcoxon MC hypothesis test for a difference in location """
    DEFAULT_P_VALUE = MCBStoppingRuleTest.DEFAULT_P_VALUE
    DEFAULT_UPPER_BUFFER = MCBStoppingRuleTest.DEFAULT_UPPER_BUFFER
    DEFAULT_LOWER_BUFFER = MCBStoppingRuleTest.DEFAULT_LOWER_BUFFER

    def __init__(self, sample1, sample2,
                 alternative=TWO_SIDED_HYPOTHESIS,
                 maxSamples=10000, pValue=DEFAULT_P_VALUE,
                 upperBuffer=DEFAULT_UPPER_BUFFER,
                 lowerBuffer=DEFAULT_LOWER_BUFFER):
        """
        Initialize the estimator
        Before initializing, convert data points to their ranks
        @param sample1: First sample of data points
        @type sample1: list of object
        @param sample2: Second sample of data points
        @type sample2: list of object
        @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
            LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
        @type alternative: str
        @param maxSamples: Maximum number of samples to run for an estimate
        @type maxSamples: int
        @param pValue: P-value (alpha) for the test, in (0,1) range
        @type pValue: float
        @param upperBuffer: Buffer width for upper bound (c1). Should be positive.
            Higher values require more positives to terminate early.
        @type upperBuffer: float
        @param lowerBuffer: Buffer width for the lower bound (c2). Should be negative.
            Higher absolute values require more negatives to terminate early.
        @type lowerBuffer: float
        """
        sample1, sample2 = convertSamplesToRanks(sample1, sample2)
        super(MCBWilcoxonRankTest, self).__init__(sample1, sample2,
                                                  wilcoxonRankSum,
                                                  alternative, maxSamples,
                                                  pValue, upperBuffer, lowerBuffer)
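
A usage sketch for the stopping-rule tests (the data are made up; the stopping point and the bound reported vary from run to run):

# Illustrative sketch only; the data values are made up and results vary by run.
groupA = [2.9, 3.4, 3.1, 3.8, 3.3, 3.6, 3.0, 3.5]
groupB = [4.6, 5.1, 4.8, 5.4, 4.9, 5.2, 4.7, 5.0]

test = MCBMeanDifferenceTest(groupA, groupB,
                             alternative=TWO_SIDED_HYPOTHESIS,
                             maxSamples=100000, pValue=0.05)
print(test.getEstimate())     # approximate p-value (small for groups this far apart)
print(test.getBoundHit())     # which termination bound was crossed, or None
print(test.getNumSamples())   # usually far fewer than maxSamples permutations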


# Basic Statistics
def meanDifference(x, y):
    """
    Difference of means between x and y, i.e. avg(y) - avg(x)
    If either sample has zero elements, this raises an error
    @param x: First sample
    @type x: list of float
    @param y: Second sample
    @type y: list of float
    @return: Mean of y minus mean of x
    @rtype: float
    """
    return sum(y)/float(len(y)) - sum(x)/float(len(x))

def wilcoxonRankSum(x, y):
    """
    Sum of the ranks of y. The first argument is unused; it is accepted
    so the signature matches the other two-sample test statistics.
    @param x: First sample (ignored)
    @type x: list of float
    @param y: Second sample, already converted to ranks
    @type y: list of float
    @return: Sum of ranks of y
    @rtype: float
    """
    return sum(y)

# Utility Functions
def convertSamplesToRanks(x, y):
    """
    Convert two sets of samples into their equivalent ranks
    For ranks that share the same value, average the range of
    ranks that they would hold.
    @param x: First sample
    @type x: list of float
    @param y: Second sample
    @type y: list of float
    @return: Tuple of (sample of x-ranks, sample of y-ranks)
    @rtype: list of float, list of float
    """
    dataRanks = [(val, 0) for val in x] + [(val, 1) for val in y]
    dataRanks.sort()
    newX = []
    newY = []
    n = 0
    extractGroupByValue = lambda pair: pair[0]
    for val, group in itertools.groupby(dataRanks, extractGroupByValue):
        group = list(group)
        numItems = len(group)
        # Tied values share the average of the (zero-based) ranks they span
        rank = n + (numItems-1)/2.0
        for val, sampleId in group:
            if sampleId == 0:
                newX.append(rank)
            else:
                newY.append(rank)
        n += numItems
    return newX, newY
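
A small worked example of the rank conversion with a tie (the values are illustrative only); note that the ranks produced are zero-based:

# Illustrative sketch only; the values are made up.
x = [1.0, 2.0, 2.0]
y = [2.0, 3.0]
ranksX, ranksY = convertSamplesToRanks(x, y)
print(ranksX)                            # [0.0, 2.0, 2.0] - the tied 2.0 values share rank (1+2+3)/3
print(ranksY)                            # [2.0, 4.0]
print(wilcoxonRankSum(ranksX, ranksY))   # 6.0, the sum of the second sample's ranks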