# Package InversionTest :: Module MonteCarloSampler
# (module header reconstructed from extraction residue)
import itertools
from math import ceil, floor
from random import shuffle

from InversionTest.Constants import (GREATER_THAN_HYPOTHESIS, LESS_THAN_HYPOTHESIS,
                                     TWO_SIDED_HYPOTHESIS, TEST_HYPOTHESES)
6
8 """ A Monte Carlo Estimator """
9
11 """
12 Initialize the estimator
13 @param maxSamples: Maximum number of samples to run for an estimate
14 @type maxSamples: int
15 """
16 self._maxSamples = maxSamples
17 self._samples = 0
18 self._estimate = None
19 self._hasEstimate = False
20
22 """ Start the Monte Carlo estimate """
23 self._hasEstimate = False
24 self._estimate = None
25 self._samples = 0
26
28 """
29 Update support based on the latest statistic value
30 @param statistic: Value of the statistic for the most recent sample
31 @type statistic: object
32 """
33 pass
34
36 """ Finalize the estimate """
37 self._hasEstimate = True
38
40 """
41 Get the function that calculates a statistic from a sample
42 @return: Function in form f(sample): return statistic
43 @rtype: callable
44 """
45 raise NotImplementedError
46
48 """
49 Return the sample function, which generates random samples
50 @return: Sample generating function, in the form f(): return sample
51 @rtype: callable
52 """
53 raise NotImplementedError
54
56 """
57 Add a maximum of n samples to the estimate
58 @param n: Maximum number of samples to add
59 @type n: int
60 @param finalize: If True, finalize estimate value after adding samples
61 @type finalize: bool
62 """
63 calcStatistic = self.getStatisticFunction()
64 getSample = self.getSampleFunction()
65 updateSupport = self._updateSupport
66 for i in xrange(n):
67 sample = getSample()
68 statistic = calcStatistic(sample)
69 updateSupport(statistic)
70 self._samples += 1
71 if finalize:
72 self._finalizeEstimate()
73
75 """
76 Get the value of the Monte Carlo estimate.
77 If no estimate available, calculate an estimate
78 @param update: If True, force re-calculation of the estimate.
79 @type update: bool
80 @return: Monte Carlo estimate
81 @rtype: object
82 """
83 if not self._hasEstimate or update:
84 self.updateEstimate()
85 return self._estimate
86
88 """
89 Return the number of samples used in the estimate so far
90 @return: Number of samples drawn for the Monte Carlo estimate
91 @rtype: int
92 """
93 return self._samples
94
100
101
103 """ Monte Carlo Bernoulli Hypothesis Test """
104 VERBOSE = True
105
109 """
110 Initialize the estimator
111 @param sample1: First sample of data points
112 @type sample1: list of object
113 @param sample2: Second sample of data points
114 @type sample2: list of object
115 @param testStatistic: Statistic function to run on each sample,
116 in the form f(sample): return statistic
117 @type testStatistic: callable
118 @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
119 LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
120 @type alternative: str
121 @param maxSamples: Maximum number of samples to run for an estimate
122 @type maxSamples: int
123 """
124 super(MCHypothesisTest, self).__init__(maxSamples)
125 if len(sample1) == 0 or len(sample2) == 0:
126 raise ValueError("MC Hypothesis Error: Both samples must have at least one element")
127 if alternative not in TEST_HYPOTHESES:
128 raise ValueError("MC Hypothesis Error: alternative was not a valid test hypothesis, got '%s', need %s"%(alternative, TEST_HYPOTHESES))
129 if maxSamples <= 0:
130 raise ValueError("MC Hypothesis Error: maxSamples was less than 1")
131 elif maxSamples < 1000:
132 if self.VERBOSE == True:
133 print "Warning: Maximum samples for Monte Carlo was very low, got: %s"%(maxSamples,)
134
135 self._sample1 = sample1
136 self._sample2 = sample2
137 self._testStatistic = testStatistic
138 self._alternative = alternative
139 self._count = 0
140 self._tieCount = 0
141 self._combinedSample = list(sample1) + list(sample2)
142
148
150 """
151 Update the count of samples where the alternate hypothesis was confirmed
152 @param statistic: If sample confirms alternate value is 1, else value is 0
153 @type statistic: int
154 """
155 if self._alternative == TWO_SIDED_HYPOTHESIS:
156 if statistic > 0:
157 self._count += 1
158 elif statistic == 0:
159 self._tieCount += 1
160 else:
161 self._count += statistic
162
164 """
165 Finalize the estimate, which is the # positive samples divided by total samples
166 """
167 super(MCHypothesisTest, self)._finalizeEstimate()
168 if self._samples == 0:
169 self._estimate = None
170 else:
171 if self._alternative == TWO_SIDED_HYPOTHESIS:
172 cValue = self._count/float(self._samples)
173 tValue = self._tieCount/float(self._samples)
174 val = 2.0*min(cValue + tValue, 1 - cValue)
175 else:
176 val = self._count/float(self._samples)
177 self._estimate = min(1.0, val)
178
180 """
181 Return the sample function, which generates random samples
182 @return: Sample generating function, in the form f(): return sample
183 @rtype: callable
184 """
185 return self._sampleFunction
186
188 """
189 Sample function for the estimator. This shuffles the combined
190 sample of sample1 + sample2, returning a random permutation.
191 @return: Random permutation of the combined sample set
192 @rtype: list of object
193 """
194 shuffle(self._combinedSample)
195 return self._combinedSample
196
198 """
199 Get the statistic function for the estimator.
200 This function calculates an indicator variable based on a sample
201 as generated by getSampleFunction.
202 @return: Statistic function in form f(sample): return bool
203 @rtype: callable
204 """
205 alternative = self._alternative
206 testStatistic = self._testStatistic
207 vObs = testStatistic(self._sample1, self._sample2)
208 n1 = len(self._sample1)
209 if alternative == GREATER_THAN_HYPOTHESIS:
210 def anIndicatorFunction(sample):
211 return int(testStatistic(sample[:n1], sample[n1:]) <= vObs)
212 elif alternative == LESS_THAN_HYPOTHESIS:
213 def anIndicatorFunction(sample):
214 return int(testStatistic(sample[:n1], sample[n1:]) >= vObs)
215 else:
216 def anIndicatorFunction(sample):
217 val = testStatistic(sample[:n1],sample[n1:])
218 if val > vObs:
219 return 1
220 elif val < vObs:
221 return -1
222 else:
223 return 0
224 return anIndicatorFunction
225
226
228 """
229 Hypothesis test for the difference between means
230 Terminates after a fixed number of samples
231 """
232
236 """
237 Initialize the estimator
238 @param sample1: First sample of data points
239 @type sample1: list of object
240 @param sample2: Second sample of data points
241 @type sample2: list of object
242 @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
243 LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
244 @type alternative: str
245 @param maxSamples: Maximum number of samples to run for an estimate
246 @type maxSamples: int
247 """
248 super(MCMeanDifferenceTest, self).__init__(sample1, sample2,
249 meanDifference,
250 alternative,
251 maxSamples)
252
253
255 """
256 Hypothesis test for the difference between means
257 Terminates after a fixed number of samples
258 """
259
263 """
264 Initialize the estimator
265 Before initializing, convert data points to their ranks
266 @param sample1: First sample of data points
267 @type sample1: list of object
268 @param sample2: Second sample of data points
269 @type sample2: list of object
270 @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
271 LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
272 @type alternative: str
273 @param maxSamples: Maximum number of samples to run for an estimate
274 @type maxSamples: int
275 """
276 sample1, sample2 = convertSamplesToRanks(sample1, sample2)
277 super(MCWilcoxonRankTest, self).__init__(sample1, sample2,
278 wilcoxonRankSum,
279 alternative,
280 maxSamples)
281
282
284 """
285 Hypothesis test based on the MCB stopping rule (Kim 2010)
286 Terminates when it hits bounds or when the maximal samples
287 have been hit. Estimates within approximately 0.001 at the
288 bounds for alpha=0.95 or alpha=0.99.
289
290 Upper Bounds:
291 r_1 = pValue*(maxSamples+1)
292 c1*((m*p*(1-p))**0.5) + n*p
293
294 Lower Bounds:
295 r_0 = (1-pValue)*(maxSamples+1))
296 c2*((m*p*(1-p))**0.5) + n*p
297
298 For clarity in code, c1 is referred to as upperBuffer and c2
299 is referred to as lowerBuffer. The default upper and lower
300 buffer sizes are taken from Kim (2010)
301 """
302 UPPER_BOUND_NAME = 'Upper'
303 LOWER_BOUND_NAME = 'Lower'
304 INNER_BOUND_NAME = 'Inner'
305 DEFAULT_P_VALUE = 0.05
306 DEFAULT_UPPER_BUFFER = 2.241
307 DEFAULT_LOWER_BUFFER = -DEFAULT_UPPER_BUFFER
308
314 """
315 Initialize the estimator
316 @param sample1: First sample of data points
317 @type sample1: list of object
318 @param sample2: Second sample of data points
319 @type sample2: list of object
320 @param testStatistic: Statistic function to run on each sample,
321 in the form f(sample): return statistic
322 @type testStatistic: callable
323 @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
324 LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
325 @type alternative: str
326 @param maxSamples: Maximum number of samples to run for an estimate
327 @type maxSamples: int
328 @param pValue: P-value (alpha) for the test, in (0,1) range
329 @type pValue: float
330 @param upperBuffer: Buffer width for upper bound (c1). Should be positive.
331 Higher values require more positives to terminate early.
332 @type upperBuffer: float
333 @param lowerBuffer: Buffer width for the lower bound (c2). Should be negative.
334 Higher absolute values require more negatives to terminate early.
335 @type lowerBuffer: float
336 """
337 super(MCBStoppingRuleTest, self).__init__(sample1, sample2,
338 testStatistic,
339 alternative,
340 maxSamples)
341 if alternative == TWO_SIDED_HYPOTHESIS:
342
343 pValue = max(pValue, 1.0-pValue)
344 self._pValue = pValue
345 self._upperBuffer = upperBuffer
346 self._lowerBuffer = lowerBuffer
347 self._boundHit = None
348
350 """ Update which bound has been hit, if any """
351 value = self._count
352 tieCount = self._tieCount
353 p = self._pValue
354 m = self._maxSamples
355 n = self._samples
356 self._boundHit = self._calculateBoundHit(value, tieCount, p, m, n,
357 self._alternative)
358
360 """
361 Calculate which bound has been hit, if any
362 @param value: The number of "hits" exceeding the observed value
363 @type value: int
364 @param tieCount: The number of ties equal to the observed value
365 @type tieCount: int
366 @param p: The p-value being tested, in [0, 1]
367 @type p: float
368 @param m: Max samples
369 @type m: int
370 @param n: Number of samples
371 @type n: int
372 @param alternative: Alternate hypothesis
373 @type alternative: str
374 @return: Bound hit
375 @rtype: str or None
376 """
377 if alternative == TWO_SIDED_HYPOTHESIS:
378
379 p = 1 - (1-p)/2.0
380 highValue = max(value+tieCount, n-value)
381 lowValue = min(value+tieCount, n-value)
382 uBound = self.getUpperBound(p, m, n)
383 lBound = self.getLowerBound(p, m, n)
384 uBound2 = self.getUpperBound(1.0-p, m, n)
385 if uBound is not None and highValue >= uBound:
386 return self.UPPER_BOUND_NAME
387 elif (lBound is not None and uBound2 is not None and
388 highValue <= lBound and
389 lowValue > uBound2):
390 return self.INNER_BOUND_NAME
391 else:
392 return None
393 else:
394 uBound = self.getUpperBound(p, m, n)
395 lBound = self.getLowerBound(p, m, n)
396 if uBound is not None and value >= uBound:
397 return self.UPPER_BOUND_NAME
398 elif lBound is not None and value <= lBound:
399 return self.LOWER_BOUND_NAME
400 else:
401 return None
402
407
409 """
410 Add a maximum of n samples to the estimate. If any
411 termination bound hit, stop sampling and return.
412 @param n: Maximum number of samples to add
413 @type n: int
414 @param finalize: If True, finalize estimate value after adding samples
415 @type finalize: bool
416 """
417 calcStatistic = self.getStatisticFunction()
418 getSample = self.getSampleFunction()
419 updateSupport = self._updateSupport
420 for i in xrange(n):
421 sample = getSample()
422 statistic = calcStatistic(sample)
423 updateSupport(statistic)
424 self._samples += 1
425 if self.isBoundHit(update=True):
426 break
427 if finalize:
428 self._finalizeEstimate()
429
431 """
432 Check if upper or lower bound has been hit.
433 @param update: If True, update the stored value before returning
434 @type update: bool
435 @return: True if either bound hit, else False
436 @rtype: bool
437 """
438 if update:
439 self._updateBoundHit()
440 return self._boundHit is not None
441
443 """
444 Get the boundary that has been crossed, if any.
445 @param update: If True, update the stored value before returning
446 @type update: bool
447 @return: Return UPPER_BOUND_NAME if crossed the upper bound,
448 return LOWER_BOUND_NAME if crossed the lower bound,
449 return None if neither crossed
450 @rtype: str
451 """
452 if update:
453 self._updateBoundHit()
454 return self._boundHit
455
457 """
458 Return the active lower bound for values
459 @param p: Probability value for the test, in [0,1]
460 @type p: float
461 @param m: Maximum samples for the test
462 @type m: int
463 @param n: Number of samples so far in the test
464 @param n: int
465 @return: Lower bound value
466 @rtype: int
467 """
468 bound = self._lowerBuffer*((m*p*(1-p))**0.5) + n*p
469 r0 = (1-p)*(m+1)
470 bound = max(floor(bound), floor(n-r0))
471 if bound >= 0:
472 return bound
473 else:
474 return None
475
477 """
478 Return the active upper bound value
479 @param p: Probability value for the test, in [0,1]
480 @type p: float
481 @param m: Maximum samples for the test
482 @type m: int
483 @param n: Number of samples so far in the test
484 @param n: int
485 @return: Upper bound value
486 @rtype: int
487 """
488 bound = self._upperBuffer*((m*p*(1-p))**0.5) + n*p
489 r1 = p*(m+1)
490 bound = min(ceil(bound), ceil(r1))
491 if bound >= 0:
492 return bound
493 else:
494 return None
495
497 """ Hypothesis test for the difference between means (e.g., like a t-Test) """
498 DEFAULT_P_VALUE = MCBStoppingRuleTest.DEFAULT_P_VALUE
499 DEFAULT_UPPER_BUFFER = MCBStoppingRuleTest.DEFAULT_UPPER_BUFFER
500 DEFAULT_LOWER_BUFFER= MCBStoppingRuleTest.DEFAULT_LOWER_BUFFER
501
507 """
508 Initialize the estimator
509 @param sample1: First sample of data points
510 @type sample1: list of object
511 @param sample2: Second sample of data points
512 @type sample2: list of object
513 @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
514 LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
515 @type alternative: str
516 @param maxSamples: Maximum number of samples to run for an estimate
517 @type maxSamples: int
518 @param pValue: P-value (alpha) for the test, in (0,1) range
519 @type pValue: float
520 @param upperBuffer: Buffer width for upper bound (c1). Should be positive.
521 Higher values require more positives to terminate early.
522 @type upperBuffer: float
523 @param lowerBuffer: Buffer width for the lower bound (c2). Should be negative.
524 Higher absolute values require more negatives to terminate early.
525 @type lowerBuffer: float
526 """
527 super(MCBMeanDifferenceTest, self).__init__(sample1, sample2,
528 meanDifference,
529 alternative, maxSamples,
530 pValue, upperBuffer, lowerBuffer)
531
533 """ Wilcoxon MC hypothesis test for the difference between location """
534 DEFAULT_P_VALUE = MCBStoppingRuleTest.DEFAULT_P_VALUE
535 DEFAULT_UPPER_BUFFER = MCBStoppingRuleTest.DEFAULT_UPPER_BUFFER
536 DEFAULT_LOWER_BUFFER = MCBStoppingRuleTest.DEFAULT_LOWER_BUFFER
537
543 """
544 Initialize the estimator
545 @param sample1: First sample of data points
546 @type sample1: list of object
547 @param sample2: Second sample of data points
548 @type sample2: list of object
549 @param alternative: Alternate hypothesis, from the set GREATER_THAN_HYPOTHESIS,
550 LESS_THAN_HYPOTHESIS, TWO_SIDED_HYPOTHESIS
551 @type alternative: str
552 @param maxSamples: Maximum number of samples to run for an estimate
553 @type maxSamples: int
554 @param pValue: P-value (alpha) for the test, in (0,1) range
555 @type pValue: float
556 @param upperBuffer: Buffer width for upper bound (c1). Should be positive.
557 Higher values require more positives to terminate early.
558 @type upperBuffer: float
559 @param lowerBuffer: Buffer width for the lower bound (c2). Should be negative.
560 Higher absolute values require more negatives to terminate early.
561 @type lowerBuffer: float
562 """
563 sample1, sample2 = convertSamplesToRanks(sample1, sample2)
564 super(MCBWilcoxonRankTest, self).__init__(sample1, sample2,
565 wilcoxonRankSum,
566 alternative, maxSamples,
567 pValue, upperBuffer, lowerBuffer)
568
569
570
572 """
573 Difference of means between x and y, i.e. avg(y) - avg(x)
574 If either sample has zero elements, this raises an error
575 @param x: First sample
576 @type x: list of float
577 @param y: Second sample
578 @type y: list of float
579 @return: Mean of y minus mean of x
580 @rtype: float
581 """
582 return sum(y)/float(len(y)) - sum(x)/float(len(x))
583
585 """
586 Sum of ranks of y
587 If either sample has zero elements, this raises an error
588 @param x: First sample
589 @type x: list of float
590 @param y: Second sample
591 @type y: list of float
592 @return: Sum of ranks of y
593 @rtype: float
594 """
595 return sum(y)
596
597
599 """
600 Convert two sets of samples into their equivalent ranks
601 For ranks that share the same value, average the range of
602 ranks that they would hold.
603 @param x: First sample
604 @type x: list of float
605 @param y: Second sample
606 @type y: list of float
607 @return: Tuple of (sample of x-ranks, sample of y-ranks)
608 @rtype: list of float, list of float
609 """
610 dataRanks = [(val, 0) for val in x] + [(val, 1) for val in y]
611 dataRanks.sort()
612 newX = []
613 newY = []
614 n = 0
615 extractGroupByValue = lambda x: x[0]
616 for val, group in itertools.groupby(dataRanks, extractGroupByValue):
617 group = list(group)
618 numItems = len(group)
619 rank = n + (numItems-1)/2.0
620 for val, sampleId in group:
621 if sampleId == 0:
622 newX.append(rank)
623 else:
624 newY.append(rank)
625 n += numItems
626 return newX, newY
627