Package InversionTest :: Module InversionAnalysis
[hide private]
[frames] | no frames]

Source Code for Module InversionTest.InversionAnalysis

   1  """ 
   2  Main module for inversion counting, hypothesis tests, and 
   3  sequence transformations.  The exposed API of the package 
   4  relies on the implementations contained in this module. 
   5   
   6  Author: Benjamin D. Nye 
   7  License: Apache License V2.0 
   8  """ 
   9  from itertools import groupby 
  10  from InversionTest.InversionDistribution import (maxInversions, varianceOfInversions, 
  11      similarityByInversions, correlationByInversions, rawInversionCounts) 
  12  from InversionTest.PolynomialOperations import polynomialExpansion, syntheticDivision 
  13  from InversionTest.StatisticalTests import (normalCDFFunction, signTest, wilcoxonSignedRankTest, 
  14      permutationMeanTest, permutationRankTest, GREATER_THAN_HYPOTHESIS) 
15 16 17 # Error Classes 18 -class InvalidGradedPosetError(IndexError): pass
19 -class MissingElementError(IndexError): pass
20 21 # Types of CDF Calculations and Measures 22 EXACT_CDF = 'exact' 23 NORMAL_CDF = 'normal' 24 ADAPTIVE_CDF = 'adaptive' 25 SIMILARITY_METRIC = 'similarity' 26 CDF_TYPES = frozenset([EXACT_CDF, NORMAL_CDF, ADAPTIVE_CDF]) 27 28 # Types of Default Grades 29 LEFT_GRADE = 'Left' 30 ZERO_GRADE = 'Zero' 31 MEAN_GRADE = 'Mean' 32 MAX_GRADE = 'Max' 33 RIGHT_GRADE = 'Right' 34 DEFAULT_GRADES = frozenset([LEFT_GRADE, ZERO_GRADE, MEAN_GRADE, MAX_GRADE, RIGHT_GRADE]) 35 36 VALID_GRADE_CONTAINERS = (list, tuple, set)
37 38 -class GradedPosetSequence(object):
39 """ 40 An analysis of the inversion distribution of a graded poset 41 This can be used to analyze the reference sequence against 42 a sorted version of itself or against permutations of these 43 elements in a sequence passed in (which may be graded or not). 44 Note: When comparing two sequences, it should be noted that 45 these inversion measures are symmetric with respect to which 46 sequence is considered the reference versus the comparison. 47 48 Graded sequences are represented through nested lists, where 49 each subsequence contains the elements of a grade. So then, 50 the sequence: [[A,B,C], [1,2,3], [X,Y]] contains three grades, 51 where A, B, and C have rank 1, [1,2,3] have rank 2, etc. Elements 52 that share a grade cannot have inversions between them. The 53 elements can be any valid Python objects that have valid comparison 54 operations (including custom class instance). However, if any 55 of the elements cannot be hashed, the hashableElements flag must 56 be set false or problems will occur. 57 58 Note: Technically, while this code refers to ``permutations'', two 59 sequences with the same elements but different grade structures 60 are not technically permutations (though two with the same grade 61 structure, regardless of structure, are certainly permutations). 62 However, they are VERY close to permutations and may be considered 63 analogous to permutations where you can't see the order of certain 64 elements (i.e., censored observations). 65 """ 66 # Approximation Bound - N to use approximations for an adaptive calculation 67 APPROXIMATION_RANKS_BOUND = 100 68
69 - def __init__(self, sequence, isGraded=False, hashableElements=True, validate=False, defaultGrade=None):
70 """ 71 Initialize the reference sequence 72 @param sequence: Reference sequence, for comparing permutations against 73 @type sequence: list of object or list of list of object 74 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 75 @type isGraded: bool 76 @param hashableElements: If True, elements can be hashed (allowing slightly faster performance) 77 @type hashableElements: bool 78 @param validate: If True, run basic validation on the sequence inside 79 @type validate: bool 80 """ 81 self._sequence = sequence 82 self._isGraded = isGraded 83 self._hashableElements = hashableElements 84 if validate: 85 self.validate() 86 self._sortedRanks = False 87 self._hashRanks = None 88 self._numElements = self._calcNumElements(self._sequence, self._isGraded) 89 self._overloadedRanks = None 90 self._defaultGrade = defaultGrade
91
92 - def __len__(self):
93 """ 94 Get the number of grades in the sequence 95 @return: Number of grades 96 @rtype: int 97 """ 98 return len(self._sequence)
99
100 - def __iter__(self):
101 """ 102 Iterate over the grades/elements 103 @return: If nested, yield a grade (and all elements in it). If flat, yield an element. 104 @rtype: list of object or object 105 """ 106 for x in self._sequence: 107 yield x
108
109 - def iterElements(self):
110 """ 111 Iterate over the elements (flat iteration over grades) 112 @return: Yield all the elements in the poset 113 @rtype: list of object or object 114 """ 115 if self._isGraded: 116 for grade in self._sequence: 117 if isinstance(grade, VALID_GRADE_CONTAINERS): 118 for x in grade: 119 yield x 120 else: 121 yield grade 122 else: 123 for x in self._sequence: 124 yield x
125
126 - def isGraded(self):
127 """ 128 Return if this is a graded poset. If True, is graded. 129 """ 130 return self._isGraded
131
132 - def getNumElements(self):
133 """ 134 Get the number of elements in this sequence 135 @return: Number of elements 136 @rtype: int 137 """ 138 return self._numElements
139
140 - def getOverloadedRanks(self):
141 """ 142 Get the elements in grades with more than one element 143 @return: List of grades with their elements 144 @rtype: list of list of object 145 """ 146 if self._overloadedRanks is None: 147 self._overloadedRanks = self._calcOverloadedRanks(self._sequence, self._isGraded) 148 return self._overloadedRanks
149
150 - def maxInversions(self, sequence=None, isGraded=False):
151 """ 152 The maximum possible inversions, given the number of elements and 153 the number sharing each grade (e.g. ties). If sequence param is 154 given, the structure of the comparison sequence is considered also. 155 @param sequence: Comparison sequence 156 @type sequence: list of list of object or list of object or None 157 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 158 @type isGraded: bool 159 @return: Maximum possible number of inversions 160 @rtype: int 161 """ 162 return self._calcGradedStatistic(maxInversions, sequence, isGraded)
163
164 - def mean(self, sequence=None, isGraded=False):
165 """ 166 The mean of inversion across permutations, given the number of elements 167 and the number sharing each grade (e.g. ties). If sequence param is 168 given, the structure of the comparison sequence is considered also. 169 @param sequence: Comparison sequence 170 @type sequence: list of list of object or list of object or None 171 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 172 @type isGraded: bool 173 @return: Mean number of inversions across possible permutations 174 @rtype: int 175 """ 176 return self.maxInversions(sequence, isGraded)/2.0
177
178 - def variance(self, sequence=None, isGraded=False):
179 """ 180 The variance of inversions across permutations, given the number of elements 181 and the number sharing each grade (e.g. ties). If sequence param is 182 given, the structure of the comparison sequence is considered also. 183 @param sequence: Comparison sequence 184 @type sequence: list of list of object or list of object or None 185 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 186 @type isGraded: bool 187 @return: Variance of inversions across possible permutations 188 @rtype: int 189 """ 190 return self._calcGradedStatistic(varianceOfInversions, sequence, isGraded)
191
192 - def stdDev(self, sequence=None, isGraded=False):
193 """ 194 The standard deviation of inversions across permutations, given the elements 195 and the number sharing each grade (e.g. ties). If sequence param is not 196 given, the structure of the comparison sequence is considered also. 197 @param sequence: Comparison sequence 198 @type sequence: list of list of object or list of object or None 199 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 200 @type isGraded: bool 201 @return: Standard deviation of inversions across possible permutations 202 @rtype: int 203 """ 204 return self.variance(sequence, isGraded)**0.5
205
206 - def inversions(self, sequence=None, isGraded=False):
207 """ 208 The inversions between the reference sequence and the comparison sequence. 209 If the 'sequence' param is None, the comparison sequence is a fully sorted 210 form of the reference sequence. 211 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence) 212 @type sequence: list of list of object or list of object or None 213 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 214 @type isGraded: bool 215 @return: If 'sequence' given, number of inversions to turn it into reference sequence. 216 Else, the number of inversions to sort the reference sequence. 217 @rtype: int 218 """ 219 if sequence: 220 if isinstance(sequence, GradedPosetSequence): 221 isGraded = sequence.isGraded() 222 sequence = sequence._sequence 223 if self._hashableElements: 224 if self._hashRanks is None: 225 self._populateHashRanks() 226 indexSeq = makeFlatIndexSequence(sequence, self._sequence, isGraded, self._isGraded, 227 indexFunction=self._getElementIndex) 228 else: 229 indexSeq = makeFlatIndexSequence(sequence, self._sequence, isGraded, self._isGraded) 230 return mergeSortInversionCount(indexSeq)[0] 231 else: 232 if self._isGraded: 233 if not self._sortedRanks: 234 for i, grade in enumerate(self._sequence): 235 if isinstance(grade, list): 236 grade.sort() 237 elif isinstance(grade, VALID_GRADE_CONTAINERS): 238 self._sequence[i] = sorted(grade) 239 self._sortedRanks = True 240 return mergeSortInversionCount([element for element in self.iterElements()])[0] 241 else: 242 return mergeSortInversionCount(self._sequence)[0]
243
244 - def similarity(self, sequence=None, isGraded=False):
245 """ 246 The similarity of the reference sequence to a comparison sequence. If the 247 'sequence' param is None, the comparison sequence is a fully sorted form 248 of the reference sequence. 249 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence) 250 @type sequence: list of list of object or list of object or None 251 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 252 @type isGraded: bool 253 @return: The similarity of the reference sequence to the comparison sequence, in [0,1] 254 @rtype: float 255 """ 256 inversions = self.inversions(sequence, isGraded) 257 maxInversions = self.maxInversions(sequence, isGraded) 258 return similarityByInversions(inversions, maxInversions)
259
260 - def correlation(self, sequence=None, isGraded=False):
261 """ 262 The correlation of the reference sequence to a comparison sequence. If the 263 'sequence' param is None, the comparison sequence is a fully sorted form 264 of the reference sequence. This is just a rescaled form of similarity. 265 This should also be equivalent to a Kendall's Tau correlation under 266 standard conditions. 267 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence) 268 @type sequence: list of list of object or list of object or None 269 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 270 @type isGraded: bool 271 @return: The correlation of the reference sequence to the comparison sequence, in [-1,1] 272 @rtype: float 273 """ 274 inversions = self.inversions(sequence, isGraded) 275 maxInversions = self.maxInversions(sequence, isGraded) 276 return correlationByInversions(inversions, maxInversions)
277
278 - def cdf(self, sequence=None, isGraded=False, cdfType=ADAPTIVE_CDF):
279 """ 280 Calculate a CDF value for inversions between the reference sequence 281 abd a comparison sequence. If the 'sequence' param is None, the comparison 282 sequence is a fully sorted form of the reference sequence. This can 283 use either an exact distribution or a normal approximation. 284 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence) 285 @type sequence: list of list of object or list of object or None 286 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 287 @type isGraded: bool 288 @param cdfType: The CDF function to use. If ADAPTIVE_CDF, automatically switches to a normal 289 approximation when the # of elements exceeds a bound. If EXACT_CDF, 290 calculates the PDF exactly and sums over it. This gets slow and is 291 seldom necessary after about 100 elements. If NORMAL_CDF, uses the 292 mean and variance to generate a normal distribution. Except for 293 small numbers of elements or very large grades, this is advised. 294 @type cdfType: str 295 @return: P(x<=X) where x is a random variable for permutation inversions and X is 296 the number of inversions between the reference and comparison sequences. 297 @rtype: float 298 """ 299 if cdfType == NORMAL_CDF: 300 return self.normalCDF(sequence, isGraded) 301 elif cdfType == EXACT_CDF: 302 return self.exactCDF(sequence, isGraded) 303 else: 304 maxInversions = self.maxInversions(sequence, isGraded) 305 if maxInversions >= self.APPROXIMATION_RANKS_BOUND: 306 return self.normalCDF(sequence, isGraded) 307 else: 308 return self.exactCDF(sequence, isGraded)
309
310 - def exactCDF(self, sequence=None, isGraded=False):
311 """ 312 Calculate an exact CDF value for inversions between the reference sequence 313 abd a comparison sequence. If the 'sequence' param is None, the comparison 314 sequence is a fully sorted form of the reference sequence. 315 Note: The exact CDF calculation works very slowly as N gets greater than 100 elements. 316 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence) 317 @type sequence: list of list of object or list of object or None 318 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 319 @type isGraded: bool 320 @return: P(x<=X) where x is a random variable for permutation inversions and X is 321 the number of inversions between the reference and comparison sequences. 322 @rtype: float 323 """ 324 inversions = self.inversions(sequence, isGraded) 325 maxInversions = self.maxInversions(sequence, isGraded) 326 # Special Invariant-Point Cases 327 if maxInversions == 0: 328 return None 329 elif inversions == maxInversions: 330 return 1.0 331 # General Case 332 else: 333 if sequence: 334 if isinstance(sequence, GradedPosetSequence): 335 isGraded = sequence.isGraded() 336 otherNumElements = sequence.getNumElements() 337 otherOverloadedRanks = sequence.getOverloadedRanks() 338 sequence = sequence._sequence 339 else: 340 otherNumElements = self._calcNumElements(sequence, isGraded) 341 otherOverloadedRanks = self._calcOverloadedRanks(sequence, isGraded) 342 # If sets of unequal size, discard extra elements from reference set to fit 343 if otherNumElements != self._numElements: 344 overloadedRanks = self._calcOverloadedRanks(self._sequence, self._isGraded, sequence, isGraded) 345 else: 346 overloadedRanks = self.getOverloadedRanks() 347 intersectingRanks = self._calcRankIntersections(overloadedRanks, otherOverloadedRanks) 348 numElements = otherNumElements 349 else: 350 overloadedRanks = self.getOverloadedRanks() 351 otherOverloadedRanks = [] 352 intersectingRanks = [] 353 numElements = self.getNumElements() 354 inversionDistribution = rawInversionCounts(numElements) 355 for intersections in intersectingRanks: 356 intersectionDistr = rawInversionCounts(len(intersections)) 357 inversionDistribution = polynomialExpansion(inversionDistribution, intersectionDistr) 358 for overloadedRank in overloadedRanks + otherOverloadedRanks: 359 overloadedDistr = rawInversionCounts(len(overloadedRank)) 360 inversionDistribution, remainder = syntheticDivision(inversionDistribution, overloadedDistr) 361 # Sum over the smaller tail 362 if inversions > maxInversions/2.0: 363 return 1.0-float(sum(inversionDistribution[inversions+1:]))/sum(inversionDistribution) 364 else: 365 return float(sum(inversionDistribution[:inversions+1]))/sum(inversionDistribution)
366
367 - def normalCDF(self, sequence=None, isGraded=False):
368 """ 369 Calculate an approximate CDF value for inversions between the reference sequence 370 abd a comparison sequence. If the 'sequence' param is None, the comparison 371 sequence is a fully sorted form of the reference sequence. 372 Note: The normal distribution gets quite accurate by N > 20 and should be used for larger 373 sequences. 374 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence) 375 @type sequence: list of list of object or list of object or None 376 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list. 377 @type isGraded: bool 378 @return: P(x<=X) where x is a random variable for permutation inversions and X is 379 the number of inversions between the reference and comparison sequences. 380 @rtype: float 381 """ 382 mean = self.mean(sequence, isGraded) 383 stdDev = self.stdDev(sequence, isGraded) 384 inversions = self.inversions(sequence, isGraded) 385 if mean == 0: 386 return None 387 else: 388 return normalCDFFunction(inversions, mean, stdDev)
389
390 - def signTest(self, sequences, sequences2=None, isGraded=False, measure=SIMILARITY_METRIC, 391 mu=0.5, alternative=GREATER_THAN_HYPOTHESIS):
392 """ 393 Perform a Sign test on either the similarity or the complements of the CDF values, 394 which are a measure of the match between the each sequence and the reference seq. 395 This has the advantage of calculating based on the similarity, which is quicker than 396 calculating the CDF probability of a particular inversion counts. If two sequences 397 are provided, this performs a two sample paired test. 398 @param sequences: Multiple sequences that are permutations of the elements of this sequence 399 @type sequences: list of list of object or list of list of list of object 400 @param sequences2: Multiple sequences that are permutations (second sample) 401 @type sequences2: list of list of object or list of list of list of object 402 @param isGraded: If True, each sequence is nested (where nested elements share a grade) 403 @type isGraded: bool 404 @param measure: The measure to calculate for each sequence, either SIMILARITY_METRIC or from CDF_TYPES set 405 @type measure: str 406 @param mu: Assumed mean of the sign test, to allow assymetric testing 407 @type mu: float 408 @param alternative: Alternate hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set 409 @type alternative: str 410 @return: Probability of the null hypothesis 411 @rtype: float 412 """ 413 if mu >= 1 or mu <= 0: 414 raise ValueError("Sign test mu should be in [0,1] but got: %s"%mu) 415 return self._comparisonTest(signTest, sequences, sequences2, isGraded, measure, mu=mu, alternative=alternative)
416
417 - def wilcoxonTest(self, sequences, sequences2=None, isGraded=False, cdfType=ADAPTIVE_CDF, 418 alternative=GREATER_THAN_HYPOTHESIS):
419 """ 420 Perform a Wilcoxon Rank Sum test on the complements of the CDF values, 421 which are a measure of the match between the each sequence and the reference seq 422 Mu cannot be set on this test, as it would violate the assumption of symmetry. 423 If two sequences are provided, this performs a two-sample paired test. 424 @param sequences: Multiple sequences that are permutations of the elements of this sequence 425 @type sequences: list of list of object or list of list of list of object 426 @param sequences2: Multiple sequences that are permutations (second sample) 427 @type sequences2: list of list of object or list of list of list of object 428 @param isGraded: If True, each sequence is nested (where nested elements share a grade) 429 @type isGraded: bool 430 @param cdfType: The type of CDF to calculate for the inversion counts, from CDF_TYPES set 431 @type cdfType: str 432 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set 433 @type alternative: str 434 @return: Probability of the null hypothesis 435 @rtype: float 436 """ 437 if cdfType not in CDF_TYPES: 438 raise TypeError("Wilcoxon Test received invalid CDF Type. Must be one of: %s"%(CDF_TYPES,)) 439 if sequences2 is None: 440 return self._comparisonTest(wilcoxonSignedRankTest, sequences, sequences2, isGraded, 441 cdfType, mu=0.5, alternative=alternative) 442 else: 443 return self._comparisonTest(wilcoxonSignedRankTest, sequences, sequences2, isGraded, 444 cdfType, alternative=alternative)
445
446 - def permutationMeanTest(self, sequences, sequences2, isGraded=False, measure=ADAPTIVE_CDF, 447 alternative=GREATER_THAN_HYPOTHESIS, pValue=0.95, iterations=100000, 448 useStoppingRule=True, maxExactN=7):
449 """ 450 Perform a Permutation or Monte Carlo Mean Difference test on some measure 451 (similarity or 1-CDF) for the match between the each sequence and the reference seq. 452 This requires two samples, as these are shuffled for form the permutations. 453 @param sequences: Multiple sequences that are permutations of the elements of this sequence 454 @type sequences: list of list of object or list of list of list of object 455 @param sequences2: Multiple sequences that are permutations (second sample) 456 @type sequences2: list of list of object or list of list of list of object 457 @param isGraded: If True, each sequence is nested (where nested elements share a grade) 458 @type isGraded: bool 459 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set 460 @type measure: str 461 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set 462 @type alternative: str 463 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination 464 @type pValue: float 465 @param iterations: The max number of iterations to run for Monte Carlo 466 @type iterations: int 467 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule 468 @type useStoppingRule: bool 469 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value. 470 For values higher than this, use a Monte Carlo approximation. 471 @type maxExactN: int 472 @return: Probability of the null hypothesis 473 @rtype: float 474 """ 475 return self._comparisonTest(permutationMeanTest, sequences, sequences2, isGraded, measure, 476 alternative=alternative, pValue=pValue, iterations=iterations, 477 maxExactN=maxExactN)
478
479 - def permutationRankTest(self, sequences, sequences2, isGraded=False, measure=ADAPTIVE_CDF, 480 alternative=GREATER_THAN_HYPOTHESIS, pValue=0.95, iterations=100000, 481 useStoppingRule=True, maxExactN=7):
482 """ 483 Perform a Permutation or Monte Carlo rank test on a measure (similarity or 484 1-CDF) for the match between the each sequence and the reference seq. 485 This requires two samples, as these are shuffled for form the permutations. 486 @param sequences: Multiple sequences that are permutations of the elements of this sequence 487 @type sequences: list of list of object or list of list of list of object 488 @param sequences2: Multiple sequences that are permutations (second sample) 489 @type sequences2: list of list of object or list of list of list of object 490 @param isGraded: If True, each sequence is nested (where nested elements share a grade) 491 @type isGraded: bool 492 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set 493 @type measure: str 494 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set 495 @type alternative: str 496 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination 497 @type pValue: float 498 @param iterations: The max number of iterations to run for Monte Carlo 499 @type iterations: int 500 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule 501 @type useStoppingRule: bool 502 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value. 503 For values higher than this, use a Monte Carlo approximation. 504 @type maxExactN: int 505 @return: Probability of the null hypothesis 506 @rtype: float 507 """ 508 return self._comparisonTest(permutationRankTest, sequences, sequences2, isGraded, measure, 509 alternative=alternative, pValue=pValue, iterations=iterations, 510 maxExactN=maxExactN)
511
512 - def _comparisonTest(self, test, sequences, sequences2=None, isGraded=False, measure=SIMILARITY_METRIC, **kwds):
513 """ 514 Perform a hypothesis test on either the similarity or the complements of the CDF values, 515 which are a measure of the match between the each sequence and the reference seq. 516 If two sequences are provided, this performs a two sample test of an appropriate type. 517 @param test: The hypothesis test function, in the form: f(sample1, sample1=None, **kwds) 518 @type test: callable 519 @param sequences: Multiple sequences that are permutations of the elements of this sequence 520 @type sequences: list of list of object or list of list of list of object 521 @param sequences2: Multiple sequences that are permutations (second sample) 522 @type sequences2: list of list of object or list of list of list of object 523 @param isGraded: If True, each sequence is nested (where nested elements share a grade) 524 @type isGraded: bool 525 @param measure: The measure to calculate for each sequence, either SIMILARITY_METRIC or from CDF_TYPES set 526 @type measure: str 527 @param kwds: Additional test-specific parameters passed to the test 528 @type kwds: dict 529 @return: Probability of the null hypothesis 530 @rtype: float 531 """ 532 if measure == SIMILARITY_METRIC: 533 values = [self.similarity(seq, isGraded) for seq in sequences] 534 else: 535 values = [1.0-self.cdf(seq, isGraded, measure) for seq in sequences] 536 if sequences2 is None: 537 return test(values, **kwds) 538 else: 539 if measure == SIMILARITY_METRIC: 540 values2 = [self.similarity(seq, isGraded) for seq in sequences2] 541 else: 542 values2 = [1.0-self.cdf(seq, isGraded, measure) for seq in sequences2] 543 return test(values, values2, **kwds)
544 545 @classmethod
546 - def _calcNumElements(cls, sequence, isGraded):
547 """ 548 Calculate the number of elements in the sequence. 549 This is trivial for a flat sequence and quick for a graded one. 550 @param sequence: The sequence to count elements in 551 @type sequence: list of list of object or list of object 552 @param isGraded: If True, sequence is nested (where nested elements share a grade) 553 @type isGraded: bool 554 @return: The number of elements in the sequence (N) 555 @rtype: bool 556 """ 557 if isGraded: 558 aSum = 0 559 for elements in sequence: 560 if isinstance(elements, VALID_GRADE_CONTAINERS): 561 aSum += len(elements) 562 else: 563 aSum += 1 564 return aSum 565 else: 566 return len(sequence)
567
568 - def _calcGradedStatistic(self, operation, sequence=None, isGraded=False):
569 """ 570 Calculate a series length statistic, adjusting for incomparable elements. 571 This assumes that the operation can be evaluated on the total number of elements, 572 then the operation is evaluated on the subsets of incomparable elements, which 573 are subtracted off from the full-set operation value. If the 'sequence' param is 574 provided, the grades for this sequence is subtracted off also (without double-counting). 575 The max, mean, and variance can all be calculated using this approach. 576 @param operation: The operation to evaluate, in the form 'numeric = f(int)' 577 @type operation: callable 578 @param sequence: A comparison sequence to evaluate grades for also. If None, ignored. 579 @type sequence: list of list of object or list of object 580 @param isGraded: If True, sequence is nested (where nested elements share a grade) 581 @type isGraded: bool 582 @return: Value of the graded statistic 583 @rtype: int or float 584 """ 585 if sequence: 586 if isinstance(sequence, GradedPosetSequence): 587 isGraded = sequence.isGraded() 588 otherNumElements = sequence.getNumElements() 589 otherOverloadedRanks = sequence.getOverloadedRanks() 590 sequence = sequence._sequence 591 else: 592 otherNumElements = self._calcNumElements(sequence, isGraded) 593 otherOverloadedRanks = self._calcOverloadedRanks(sequence, isGraded) 594 # If sets of unequal size, discard extra elements from reference set to fit 595 if otherNumElements != self._numElements: 596 overloadedRanks = self._calcOverloadedRanks(self._sequence, self._isGraded, sequence, isGraded) 597 else: 598 overloadedRanks = self.getOverloadedRanks() 599 intersectingRanks = self._calcRankIntersections(overloadedRanks, otherOverloadedRanks) 600 value = operation(min(otherNumElements, self._numElements)) 601 value -= sum([operation(len(elements)) for elements in overloadedRanks]) 602 value -= sum([operation(len(elements)) for elements in otherOverloadedRanks]) 603 value += sum([operation(len(elements)) for elements in intersectingRanks]) 604 return value 605 else: 606 value = operation(self._numElements) 607 value -= sum([operation(len(elements)) for elements in self.getOverloadedRanks()]) 608 return value
609
610 - def _calcOverloadedRanks(self, sequence, isGraded, otherSequence=None, isOtherGraded=False):
611 """ 612 Calculate the grades with more than one element (overloaded grades). 613 If two sequences are provided, this only calculates overloaded ranks for elements 614 that are contained in both sequences (in case one is missing elements) 615 Note: If a sequence is not nested, it has no overloaded ranks by definition 616 Note 2: The 'otherSequence' is only needed to handle missing elements 617 @param sequence: Sequence to calculate the overloaded grades 618 @type sequence: list of list of object or list of object 619 @param isGraded: If True, sequence is nested (where nested elements share a grade) 620 @type isGraded: bool 621 @param otherSequence: Optionally, a reference sequence to only count elements in both sequences 622 @type otherSequence: list of list of object or list of object 623 @param isOtherGraded: If True, sequence is nested (where nested elements share a grade) 624 @type isOtherGraded: bool 625 @return: List of subsets of elements that cannot be compared against each other 626 @rtype: List of list of object 627 """ 628 if isinstance(sequence, GradedPosetSequence): 629 isGraded = sequence.isGraded() 630 sequence = sequence._sequence 631 if otherSequence and isinstance(otherSequence, GradedPosetSequence): 632 isOtherGraded = otherSequence.isGraded() 633 otherSequence = otherSequence._sequence 634 if isGraded: 635 if otherSequence is None: 636 overloadedRanks = [elements for elements in sequence 637 if isinstance(elements, VALID_GRADE_CONTAINERS) 638 and len(elements) > 1] 639 else: 640 # Flatten a nested other sequence 641 if isOtherGraded: 642 if self._hashableElements: 643 flatOther = set() 644 for elements in otherSequence: 645 if isinstance(elements, VALID_GRADE_CONTAINERS): 646 flatOther |= set(elements) 647 else: 648 flatOther.add(elements) 649 else: 650 flatOther = [] 651 for elements in otherSequence: 652 if isinstance(elements, VALID_GRADE_CONTAINERS): 653 flatOther.extend(elements) 654 else: 655 flatOther.append(elements) 656 else: 657 if self._hashableElements: 658 flatOther = set(otherSequence) 659 else: 660 flatOther = otherSequence 661 # Find overloaded ranks that contain only elements from the other sequence 662 overloadedRanks = [] 663 for elements in sequence: 664 if isinstance(elements, VALID_GRADE_CONTAINERS) and len(elements) > 1: 665 filteredElements = [element for element in elements if element in flatOther] 666 if len(filteredElements) > 1: 667 overloadedRanks.append(filteredElements) 668 else: 669 overloadedRanks = [] 670 return overloadedRanks
671
672 - def _calcRankIntersections(self, overloadedRanks, overloadedRanks2):
673 """ 674 Calculate the intersection of overloaded ranks between sequences. 675 This basically calculates "overloaded overloads" of ranks. 676 @param overloadedRanks: First set of overloaded (len > 1) ranks 677 @type overloadedRanks: list of list of object 678 param overloadedRanks2: Second set of overloaded (len > 1) ranks 679 @type overloadedRanks2: list of list of object 680 @return: List of intersections between overloaded grades 681 @rtype: list of list of object 682 """ 683 intersections = [] 684 if self._hashableElements: 685 intersections= [list(set.intersection(set(elements1), set(elements2))) 686 for elements1 in overloadedRanks 687 for elements2 in overloadedRanks2] 688 else: 689 intersections= [filter(lambda x: x in elements1, elements2) 690 for elements1 in overloadedRanks 691 for elements2 in overloadedRanks2] 692 intersections = [x for x in intersections if len(x) > 1] 693 return intersections
694
695 - def _populateHashRanks(self):
696 """ 697 Populate hash table that reports the grade of any element. 698 Slight hit to memory, but significant boost in speed for 699 comparisons. 700 """ 701 sequence = self._sequence 702 if self._isGraded: 703 self._hashRanks = {} 704 for i, elements in enumerate(sequence): 705 if isinstance(elements, VALID_GRADE_CONTAINERS): 706 for e in elements: 707 self._hashRanks[e] = i 708 else: 709 self._hashRanks[elements] = i 710 else: 711 self._hashRanks = dict(((x,i) for i, x in enumerate(sequence)))
712
713 - def _getElementIndex(self, element, sequence, isGraded=False):
714 """ 715 Get the index of the given value in the sequence 716 Note: This hashes element indices, if possible 717 @param element: Value to get the position for 718 @type element: object 719 @param sequence: Nested sequence which is used to determine sorting value of elements 720 @type sequence: list of list of object 721 @param isGraded: If True, sequence is graded (nested). Else, flat sequence. 722 @type isGraded: bool 723 @return: Sorting key for the value 724 @rtype: int 725 """ 726 if self._hashableElements: 727 value = self._hashRanks.get(element, None) 728 if value is not None: 729 return value 730 else: 731 return getDefaultIndex(element, len(self._sequence), self._defaultGrade) 732 else: 733 return getValueIndex(element, sequence, isGraded)
734 735
736 - def validate(self):
737 """ 738 Check that this sequence is valid, so that it has 739 no repeated elements, that elements are hashable 740 if sequence is hashable, and that a graded sequence 741 contains iterated grades. 742 """ 743 self.validateSequence(self._sequence, self._isGraded, self._hashableElements)
744 745 @classmethod
746 - def validateSequence(cls, sequence, isGraded=False, hashableElements=True):
747 """ 748 Check that a sequence is valid, so that it has 749 no repeated elements, and that elements are hashable 750 if sequence is hashable. 751 """ 752 if hasattr(sequence, '__iter__'): 753 if isGraded: 754 elements = [] 755 for grade in sequence: 756 if isinstance(grade, VALID_GRADE_CONTAINERS): 757 elements.extend(grade) 758 else: 759 elements.append(grade) 760 else: 761 elements = list(sequence) 762 # Collect the unhashables 763 if hashableElements: 764 unhashables = [] 765 for element in elements: 766 if not hasattr(element, '__hash__') or element.__hash__ is None: 767 unhashables.append(element) 768 else: 769 hash(element) 770 # Collect repeated elements 771 observedElements = [] 772 repeatedElements = [] 773 for element in elements: 774 if element in observedElements: 775 repeatedElements.append(element) 776 else: 777 observedElements.append(element) 778 errStr = '' 779 if len(unhashables) > 0: 780 unhashableStr = ', '.join([str(unhashable) for unhashable in unhashables]) 781 errStr +="The following elements were unhashable in a hashable poset:\n%s\n"%(unhashableStr,) 782 if len(repeatedElements) > 0: 783 errStr += "The following elements were repeated in the poset:\n%s"%(repeatedElements,) 784 else: 785 errStr = 'Sequence was not iterable.' 786 if len(errStr) > 0: 787 raise InvalidGradedPosetError(errStr)
788
789 ########################### 790 # Descriptive Measures # 791 ########################### 792 793 -def inversionCountMax(seq, seq2=None, isGraded=False, hashableElements=True):
794 """ 795 Calculate the maximum possible number of inversions, based 796 on the sequence (# of elements and grade structure). If 797 the second optional sequence is provided, its grade structure 798 is also considered. 799 Note: This only depends on the structure of the sequence(s), not their order 800 @param seq: Sequence to get the maximum possible inversions for any permutation 801 @type seq: list of object or list of list of object 802 @param seq2: A comparison sequence for the sequence 'seq' 803 @type seq2: list of object or list of list of object 804 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade) 805 @type isGraded: bool 806 @param hashableElements: If True, elements can be hashed (improves speed) 807 @type hashableElements: bool 808 @return: Maximum inversions 809 @rtype: int 810 """ 811 if isinstance(seq, GradedPosetSequence): 812 refSeq = seq 813 else: 814 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements) 815 return refSeq.maxInversions(seq2, isGraded)
816
817 -def inversionCountMean(seq, seq2=None, isGraded=False, hashableElements=True):
818 """ 819 Calculate the mean inversions across permutations, based 820 on the sequence (# of elements and grade structure). If 821 the second optional sequence is provided, its grade structure 822 is also considered. 823 Note: This only depends on the structure of the sequence(s), not their order 824 @param seq: Sequence to get the mean inversions for all possible permutations 825 @type seq: list of object or list of list of object 826 @param seq2: A comparison sequence for the sequence 'seq' 827 @type seq2: list of object or list of list of object 828 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade) 829 @type isGraded: bool 830 @param hashableElements: If True, elements can be hashed (improves speed) 831 @type hashableElements: bool 832 @return: Mean inversions 833 @rtype: float 834 """ 835 if isinstance(seq, GradedPosetSequence): 836 refSeq = seq 837 else: 838 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements) 839 return refSeq.mean(seq2, isGraded)
840
841 -def inversionCountVariance(seq, seq2=None, isGraded=False, hashableElements=True):
842 """ 843 Calculate the variance of inversions across permutations, based 844 on the sequence (# of elements and grade structure). If 845 the second optional sequence is provided, its grade structure 846 is also considered. 847 Note: This only depends on the structure of the sequence(s), not their order 848 @param seq: Sequence to get the variance of inversions for all possible permutations 849 @type seq: list of object or list of list of object 850 @param seq2: A comparison sequence for the sequence 'seq' 851 @type seq2: list of object or list of list of object 852 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade) 853 @type isGraded: bool 854 @param hashableElements: If True, elements can be hashed (improves speed) 855 @type hashableElements: bool 856 @return: Variance of inversions 857 @rtype: float 858 """ 859 if isinstance(seq, GradedPosetSequence): 860 refSeq = seq 861 else: 862 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements) 863 return refSeq.variance(seq2, isGraded)
864
865 -def medianSequence(permutations, isGraded=False, hashableElements=True):
866 """ 867 From the given permutations, make a sequence ordered by the median 868 rank of each element. Note: This requires that all permutations 869 contain the same set of elements. If this condition does not hold, 870 then permutations with less elements would by definition have lower 871 median ranks. As such, this raises a KeyError if this case occurs. 872 @param permutations: Permutations to gather the ranks for 873 @type permutations: list of list of object or list of list of list of object 874 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade) 875 @type isGraded: bool 876 @param hashableElements: If True, elements can be hashed (improves speed) 877 @type hashableElements: bool 878 @return: A graded sequence based on the median ranks of elements. 879 Grades occur if elements share the same median rank. 880 @rtype: list of list of object 881 """ 882 if len(permutations) == 0: 883 return [] 884 # If Hashable, Collect Indices in a Dictionary and Get Median Indices 885 if hashableElements: 886 indices = {} 887 for n, permutation in enumerate(permutations): 888 for i, x in enumerate(permutation): 889 if isGraded and isinstance(x, VALID_GRADE_CONTAINERS): 890 elements = x 891 else: 892 elements = [x] 893 for element in elements: 894 if n == 0: 895 indices[element] = [] 896 indices[element].append(i) 897 indices = [(getMedianValue(values), e) for e, values in indices.iteritems()] 898 # Otherwise, Collect Indices in a Flat List, Sort, Collect, and Get Median Indices 899 else: 900 indices = [] 901 for permutation in permutations: 902 for i, x in enumerate(permutation): 903 if isGraded and isinstance(x, VALID_GRADE_CONTAINERS): 904 elements = x 905 else: 906 elements = [x] 907 for element in elements: 908 indices.append((element,i)) 909 indices.sort() 910 newIndices = [] 911 for element, i in indices: 912 if len(newIndices) == 0 or newIndices[-1][0] != element: 913 newIndices.append((element, [i])) 914 else: 915 newIndices[-1][1].append(i) 916 indices = [(getMedianValue(values), e) for e, values in newIndices] 917 indices.sort() 918 # Turn the list of indices into a graded sequence 919 medianSeq = [] 920 lastIndex = None 921 for indexVal, e in indices: 922 if lastIndex is None or lastIndex != indexVal: 923 medianSeq.append([e]) 924 lastIndex = indexVal 925 else: 926 medianSeq[-1].append(e) 927 return medianSeq
928
929 930 #################### 931 # Inversion Counts # 932 #################### 933 934 -def mergeSortInversionCount(sequence):
935 """ 936 Calculate the number of inversions requried to sort sequence via mergesort. 937 This function only works on a flat list. 938 Note: Relies on python sort to do actual merging (but not the counting). 939 More efficient this way, as the Python version is in C. 940 @param sequence: Sequence to sort and count inversions 941 @type sequence: list 942 @return: Number of inversions, sorted list 943 @rtype: int, list 944 """ 945 if len(sequence) <= 1: 946 return 0, sequence 947 else: 948 firstHalf = sequence[:int(len(sequence)/2)] 949 secondHalf = sequence[int(len(sequence)/2):] 950 count1, firstHalf = mergeSortInversionCount(firstHalf) 951 count2, secondHalf = mergeSortInversionCount(secondHalf) 952 firstN = len(firstHalf) 953 secondN = len(secondHalf) 954 secondHalfEnd = secondN 955 count3 = count1 956 count3+= count2 957 # Count the inversions in the merge 958 # Uses a countdown through each sublist 959 for i in xrange(firstN-1, -1, -1): 960 x = firstHalf[i] 961 inversionFound = False 962 for j in xrange(secondHalfEnd-1,-1,-1): 963 if x > secondHalf[j]: 964 inversionFound = True 965 break 966 if inversionFound: 967 secondHalfEnd = j+1 968 count3 += j+1 969 mergeList = firstHalf + secondHalf 970 mergeList.sort() 971 return count3, mergeList
972
973 -def inversionCount(seq, seq2=None, isGraded=False, hashableElements=True):
974 """ 975 Calculate the inversions between seq and seq2. If seq2 is None, this works 976 equivalently to seq2 being a sorted version of seq 977 @param seq: Sequence to calculate the inversions for 978 @type seq: list of object or list of list of object 979 @param seq2: A comparison sequence for the sequence 'seq' 980 @type seq2: list of object or list of list of object 981 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade) 982 @type isGraded: bool 983 @param hashableElements: If True, elements can be hashed (improves speed) 984 @type hashableElements: bool 985 @return: Number of inversions 986 @rtype: int 987 """ 988 if isinstance(seq, GradedPosetSequence): 989 refSeq = seq 990 else: 991 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements) 992 return refSeq.inversions(seq2, isGraded)
993
994 -def inversionSimilarity(seq, seq2=None, isGraded=False, hashableElements=True):
995 """ 996 Calculate the similarity between sequences: 1-inversions/maxInversions 997 If seq2 is None, this works equivalently to seq2 being a sorted version of seq 998 @param seq: Sequence to get the similarity for 999 @type seq: list of object or list of list of object 1000 @param seq2: A comparison sequence for the sequence 'seq' 1001 @type seq2: list of object or list of list of object 1002 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade) 1003 @type isGraded: bool 1004 @param hashableElements: If True, elements can be hashed (improves speed) 1005 @type hashableElements: bool 1006 @return: Similarity of sequences by inversion counts, in [0,1], where 1 is a perfect match 1007 @rtype: float 1008 """ 1009 if isinstance(seq, GradedPosetSequence): 1010 refSeq = seq 1011 else: 1012 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements) 1013 return refSeq.similarity(seq2, isGraded)
1014
1015 1016 ################################## 1017 # Probability and Statistics # 1018 ################################## 1019 1020 -def inversionCountCDF(seq, seq2=None, isGraded=False, hashableElements=True, cdfType=ADAPTIVE_CDF):
1021 """ 1022 Get the cummulative distribution function probability for the number 1023 of inversions between seq and seq2. If seq2 is None, this works 1024 equivalently to seq2 being a sorted version of seq 1025 @param seq: Sequence to calculate the number of inversions 1026 @type seq: list of object or list of list of object 1027 @param seq2: A comparison sequence for the sequence 'seq' 1028 @type seq2: list of object or list of list of object 1029 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade) 1030 @type isGraded: bool 1031 @param hashableElements: If True, elements can be hashed (improves speed) 1032 @type hashableElements: bool 1033 @return: P(X<=x) where X is a R.V. based on all permutations and 1034 x is the actual number of inversions between seq and seq2 1035 @rtype: float 1036 """ 1037 if isinstance(seq, GradedPosetSequence): 1038 refSeq = seq 1039 else: 1040 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements) 1041 return refSeq.cdf(seq2, isGraded, cdfType)
1042
1043 1044 -def inversionSignTest(orderings, orderings2=None, seq=None, isGraded=False, hashableElements=True, 1045 measure=SIMILARITY_METRIC, alternative=GREATER_THAN_HYPOTHESIS, mu=0.5):
1046 """ 1047 A sign test based on the values of some measure. This measure can be either similarity 1048 or the complement of the cdf (1-cdf), both of which are in the range [0,1] where 1049 0 is the worst match and 1 is the best match. For each sequence in 'orderings', 1050 the measure is calculated comparing seq against the permutation. This creates a list 1051 of measure values that the sign test can be applied to. If two sets of orderings 1052 are provided, this runs a two-sample test and ignores mu. 1053 Note: A calculation based on similarity will provide different results than one based 1054 on CDF if mu!=0.5. When mu=0.5, both are identical. 1055 @param orderings: One or more comparison sequences (first sample) 1056 @type orderings: list of list of object or list of list of list of object 1057 @param orderings2: One or more comparison sequences (second sample) 1058 @type orderings2: list of list of object or list of list of list of object 1059 @param seq: Reference sequence to calculate the number of inversions against 1060 @type seq: list of object or list of list of object 1061 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade) 1062 @type isGraded: bool 1063 @param hashableElements: If True, elements can be hashed (improves speed) 1064 @type hashableElements: bool 1065 @param measure: The measure to calculate for each sequence, either SIMILARITY_METRIC or from CDF_TYPES set 1066 @type measure: str 1067 @param mu: Assumed mean of the sign test, to allow assymetric testing 1068 @type mu: float 1069 @param alternative: Alternate hypothesis for the test, from the TEST_HYPOTHESES set ('greater', 'less', 'two.sided') 1070 @type alternative: str 1071 @return: Probability of the null hypothesis 1072 @rtype: float 1073 """ 1074 if mu >= 1 or mu <= 0: 1075 raise ValueError("Sign test mu should be in [0,1] but got: %s"%mu) 1076 return _inversionComparisonTest(signTest, GradedPosetSequence.signTest, orderings, orderings2, seq, 1077 isGraded, hashableElements, measure, mu=mu, alternative=alternative)
1078
1079 1080 -def inversionWilcoxonTest(orderings, orderings2=None, seq=None, isGraded=False, hashableElements=True, 1081 cdfType=ADAPTIVE_CDF, alternative=GREATER_THAN_HYPOTHESIS):
1082 """ 1083 A Wilcoxon rank sum test based on the complement of the CDF (1-cdf). 1084 For each ordering in 'orderings', 1-cdf is calculated comparing seq 1085 against the permutation. This creates a list of probabilities of getting 1086 a worse match by chance (one probability for each comparison sequence). 1087 @param orderings: One or more comparison sequences (first sample) 1088 @type orderings: list of list of object or list of list of list of object 1089 @param orderings2: One or more comparison sequences (second sample) 1090 @type orderings2: list of list of object or list of list of list of object 1091 @param seq: Sequence to calculate the number of inversions 1092 @type seq: list of object or list of list of object 1093 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade) 1094 @type isGraded: bool 1095 @param hashableElements: If True, elements can be hashed (improves speed) 1096 @type hashableElements: bool 1097 @param cdfType: The type of CDF to calculate for the inversion counts, from CDF_TYPES set 1098 @type cdfType: str 1099 @param alternative: Alternate hypothesis for the test, from the TEST_HYPOTHESES set ('greater', 'less', 'two.sided') 1100 @type alternative: str 1101 @return: Probability of the null hypothesis 1102 @rtype: float 1103 """ 1104 if cdfType not in CDF_TYPES: 1105 raise TypeError("Wilcoxon Test received invalid CDF Type. Must be one of: %s"%(CDF_TYPES,)) 1106 if orderings2 is None and seq is None: 1107 return _inversionComparisonTest(wilcoxonSignedRankTest, GradedPosetSequence.wilcoxonTest, 1108 orderings, orderings2, seq, isGraded, hashableElements, cdfType, 1109 mu=0.5, alternative=alternative) 1110 else: 1111 return _inversionComparisonTest(wilcoxonSignedRankTest, GradedPosetSequence.wilcoxonTest, 1112 orderings, orderings2, seq, isGraded, hashableElements, cdfType, 1113 alternative=alternative)
1114
1115 1116 -def inversionPermutationMeanTest(orderings, orderings2, seq=None, isGraded=False, hashableElements=True, 1117 measure=ADAPTIVE_CDF, alternative=GREATER_THAN_HYPOTHESIS, 1118 pValue=0.95, iterations=100000, useStoppingRule=True, maxExactN=7):
1119 """ 1120 Perform a Permutation or Monte Carlo Mean Difference Test on some measure 1121 (similarity or 1-CDF) for the match between the each sequence and the reference seq. 1122 This requires two samples, as these are shuffled for form the permutations. 1123 @param orderings: One or more comparison sequences (first sample) 1124 @type orderings: list of list of object or list of list of list of object 1125 @param orderings2: One or more comparison sequences (second sample) 1126 @type orderings2: list of list of object or list of list of list of object 1127 @param seq: Sequence to calculate the number of inversions 1128 @type seq: list of object or list of list of object 1129 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade) 1130 @type isGraded: bool 1131 @param hashableElements: If True, elements can be hashed (improves speed) 1132 @type hashableElements: bool 1133 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set 1134 @type measure: str 1135 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set 1136 @type alternative: str 1137 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination 1138 @type pValue: float 1139 @param iterations: The max number of iterations to run for Monte Carlo 1140 @type iterations: int 1141 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule 1142 @type useStoppingRule: bool 1143 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value. 1144 For values higher than this, use a Monte Carlo approximation. 1145 @type maxExactN: int 1146 @return: Probability of the null hypothesis 1147 @rtype: float 1148 """ 1149 return _inversionComparisonTest(permutationMeanTest, GradedPosetSequence.permutationMeanTest, 1150 orderings, orderings2, seq, isGraded, hashableElements, measure, 1151 alternative=alternative, pValue=pValue, iterations=iterations, 1152 maxExactN=maxExactN)
1153
1154 -def inversionPermutationRankTest(orderings, orderings2, seq=None, isGraded=False, hashableElements=True, 1155 measure=ADAPTIVE_CDF, alternative=GREATER_THAN_HYPOTHESIS, 1156 pValue=0.95, iterations=100000, useStoppingRule=True, maxExactN=7):
1157 """ 1158 Perform a Permutation or Monte Carlo Rank on some measure (similarity or 1-CDF) 1159 for the match between the each sequence and the reference seq. 1160 This requires two samples, as these are shuffled for form the permutations. 1161 @param orderings: One or more comparison sequences (first sample) 1162 @type orderings: list of list of object or list of list of list of object 1163 @param orderings2: One or more comparison sequences (second sample) 1164 @type orderings2: list of list of object or list of list of list of object 1165 @param seq: Sequence to calculate the number of inversions 1166 @type seq: list of object or list of list of object 1167 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade) 1168 @type isGraded: bool 1169 @param hashableElements: If True, elements can be hashed (improves speed) 1170 @type hashableElements: bool 1171 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set 1172 @type measure: str 1173 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set 1174 @type alternative: str 1175 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination 1176 @type pValue: float 1177 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule 1178 @type useStoppingRule: bool 1179 @param iterations: The max number of iterations to run for Monte Carlo 1180 @type iterations: int 1181 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value. 1182 For values higher than this, use a Monte Carlo approximation. 1183 @type maxExactN: int 1184 @return: Probability of the null hypothesis 1185 @rtype: float 1186 """ 1187 return _inversionComparisonTest(permutationRankTest, GradedPosetSequence.permutationRankTest, 1188 orderings, orderings2, seq, isGraded, hashableElements, measure, 1189 alternative=alternative, pValue=pValue, iterations=iterations, 1190 maxExactN=maxExactN)
1191
1192 1193 -def _inversionComparisonTest(test, classTest, orderings, orderings2=None, seq=None, isGraded=False, 1194 hashableElements=True, measure=ADAPTIVE_CDF, **kwds):
1195 """ 1196 A hypothesis test for comparing the similarity of some orderings based on some 1197 test function (test) and its equivalent in the GradedPosetSequence class (classTest). 1198 @param test: A test function, in the form, f(orderings, orderings2, **kwds) 1199 @type test: callable 1200 @param classTest: An unbound method for the GradedPosetSequence class, in the form 1201 f(self, orderings, orderings2, isGraded, measure, **kwds) 1202 @type classTest: callable 1203 @param orderings: One or more comparison sequences (first sample) 1204 @type orderings: list of list of object or list of list of list of object 1205 @param orderings2: One or more comparison sequences (second sample) 1206 @type orderings2: list of list of object or list of list of list of object 1207 @param seq: Sequence to calculate the number of inversions 1208 @type seq: list of object or list of list of object 1209 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade) 1210 @type isGraded: bool 1211 @param hashableElements: If True, elements can be hashed (improves speed) 1212 @type hashableElements: bool 1213 @param measure: The type of CDF or similarity to calculate for the inversion counts 1214 @type measure: str 1215 @return: Probability of the null hypothesis 1216 @rtype: float 1217 """ 1218 if seq is None: 1219 f = lambda x: GradedPosetSequence(x, isGraded=isGraded, hashableElements=hashableElements) 1220 if measure == SIMILARITY_METRIC: 1221 mFunct = lambda x: x.similarity(isGraded=isGraded) 1222 else: 1223 mFunct = lambda x: 1.0-x.cdf(isGraded=isGraded, cdfType=measure) 1224 vals = [mFunct(f(x)) for x in orderings] 1225 if orderings2 is None: 1226 return test(vals, **kwds) 1227 else: 1228 vals2 = [mFunct(f(x)) for x in orderings2] 1229 return test(vals, vals2, **kwds) 1230 else: 1231 if isinstance(seq, GradedPosetSequence): 1232 refSeq = seq 1233 else: 1234 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements) 1235 if classTest is None: 1236 return refSeq._comparisonTest(test, orderings, orderings2, isGraded, measure, **kwds) 1237 else: 1238 return classTest(refSeq, orderings, orderings2, isGraded, measure, **kwds)
1239
1240 1241 -def pairwiseGroupComparison(orderings, orderings2, isGraded=False, hashableElements=True, 1242 cdfType=ADAPTIVE_CDF, alternative=GREATER_THAN_HYPOTHESIS):
1243 if measure == SIMILARITY_METRIC: 1244 mFunct = lambda x,y: x.similarity(y, isGraded=isGraded) 1245 else: 1246 mFunct = lambda x,y: 1.0-x.cdf(y, isGraded=isGraded, cdfType=measure) 1247 group1 = [] 1248 crossGroup = [] 1249 for i, x in enumerate(orderings): 1250 x = GradedPosetSequence(x, isGraded=isGraded, hashableElements=hashableElements) 1251 group1.extend([mFunct(x, orderings[j]) for j in xrange(i+1,len(orderings))]) 1252 crossGroup.extend([mFunct(x, y) for y in orderings2]) 1253 group2 = [] 1254 for i, x in enumerate(orderings2): 1255 x = GradedPosetSequence(x, isGraded=isGraded, hashableElements=hashableElements) 1256 group2.extend([mFunct(x, orderings2[j]) for j in xrange(i+1,len(orderings2))])
1257
1258 1259 # Kruskall Wallis or Friedman might be used to compare groups, but 1260 # interpretation is a bit iffy. 1261 1262 ######################## 1263 # Utility Functions # 1264 ######################## 1265 -def getMedianValue(values):
1266 """ 1267 Get the median of a sequence 1268 Each element of the sequence must be able to be averaged (added and divided by a number) 1269 @param values: A list of values 1270 @type values: list of object 1271 @return: Median value of the list 1272 @rtype: object 1273 """ 1274 n = len(values) 1275 values = sorted(values) 1276 if n == 0: 1277 raise IndexError("No median value for empty sequence") 1278 elif n%2 == 0: 1279 return (values[int(n/2)] + values[int(n/2-1)])/2.0 1280 else: 1281 return values[int((n-1)/2)]
1282
1283 -def getValueIndex(value, sequence, isGraded=False, defaultGrade=None):
1284 """ 1285 Get the index of the given value in the sequence 1286 @param value: Value to get the position for 1287 @type value: object 1288 @param sequence: Nested sequence which is used to determine sorting value of elements 1289 @type sequence: list of list of object 1290 @return: Sorting key for the value 1291 @rtype: int 1292 """ 1293 if isGraded: 1294 for i, values in enumerate(sequence): 1295 isIterable = isinstance(values, VALID_GRADE_CONTAINERS) 1296 if (isIterable and value in values) or (not isIterable and value == values): 1297 return i 1298 return getDefaultIndex(value, i, defaultGrade) 1299 elif value in sequence: 1300 return sequence.index(value) 1301 else: 1302 return getDefaultIndex(value, len(sequence), defaultGrade)
1303
1304 -def getDefaultIndex(element, maxIndex, defaultGrade=None):
1305 """ 1306 Get a default index for a missing element. 1307 Default grades are: 1308 LEFT_GRADE - Always -1, ranked before the lowest existing element(s) 1309 ZERO_GRADE - Always 0, ranked equal to the lowest element(s) 1310 MEAN_GRADE - Always half of the maximum grade 1311 MAX_GRADE - Always equal to the maximum grade, ranked equal to highest element(s) 1312 RIGHT_GRADE - Always maximum grade +1 than the maximum grade, after the highest element(s) 1313 @param maxIndex: Maximum possible grade for sequence 1314 @type maxIndex: int 1315 @param defaultGrade: The type of default grade to use, from DEFAULT_GRADES set 1316 @type defaultGrade: str 1317 @return: Ranking for a default element 1318 @rtype: int 1319 """ 1320 if defaultGrade is None: 1321 raise MissingElementError("%s not a subset element in sequence"%(element,)) 1322 elif defaultGrade == LEFT_GRADE: 1323 return -1 1324 elif defaultGrade == ZERO_GRADE: 1325 return 0 1326 elif defaultGrade == MEAN_GRADE: 1327 return maxIndex/2.0 1328 elif defaultGrade == MAX_GRADE: 1329 return maxIndex 1330 elif defaultGrade == RIGHT_GRADE: 1331 return maxIndex+1 1332 else: 1333 raise ValueError("Unknown value for defaultGrade, got: %s"%defaultGrade)
1334
1335 -def makeIndexSequence(sequence, referenceSequence, isSeqNested=False, isRefNested=False, tieFunction=sorted, 1336 indexFunction=getValueIndex):
1337 """ 1338 Convert 'sequence' into a sequence of indices from the reference sequence, rather than objects 1339 Note: This retains the original sequence structure (so if the sequence was nested, it remains nested) 1340 @param sequence: Sequence to convert into indices 1341 @type sequence: list of list of object or list of object 1342 @param referenceSequence: Sequence with "true ordering" used to collect element indices 1343 @type referenceSequence: list of list of object or list of object 1344 @param isSeqNested: If True, sequence is nested (where nested elements share a grade) 1345 @type isSeqNested: bool 1346 @param isRefNested: If True, reference is nested (where nested elements share a grade) 1347 @type isRefNested: bool 1348 @param tieFunction: How to sort elements within the same grade (after applying indices), in form seq = f(seq) 1349 The 'sorted' function assumes they are incomparable and removes any inversions. 1350 By comparison, a reversed sort would apply an inversion penalty to elements in a grade. 1351 @type tieFunction: callable 1352 @param indexFunction: Function for calculating the index of an element, as f(element, referenceSequence, isRefNested) 1353 @type indexFunction: callable 1354 @return: The 'sequence' parameter tranformed into integer indices, based on the reference sequence ordering. 1355 @rtype: list of list of int or list of int 1356 """ 1357 if isSeqNested: 1358 indexSeq = [] 1359 for grade in sequence: 1360 if isinstance(grade, VALID_GRADE_CONTAINERS): 1361 indexSeq.append(tieFunction([indexFunction(element,referenceSequence,isRefNested) for element in grade])) 1362 else: 1363 indexSeq.append(indexFunction(grade,referenceSequence,isRefNested)) 1364 return indexSeq 1365 else: 1366 return [indexFunction(element,referenceSequence,isRefNested) for element in sequence]
1367
1368 -def makeFlatIndexSequence(sequence, referenceSequence, isSeqNested=False, isRefNested=False, tieFunction=sorted, 1369 indexFunction=getValueIndex):
1370 """ 1371 Convert 'sequence' into a sequence of indices from the reference sequence, 1372 Note: This flattens the sequence, if it was nested, for inversion counting. 1373 rather than objects, then flatten. 1374 @param sequence: Sequence to convert into indices 1375 @type sequence: list of list of object or list of object 1376 @param referenceSequence: Sequence with "true ordering" used to collect element indices 1377 @type referenceSequence: list of list of object or list of object 1378 @param isSeqNested: If True, sequence is nested (where nested elements share a grade) 1379 @type isSeqNested: bool 1380 @param isRefNested: If True, reference is nested (where nested elements share a grade) 1381 @type isRefNested: bool 1382 @param indexFunction: Function for calculating the index of an element, as f(element, referenceSequence, isRefNested) 1383 @type indexFunction: callable 1384 @return: Flat sequence of indices, where an element's index is its grade in the reference sequence 1385 @rtype: list of int 1386 """ 1387 if isSeqNested: 1388 indexSeq = [] 1389 for grade in sequence: 1390 if isinstance(grade, VALID_GRADE_CONTAINERS): 1391 indexSeq.extend(tieFunction([indexFunction(element,referenceSequence,isRefNested) for element in grade])) 1392 else: 1393 indexSeq.append(indexFunction(grade,referenceSequence,isRefNested)) 1394 return indexSeq 1395 else: 1396 return [indexFunction(element,referenceSequence,isRefNested) for element in sequence]
1397
1398 -def filterElements(sequence, excludedElements, isGraded=False):
1399 """ 1400 Remove elements from a nested list, if they are part of the excluded elements. 1401 Returns a copy of the list, in all cases 1402 @param sequence: Standard sequence ([x,y,...]) or nested sequence ([[x,..], [y,..],..]) 1403 @type sequence: list of object or list of list of object 1404 @param excludedElements: List of elements to be removed from the sequence 1405 @type excludedElements: list of object 1406 @return: Sequence (in same structure) with excluded elements removed 1407 @rtype: list of object or list of list of object 1408 """ 1409 if isGraded: 1410 filtered = [] 1411 for grade in sequence: 1412 if isinstance(grade, VALID_GRADE_CONTAINERS): 1413 filteredGrade = [x for x in grade if x not in excludedElements] 1414 if len(filteredGrade) > 0: 1415 filtered.append(filteredGrade) 1416 elif grade not in excludedElements: 1417 filtered.append(grade) 1418 return filtered 1419 else: 1420 return [x for x in sequence if x not in excludedElements]
1421