Package InversionTest ::
Module InversionAnalysis
|
|
1 """
2 Main module for inversion counting, hypothesis tests, and
3 sequence transformations. The exposed API of the package
4 relies on the implementations contained in this module.
5
6 Author: Benjamin D. Nye
7 License: Apache License V2.0
8 """
9 from itertools import groupby
10 from InversionTest.InversionDistribution import (maxInversions, varianceOfInversions,
11 similarityByInversions, correlationByInversions, rawInversionCounts)
12 from InversionTest.PolynomialOperations import polynomialExpansion, syntheticDivision
13 from InversionTest.StatisticalTests import (normalCDFFunction, signTest, wilcoxonSignedRankTest,
14 permutationMeanTest, permutationRankTest, GREATER_THAN_HYPOTHESIS)
20
21
22 EXACT_CDF = 'exact'
23 NORMAL_CDF = 'normal'
24 ADAPTIVE_CDF = 'adaptive'
25 SIMILARITY_METRIC = 'similarity'
26 CDF_TYPES = frozenset([EXACT_CDF, NORMAL_CDF, ADAPTIVE_CDF])
27
28
29 LEFT_GRADE = 'Left'
30 ZERO_GRADE = 'Zero'
31 MEAN_GRADE = 'Mean'
32 MAX_GRADE = 'Max'
33 RIGHT_GRADE = 'Right'
34 DEFAULT_GRADES = frozenset([LEFT_GRADE, ZERO_GRADE, MEAN_GRADE, MAX_GRADE, RIGHT_GRADE])
35
36 VALID_GRADE_CONTAINERS = (list, tuple, set)
39 """
40 An analysis of the inversion distribution of a graded poset
41 This can be used to analyze the reference sequence against
42 a sorted version of itself or against permutations of these
43 elements in a sequence passed in (which may be graded or not).
44 Note: When comparing two sequences, it should be noted that
45 these inversion measures are symmetric with respect to which
46 sequence is considered the reference versus the comparison.
47
48 Graded sequences are represented through nested lists, where
49 each subsequence contains the elements of a grade. So then,
50 the sequence: [[A,B,C], [1,2,3], [X,Y]] contains three grades,
51 where A, B, and C have rank 1, [1,2,3] have rank 2, etc. Elements
52 that share a grade cannot have inversions between them. The
53 elements can be any valid Python objects that have valid comparison
54 operations (including custom class instance). However, if any
55 of the elements cannot be hashed, the hashableElements flag must
56 be set false or problems will occur.
57
58 Note: Technically, while this code refers to ``permutations'', two
59 sequences with the same elements but different grade structures
60 are not technically permutations (though two with the same grade
61 structure, regardless of structure, are certainly permutations).
62 However, they are VERY close to permutations and may be considered
63 analogous to permutations where you can't see the order of certain
64 elements (i.e., censored observations).
65 """
66
67 APPROXIMATION_RANKS_BOUND = 100
68
69 - def __init__(self, sequence, isGraded=False, hashableElements=True, validate=False, defaultGrade=None):
70 """
71 Initialize the reference sequence
72 @param sequence: Reference sequence, for comparing permutations against
73 @type sequence: list of object or list of list of object
74 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
75 @type isGraded: bool
76 @param hashableElements: If True, elements can be hashed (allowing slightly faster performance)
77 @type hashableElements: bool
78 @param validate: If True, run basic validation on the sequence inside
79 @type validate: bool
80 """
81 self._sequence = sequence
82 self._isGraded = isGraded
83 self._hashableElements = hashableElements
84 if validate:
85 self.validate()
86 self._sortedRanks = False
87 self._hashRanks = None
88 self._numElements = self._calcNumElements(self._sequence, self._isGraded)
89 self._overloadedRanks = None
90 self._defaultGrade = defaultGrade
91
93 """
94 Get the number of grades in the sequence
95 @return: Number of grades
96 @rtype: int
97 """
98 return len(self._sequence)
99
101 """
102 Iterate over the grades/elements
103 @return: If nested, yield a grade (and all elements in it). If flat, yield an element.
104 @rtype: list of object or object
105 """
106 for x in self._sequence:
107 yield x
108
110 """
111 Iterate over the elements (flat iteration over grades)
112 @return: Yield all the elements in the poset
113 @rtype: list of object or object
114 """
115 if self._isGraded:
116 for grade in self._sequence:
117 if isinstance(grade, VALID_GRADE_CONTAINERS):
118 for x in grade:
119 yield x
120 else:
121 yield grade
122 else:
123 for x in self._sequence:
124 yield x
125
127 """
128 Return if this is a graded poset. If True, is graded.
129 """
130 return self._isGraded
131
133 """
134 Get the number of elements in this sequence
135 @return: Number of elements
136 @rtype: int
137 """
138 return self._numElements
139
141 """
142 Get the elements in grades with more than one element
143 @return: List of grades with their elements
144 @rtype: list of list of object
145 """
146 if self._overloadedRanks is None:
147 self._overloadedRanks = self._calcOverloadedRanks(self._sequence, self._isGraded)
148 return self._overloadedRanks
149
151 """
152 The maximum possible inversions, given the number of elements and
153 the number sharing each grade (e.g. ties). If sequence param is
154 given, the structure of the comparison sequence is considered also.
155 @param sequence: Comparison sequence
156 @type sequence: list of list of object or list of object or None
157 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
158 @type isGraded: bool
159 @return: Maximum possible number of inversions
160 @rtype: int
161 """
162 return self._calcGradedStatistic(maxInversions, sequence, isGraded)
163
164 - def mean(self, sequence=None, isGraded=False):
165 """
166 The mean of inversion across permutations, given the number of elements
167 and the number sharing each grade (e.g. ties). If sequence param is
168 given, the structure of the comparison sequence is considered also.
169 @param sequence: Comparison sequence
170 @type sequence: list of list of object or list of object or None
171 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
172 @type isGraded: bool
173 @return: Mean number of inversions across possible permutations
174 @rtype: int
175 """
176 return self.maxInversions(sequence, isGraded)/2.0
177
178 - def variance(self, sequence=None, isGraded=False):
179 """
180 The variance of inversions across permutations, given the number of elements
181 and the number sharing each grade (e.g. ties). If sequence param is
182 given, the structure of the comparison sequence is considered also.
183 @param sequence: Comparison sequence
184 @type sequence: list of list of object or list of object or None
185 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
186 @type isGraded: bool
187 @return: Variance of inversions across possible permutations
188 @rtype: int
189 """
190 return self._calcGradedStatistic(varianceOfInversions, sequence, isGraded)
191
192 - def stdDev(self, sequence=None, isGraded=False):
193 """
194 The standard deviation of inversions across permutations, given the elements
195 and the number sharing each grade (e.g. ties). If sequence param is not
196 given, the structure of the comparison sequence is considered also.
197 @param sequence: Comparison sequence
198 @type sequence: list of list of object or list of object or None
199 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
200 @type isGraded: bool
201 @return: Standard deviation of inversions across possible permutations
202 @rtype: int
203 """
204 return self.variance(sequence, isGraded)**0.5
205
206 - def inversions(self, sequence=None, isGraded=False):
207 """
208 The inversions between the reference sequence and the comparison sequence.
209 If the 'sequence' param is None, the comparison sequence is a fully sorted
210 form of the reference sequence.
211 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence)
212 @type sequence: list of list of object or list of object or None
213 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
214 @type isGraded: bool
215 @return: If 'sequence' given, number of inversions to turn it into reference sequence.
216 Else, the number of inversions to sort the reference sequence.
217 @rtype: int
218 """
219 if sequence:
220 if isinstance(sequence, GradedPosetSequence):
221 isGraded = sequence.isGraded()
222 sequence = sequence._sequence
223 if self._hashableElements:
224 if self._hashRanks is None:
225 self._populateHashRanks()
226 indexSeq = makeFlatIndexSequence(sequence, self._sequence, isGraded, self._isGraded,
227 indexFunction=self._getElementIndex)
228 else:
229 indexSeq = makeFlatIndexSequence(sequence, self._sequence, isGraded, self._isGraded)
230 return mergeSortInversionCount(indexSeq)[0]
231 else:
232 if self._isGraded:
233 if not self._sortedRanks:
234 for i, grade in enumerate(self._sequence):
235 if isinstance(grade, list):
236 grade.sort()
237 elif isinstance(grade, VALID_GRADE_CONTAINERS):
238 self._sequence[i] = sorted(grade)
239 self._sortedRanks = True
240 return mergeSortInversionCount([element for element in self.iterElements()])[0]
241 else:
242 return mergeSortInversionCount(self._sequence)[0]
243
244 - def similarity(self, sequence=None, isGraded=False):
245 """
246 The similarity of the reference sequence to a comparison sequence. If the
247 'sequence' param is None, the comparison sequence is a fully sorted form
248 of the reference sequence.
249 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence)
250 @type sequence: list of list of object or list of object or None
251 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
252 @type isGraded: bool
253 @return: The similarity of the reference sequence to the comparison sequence, in [0,1]
254 @rtype: float
255 """
256 inversions = self.inversions(sequence, isGraded)
257 maxInversions = self.maxInversions(sequence, isGraded)
258 return similarityByInversions(inversions, maxInversions)
259
261 """
262 The correlation of the reference sequence to a comparison sequence. If the
263 'sequence' param is None, the comparison sequence is a fully sorted form
264 of the reference sequence. This is just a rescaled form of similarity.
265 This should also be equivalent to a Kendall's Tau correlation under
266 standard conditions.
267 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence)
268 @type sequence: list of list of object or list of object or None
269 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
270 @type isGraded: bool
271 @return: The correlation of the reference sequence to the comparison sequence, in [-1,1]
272 @rtype: float
273 """
274 inversions = self.inversions(sequence, isGraded)
275 maxInversions = self.maxInversions(sequence, isGraded)
276 return correlationByInversions(inversions, maxInversions)
277
279 """
280 Calculate a CDF value for inversions between the reference sequence
281 abd a comparison sequence. If the 'sequence' param is None, the comparison
282 sequence is a fully sorted form of the reference sequence. This can
283 use either an exact distribution or a normal approximation.
284 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence)
285 @type sequence: list of list of object or list of object or None
286 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
287 @type isGraded: bool
288 @param cdfType: The CDF function to use. If ADAPTIVE_CDF, automatically switches to a normal
289 approximation when the # of elements exceeds a bound. If EXACT_CDF,
290 calculates the PDF exactly and sums over it. This gets slow and is
291 seldom necessary after about 100 elements. If NORMAL_CDF, uses the
292 mean and variance to generate a normal distribution. Except for
293 small numbers of elements or very large grades, this is advised.
294 @type cdfType: str
295 @return: P(x<=X) where x is a random variable for permutation inversions and X is
296 the number of inversions between the reference and comparison sequences.
297 @rtype: float
298 """
299 if cdfType == NORMAL_CDF:
300 return self.normalCDF(sequence, isGraded)
301 elif cdfType == EXACT_CDF:
302 return self.exactCDF(sequence, isGraded)
303 else:
304 maxInversions = self.maxInversions(sequence, isGraded)
305 if maxInversions >= self.APPROXIMATION_RANKS_BOUND:
306 return self.normalCDF(sequence, isGraded)
307 else:
308 return self.exactCDF(sequence, isGraded)
309
310 - def exactCDF(self, sequence=None, isGraded=False):
311 """
312 Calculate an exact CDF value for inversions between the reference sequence
313 abd a comparison sequence. If the 'sequence' param is None, the comparison
314 sequence is a fully sorted form of the reference sequence.
315 Note: The exact CDF calculation works very slowly as N gets greater than 100 elements.
316 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence)
317 @type sequence: list of list of object or list of object or None
318 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
319 @type isGraded: bool
320 @return: P(x<=X) where x is a random variable for permutation inversions and X is
321 the number of inversions between the reference and comparison sequences.
322 @rtype: float
323 """
324 inversions = self.inversions(sequence, isGraded)
325 maxInversions = self.maxInversions(sequence, isGraded)
326
327 if maxInversions == 0:
328 return None
329 elif inversions == maxInversions:
330 return 1.0
331
332 else:
333 if sequence:
334 if isinstance(sequence, GradedPosetSequence):
335 isGraded = sequence.isGraded()
336 otherNumElements = sequence.getNumElements()
337 otherOverloadedRanks = sequence.getOverloadedRanks()
338 sequence = sequence._sequence
339 else:
340 otherNumElements = self._calcNumElements(sequence, isGraded)
341 otherOverloadedRanks = self._calcOverloadedRanks(sequence, isGraded)
342
343 if otherNumElements != self._numElements:
344 overloadedRanks = self._calcOverloadedRanks(self._sequence, self._isGraded, sequence, isGraded)
345 else:
346 overloadedRanks = self.getOverloadedRanks()
347 intersectingRanks = self._calcRankIntersections(overloadedRanks, otherOverloadedRanks)
348 numElements = otherNumElements
349 else:
350 overloadedRanks = self.getOverloadedRanks()
351 otherOverloadedRanks = []
352 intersectingRanks = []
353 numElements = self.getNumElements()
354 inversionDistribution = rawInversionCounts(numElements)
355 for intersections in intersectingRanks:
356 intersectionDistr = rawInversionCounts(len(intersections))
357 inversionDistribution = polynomialExpansion(inversionDistribution, intersectionDistr)
358 for overloadedRank in overloadedRanks + otherOverloadedRanks:
359 overloadedDistr = rawInversionCounts(len(overloadedRank))
360 inversionDistribution, remainder = syntheticDivision(inversionDistribution, overloadedDistr)
361
362 if inversions > maxInversions/2.0:
363 return 1.0-float(sum(inversionDistribution[inversions+1:]))/sum(inversionDistribution)
364 else:
365 return float(sum(inversionDistribution[:inversions+1]))/sum(inversionDistribution)
366
367 - def normalCDF(self, sequence=None, isGraded=False):
368 """
369 Calculate an approximate CDF value for inversions between the reference sequence
370 abd a comparison sequence. If the 'sequence' param is None, the comparison
371 sequence is a fully sorted form of the reference sequence.
372 Note: The normal distribution gets quite accurate by N > 20 and should be used for larger
373 sequences.
374 @param sequence: Comparison sequence (If None, equivalent to a sorted reference sequence)
375 @type sequence: list of list of object or list of object or None
376 @param isGraded: If true, sequence consists of subsequences containing each grade. Else, flat list.
377 @type isGraded: bool
378 @return: P(x<=X) where x is a random variable for permutation inversions and X is
379 the number of inversions between the reference and comparison sequences.
380 @rtype: float
381 """
382 mean = self.mean(sequence, isGraded)
383 stdDev = self.stdDev(sequence, isGraded)
384 inversions = self.inversions(sequence, isGraded)
385 if mean == 0:
386 return None
387 else:
388 return normalCDFFunction(inversions, mean, stdDev)
389
392 """
393 Perform a Sign test on either the similarity or the complements of the CDF values,
394 which are a measure of the match between the each sequence and the reference seq.
395 This has the advantage of calculating based on the similarity, which is quicker than
396 calculating the CDF probability of a particular inversion counts. If two sequences
397 are provided, this performs a two sample paired test.
398 @param sequences: Multiple sequences that are permutations of the elements of this sequence
399 @type sequences: list of list of object or list of list of list of object
400 @param sequences2: Multiple sequences that are permutations (second sample)
401 @type sequences2: list of list of object or list of list of list of object
402 @param isGraded: If True, each sequence is nested (where nested elements share a grade)
403 @type isGraded: bool
404 @param measure: The measure to calculate for each sequence, either SIMILARITY_METRIC or from CDF_TYPES set
405 @type measure: str
406 @param mu: Assumed mean of the sign test, to allow assymetric testing
407 @type mu: float
408 @param alternative: Alternate hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set
409 @type alternative: str
410 @return: Probability of the null hypothesis
411 @rtype: float
412 """
413 if mu >= 1 or mu <= 0:
414 raise ValueError("Sign test mu should be in [0,1] but got: %s"%mu)
415 return self._comparisonTest(signTest, sequences, sequences2, isGraded, measure, mu=mu, alternative=alternative)
416
419 """
420 Perform a Wilcoxon Rank Sum test on the complements of the CDF values,
421 which are a measure of the match between the each sequence and the reference seq
422 Mu cannot be set on this test, as it would violate the assumption of symmetry.
423 If two sequences are provided, this performs a two-sample paired test.
424 @param sequences: Multiple sequences that are permutations of the elements of this sequence
425 @type sequences: list of list of object or list of list of list of object
426 @param sequences2: Multiple sequences that are permutations (second sample)
427 @type sequences2: list of list of object or list of list of list of object
428 @param isGraded: If True, each sequence is nested (where nested elements share a grade)
429 @type isGraded: bool
430 @param cdfType: The type of CDF to calculate for the inversion counts, from CDF_TYPES set
431 @type cdfType: str
432 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set
433 @type alternative: str
434 @return: Probability of the null hypothesis
435 @rtype: float
436 """
437 if cdfType not in CDF_TYPES:
438 raise TypeError("Wilcoxon Test received invalid CDF Type. Must be one of: %s"%(CDF_TYPES,))
439 if sequences2 is None:
440 return self._comparisonTest(wilcoxonSignedRankTest, sequences, sequences2, isGraded,
441 cdfType, mu=0.5, alternative=alternative)
442 else:
443 return self._comparisonTest(wilcoxonSignedRankTest, sequences, sequences2, isGraded,
444 cdfType, alternative=alternative)
445
449 """
450 Perform a Permutation or Monte Carlo Mean Difference test on some measure
451 (similarity or 1-CDF) for the match between the each sequence and the reference seq.
452 This requires two samples, as these are shuffled for form the permutations.
453 @param sequences: Multiple sequences that are permutations of the elements of this sequence
454 @type sequences: list of list of object or list of list of list of object
455 @param sequences2: Multiple sequences that are permutations (second sample)
456 @type sequences2: list of list of object or list of list of list of object
457 @param isGraded: If True, each sequence is nested (where nested elements share a grade)
458 @type isGraded: bool
459 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set
460 @type measure: str
461 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set
462 @type alternative: str
463 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination
464 @type pValue: float
465 @param iterations: The max number of iterations to run for Monte Carlo
466 @type iterations: int
467 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule
468 @type useStoppingRule: bool
469 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value.
470 For values higher than this, use a Monte Carlo approximation.
471 @type maxExactN: int
472 @return: Probability of the null hypothesis
473 @rtype: float
474 """
475 return self._comparisonTest(permutationMeanTest, sequences, sequences2, isGraded, measure,
476 alternative=alternative, pValue=pValue, iterations=iterations,
477 maxExactN=maxExactN)
478
482 """
483 Perform a Permutation or Monte Carlo rank test on a measure (similarity or
484 1-CDF) for the match between the each sequence and the reference seq.
485 This requires two samples, as these are shuffled for form the permutations.
486 @param sequences: Multiple sequences that are permutations of the elements of this sequence
487 @type sequences: list of list of object or list of list of list of object
488 @param sequences2: Multiple sequences that are permutations (second sample)
489 @type sequences2: list of list of object or list of list of list of object
490 @param isGraded: If True, each sequence is nested (where nested elements share a grade)
491 @type isGraded: bool
492 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set
493 @type measure: str
494 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set
495 @type alternative: str
496 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination
497 @type pValue: float
498 @param iterations: The max number of iterations to run for Monte Carlo
499 @type iterations: int
500 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule
501 @type useStoppingRule: bool
502 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value.
503 For values higher than this, use a Monte Carlo approximation.
504 @type maxExactN: int
505 @return: Probability of the null hypothesis
506 @rtype: float
507 """
508 return self._comparisonTest(permutationRankTest, sequences, sequences2, isGraded, measure,
509 alternative=alternative, pValue=pValue, iterations=iterations,
510 maxExactN=maxExactN)
511
513 """
514 Perform a hypothesis test on either the similarity or the complements of the CDF values,
515 which are a measure of the match between the each sequence and the reference seq.
516 If two sequences are provided, this performs a two sample test of an appropriate type.
517 @param test: The hypothesis test function, in the form: f(sample1, sample1=None, **kwds)
518 @type test: callable
519 @param sequences: Multiple sequences that are permutations of the elements of this sequence
520 @type sequences: list of list of object or list of list of list of object
521 @param sequences2: Multiple sequences that are permutations (second sample)
522 @type sequences2: list of list of object or list of list of list of object
523 @param isGraded: If True, each sequence is nested (where nested elements share a grade)
524 @type isGraded: bool
525 @param measure: The measure to calculate for each sequence, either SIMILARITY_METRIC or from CDF_TYPES set
526 @type measure: str
527 @param kwds: Additional test-specific parameters passed to the test
528 @type kwds: dict
529 @return: Probability of the null hypothesis
530 @rtype: float
531 """
532 if measure == SIMILARITY_METRIC:
533 values = [self.similarity(seq, isGraded) for seq in sequences]
534 else:
535 values = [1.0-self.cdf(seq, isGraded, measure) for seq in sequences]
536 if sequences2 is None:
537 return test(values, **kwds)
538 else:
539 if measure == SIMILARITY_METRIC:
540 values2 = [self.similarity(seq, isGraded) for seq in sequences2]
541 else:
542 values2 = [1.0-self.cdf(seq, isGraded, measure) for seq in sequences2]
543 return test(values, values2, **kwds)
544
545 @classmethod
547 """
548 Calculate the number of elements in the sequence.
549 This is trivial for a flat sequence and quick for a graded one.
550 @param sequence: The sequence to count elements in
551 @type sequence: list of list of object or list of object
552 @param isGraded: If True, sequence is nested (where nested elements share a grade)
553 @type isGraded: bool
554 @return: The number of elements in the sequence (N)
555 @rtype: bool
556 """
557 if isGraded:
558 aSum = 0
559 for elements in sequence:
560 if isinstance(elements, VALID_GRADE_CONTAINERS):
561 aSum += len(elements)
562 else:
563 aSum += 1
564 return aSum
565 else:
566 return len(sequence)
567
569 """
570 Calculate a series length statistic, adjusting for incomparable elements.
571 This assumes that the operation can be evaluated on the total number of elements,
572 then the operation is evaluated on the subsets of incomparable elements, which
573 are subtracted off from the full-set operation value. If the 'sequence' param is
574 provided, the grades for this sequence is subtracted off also (without double-counting).
575 The max, mean, and variance can all be calculated using this approach.
576 @param operation: The operation to evaluate, in the form 'numeric = f(int)'
577 @type operation: callable
578 @param sequence: A comparison sequence to evaluate grades for also. If None, ignored.
579 @type sequence: list of list of object or list of object
580 @param isGraded: If True, sequence is nested (where nested elements share a grade)
581 @type isGraded: bool
582 @return: Value of the graded statistic
583 @rtype: int or float
584 """
585 if sequence:
586 if isinstance(sequence, GradedPosetSequence):
587 isGraded = sequence.isGraded()
588 otherNumElements = sequence.getNumElements()
589 otherOverloadedRanks = sequence.getOverloadedRanks()
590 sequence = sequence._sequence
591 else:
592 otherNumElements = self._calcNumElements(sequence, isGraded)
593 otherOverloadedRanks = self._calcOverloadedRanks(sequence, isGraded)
594
595 if otherNumElements != self._numElements:
596 overloadedRanks = self._calcOverloadedRanks(self._sequence, self._isGraded, sequence, isGraded)
597 else:
598 overloadedRanks = self.getOverloadedRanks()
599 intersectingRanks = self._calcRankIntersections(overloadedRanks, otherOverloadedRanks)
600 value = operation(min(otherNumElements, self._numElements))
601 value -= sum([operation(len(elements)) for elements in overloadedRanks])
602 value -= sum([operation(len(elements)) for elements in otherOverloadedRanks])
603 value += sum([operation(len(elements)) for elements in intersectingRanks])
604 return value
605 else:
606 value = operation(self._numElements)
607 value -= sum([operation(len(elements)) for elements in self.getOverloadedRanks()])
608 return value
609
611 """
612 Calculate the grades with more than one element (overloaded grades).
613 If two sequences are provided, this only calculates overloaded ranks for elements
614 that are contained in both sequences (in case one is missing elements)
615 Note: If a sequence is not nested, it has no overloaded ranks by definition
616 Note 2: The 'otherSequence' is only needed to handle missing elements
617 @param sequence: Sequence to calculate the overloaded grades
618 @type sequence: list of list of object or list of object
619 @param isGraded: If True, sequence is nested (where nested elements share a grade)
620 @type isGraded: bool
621 @param otherSequence: Optionally, a reference sequence to only count elements in both sequences
622 @type otherSequence: list of list of object or list of object
623 @param isOtherGraded: If True, sequence is nested (where nested elements share a grade)
624 @type isOtherGraded: bool
625 @return: List of subsets of elements that cannot be compared against each other
626 @rtype: List of list of object
627 """
628 if isinstance(sequence, GradedPosetSequence):
629 isGraded = sequence.isGraded()
630 sequence = sequence._sequence
631 if otherSequence and isinstance(otherSequence, GradedPosetSequence):
632 isOtherGraded = otherSequence.isGraded()
633 otherSequence = otherSequence._sequence
634 if isGraded:
635 if otherSequence is None:
636 overloadedRanks = [elements for elements in sequence
637 if isinstance(elements, VALID_GRADE_CONTAINERS)
638 and len(elements) > 1]
639 else:
640
641 if isOtherGraded:
642 if self._hashableElements:
643 flatOther = set()
644 for elements in otherSequence:
645 if isinstance(elements, VALID_GRADE_CONTAINERS):
646 flatOther |= set(elements)
647 else:
648 flatOther.add(elements)
649 else:
650 flatOther = []
651 for elements in otherSequence:
652 if isinstance(elements, VALID_GRADE_CONTAINERS):
653 flatOther.extend(elements)
654 else:
655 flatOther.append(elements)
656 else:
657 if self._hashableElements:
658 flatOther = set(otherSequence)
659 else:
660 flatOther = otherSequence
661
662 overloadedRanks = []
663 for elements in sequence:
664 if isinstance(elements, VALID_GRADE_CONTAINERS) and len(elements) > 1:
665 filteredElements = [element for element in elements if element in flatOther]
666 if len(filteredElements) > 1:
667 overloadedRanks.append(filteredElements)
668 else:
669 overloadedRanks = []
670 return overloadedRanks
671
673 """
674 Calculate the intersection of overloaded ranks between sequences.
675 This basically calculates "overloaded overloads" of ranks.
676 @param overloadedRanks: First set of overloaded (len > 1) ranks
677 @type overloadedRanks: list of list of object
678 param overloadedRanks2: Second set of overloaded (len > 1) ranks
679 @type overloadedRanks2: list of list of object
680 @return: List of intersections between overloaded grades
681 @rtype: list of list of object
682 """
683 intersections = []
684 if self._hashableElements:
685 intersections= [list(set.intersection(set(elements1), set(elements2)))
686 for elements1 in overloadedRanks
687 for elements2 in overloadedRanks2]
688 else:
689 intersections= [filter(lambda x: x in elements1, elements2)
690 for elements1 in overloadedRanks
691 for elements2 in overloadedRanks2]
692 intersections = [x for x in intersections if len(x) > 1]
693 return intersections
694
696 """
697 Populate hash table that reports the grade of any element.
698 Slight hit to memory, but significant boost in speed for
699 comparisons.
700 """
701 sequence = self._sequence
702 if self._isGraded:
703 self._hashRanks = {}
704 for i, elements in enumerate(sequence):
705 if isinstance(elements, VALID_GRADE_CONTAINERS):
706 for e in elements:
707 self._hashRanks[e] = i
708 else:
709 self._hashRanks[elements] = i
710 else:
711 self._hashRanks = dict(((x,i) for i, x in enumerate(sequence)))
712
714 """
715 Get the index of the given value in the sequence
716 Note: This hashes element indices, if possible
717 @param element: Value to get the position for
718 @type element: object
719 @param sequence: Nested sequence which is used to determine sorting value of elements
720 @type sequence: list of list of object
721 @param isGraded: If True, sequence is graded (nested). Else, flat sequence.
722 @type isGraded: bool
723 @return: Sorting key for the value
724 @rtype: int
725 """
726 if self._hashableElements:
727 value = self._hashRanks.get(element, None)
728 if value is not None:
729 return value
730 else:
731 return getDefaultIndex(element, len(self._sequence), self._defaultGrade)
732 else:
733 return getValueIndex(element, sequence, isGraded)
734
735
737 """
738 Check that this sequence is valid, so that it has
739 no repeated elements, that elements are hashable
740 if sequence is hashable, and that a graded sequence
741 contains iterated grades.
742 """
743 self.validateSequence(self._sequence, self._isGraded, self._hashableElements)
744
745 @classmethod
747 """
748 Check that a sequence is valid, so that it has
749 no repeated elements, and that elements are hashable
750 if sequence is hashable.
751 """
752 if hasattr(sequence, '__iter__'):
753 if isGraded:
754 elements = []
755 for grade in sequence:
756 if isinstance(grade, VALID_GRADE_CONTAINERS):
757 elements.extend(grade)
758 else:
759 elements.append(grade)
760 else:
761 elements = list(sequence)
762
763 if hashableElements:
764 unhashables = []
765 for element in elements:
766 if not hasattr(element, '__hash__') or element.__hash__ is None:
767 unhashables.append(element)
768 else:
769 hash(element)
770
771 observedElements = []
772 repeatedElements = []
773 for element in elements:
774 if element in observedElements:
775 repeatedElements.append(element)
776 else:
777 observedElements.append(element)
778 errStr = ''
779 if len(unhashables) > 0:
780 unhashableStr = ', '.join([str(unhashable) for unhashable in unhashables])
781 errStr +="The following elements were unhashable in a hashable poset:\n%s\n"%(unhashableStr,)
782 if len(repeatedElements) > 0:
783 errStr += "The following elements were repeated in the poset:\n%s"%(repeatedElements,)
784 else:
785 errStr = 'Sequence was not iterable.'
786 if len(errStr) > 0:
787 raise InvalidGradedPosetError(errStr)
788
789
790
791
792
793 -def inversionCountMax(seq, seq2=None, isGraded=False, hashableElements=True):
794 """
795 Calculate the maximum possible number of inversions, based
796 on the sequence (# of elements and grade structure). If
797 the second optional sequence is provided, its grade structure
798 is also considered.
799 Note: This only depends on the structure of the sequence(s), not their order
800 @param seq: Sequence to get the maximum possible inversions for any permutation
801 @type seq: list of object or list of list of object
802 @param seq2: A comparison sequence for the sequence 'seq'
803 @type seq2: list of object or list of list of object
804 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade)
805 @type isGraded: bool
806 @param hashableElements: If True, elements can be hashed (improves speed)
807 @type hashableElements: bool
808 @return: Maximum inversions
809 @rtype: int
810 """
811 if isinstance(seq, GradedPosetSequence):
812 refSeq = seq
813 else:
814 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements)
815 return refSeq.maxInversions(seq2, isGraded)
816
818 """
819 Calculate the mean inversions across permutations, based
820 on the sequence (# of elements and grade structure). If
821 the second optional sequence is provided, its grade structure
822 is also considered.
823 Note: This only depends on the structure of the sequence(s), not their order
824 @param seq: Sequence to get the mean inversions for all possible permutations
825 @type seq: list of object or list of list of object
826 @param seq2: A comparison sequence for the sequence 'seq'
827 @type seq2: list of object or list of list of object
828 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade)
829 @type isGraded: bool
830 @param hashableElements: If True, elements can be hashed (improves speed)
831 @type hashableElements: bool
832 @return: Mean inversions
833 @rtype: float
834 """
835 if isinstance(seq, GradedPosetSequence):
836 refSeq = seq
837 else:
838 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements)
839 return refSeq.mean(seq2, isGraded)
840
842 """
843 Calculate the variance of inversions across permutations, based
844 on the sequence (# of elements and grade structure). If
845 the second optional sequence is provided, its grade structure
846 is also considered.
847 Note: This only depends on the structure of the sequence(s), not their order
848 @param seq: Sequence to get the variance of inversions for all possible permutations
849 @type seq: list of object or list of list of object
850 @param seq2: A comparison sequence for the sequence 'seq'
851 @type seq2: list of object or list of list of object
852 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade)
853 @type isGraded: bool
854 @param hashableElements: If True, elements can be hashed (improves speed)
855 @type hashableElements: bool
856 @return: Variance of inversions
857 @rtype: float
858 """
859 if isinstance(seq, GradedPosetSequence):
860 refSeq = seq
861 else:
862 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements)
863 return refSeq.variance(seq2, isGraded)
864
928
935 """
936 Calculate the number of inversions requried to sort sequence via mergesort.
937 This function only works on a flat list.
938 Note: Relies on python sort to do actual merging (but not the counting).
939 More efficient this way, as the Python version is in C.
940 @param sequence: Sequence to sort and count inversions
941 @type sequence: list
942 @return: Number of inversions, sorted list
943 @rtype: int, list
944 """
945 if len(sequence) <= 1:
946 return 0, sequence
947 else:
948 firstHalf = sequence[:int(len(sequence)/2)]
949 secondHalf = sequence[int(len(sequence)/2):]
950 count1, firstHalf = mergeSortInversionCount(firstHalf)
951 count2, secondHalf = mergeSortInversionCount(secondHalf)
952 firstN = len(firstHalf)
953 secondN = len(secondHalf)
954 secondHalfEnd = secondN
955 count3 = count1
956 count3+= count2
957
958
959 for i in xrange(firstN-1, -1, -1):
960 x = firstHalf[i]
961 inversionFound = False
962 for j in xrange(secondHalfEnd-1,-1,-1):
963 if x > secondHalf[j]:
964 inversionFound = True
965 break
966 if inversionFound:
967 secondHalfEnd = j+1
968 count3 += j+1
969 mergeList = firstHalf + secondHalf
970 mergeList.sort()
971 return count3, mergeList
972
973 -def inversionCount(seq, seq2=None, isGraded=False, hashableElements=True):
974 """
975 Calculate the inversions between seq and seq2. If seq2 is None, this works
976 equivalently to seq2 being a sorted version of seq
977 @param seq: Sequence to calculate the inversions for
978 @type seq: list of object or list of list of object
979 @param seq2: A comparison sequence for the sequence 'seq'
980 @type seq2: list of object or list of list of object
981 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade)
982 @type isGraded: bool
983 @param hashableElements: If True, elements can be hashed (improves speed)
984 @type hashableElements: bool
985 @return: Number of inversions
986 @rtype: int
987 """
988 if isinstance(seq, GradedPosetSequence):
989 refSeq = seq
990 else:
991 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements)
992 return refSeq.inversions(seq2, isGraded)
993
995 """
996 Calculate the similarity between sequences: 1-inversions/maxInversions
997 If seq2 is None, this works equivalently to seq2 being a sorted version of seq
998 @param seq: Sequence to get the similarity for
999 @type seq: list of object or list of list of object
1000 @param seq2: A comparison sequence for the sequence 'seq'
1001 @type seq2: list of object or list of list of object
1002 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade)
1003 @type isGraded: bool
1004 @param hashableElements: If True, elements can be hashed (improves speed)
1005 @type hashableElements: bool
1006 @return: Similarity of sequences by inversion counts, in [0,1], where 1 is a perfect match
1007 @rtype: float
1008 """
1009 if isinstance(seq, GradedPosetSequence):
1010 refSeq = seq
1011 else:
1012 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements)
1013 return refSeq.similarity(seq2, isGraded)
1014
1021 """
1022 Get the cummulative distribution function probability for the number
1023 of inversions between seq and seq2. If seq2 is None, this works
1024 equivalently to seq2 being a sorted version of seq
1025 @param seq: Sequence to calculate the number of inversions
1026 @type seq: list of object or list of list of object
1027 @param seq2: A comparison sequence for the sequence 'seq'
1028 @type seq2: list of object or list of list of object
1029 @param isGraded: If True, both seq and seq2 are nested (where nested elements share a grade)
1030 @type isGraded: bool
1031 @param hashableElements: If True, elements can be hashed (improves speed)
1032 @type hashableElements: bool
1033 @return: P(X<=x) where X is a R.V. based on all permutations and
1034 x is the actual number of inversions between seq and seq2
1035 @rtype: float
1036 """
1037 if isinstance(seq, GradedPosetSequence):
1038 refSeq = seq
1039 else:
1040 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements)
1041 return refSeq.cdf(seq2, isGraded, cdfType)
1042
1046 """
1047 A sign test based on the values of some measure. This measure can be either similarity
1048 or the complement of the cdf (1-cdf), both of which are in the range [0,1] where
1049 0 is the worst match and 1 is the best match. For each sequence in 'orderings',
1050 the measure is calculated comparing seq against the permutation. This creates a list
1051 of measure values that the sign test can be applied to. If two sets of orderings
1052 are provided, this runs a two-sample test and ignores mu.
1053 Note: A calculation based on similarity will provide different results than one based
1054 on CDF if mu!=0.5. When mu=0.5, both are identical.
1055 @param orderings: One or more comparison sequences (first sample)
1056 @type orderings: list of list of object or list of list of list of object
1057 @param orderings2: One or more comparison sequences (second sample)
1058 @type orderings2: list of list of object or list of list of list of object
1059 @param seq: Reference sequence to calculate the number of inversions against
1060 @type seq: list of object or list of list of object
1061 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade)
1062 @type isGraded: bool
1063 @param hashableElements: If True, elements can be hashed (improves speed)
1064 @type hashableElements: bool
1065 @param measure: The measure to calculate for each sequence, either SIMILARITY_METRIC or from CDF_TYPES set
1066 @type measure: str
1067 @param mu: Assumed mean of the sign test, to allow assymetric testing
1068 @type mu: float
1069 @param alternative: Alternate hypothesis for the test, from the TEST_HYPOTHESES set ('greater', 'less', 'two.sided')
1070 @type alternative: str
1071 @return: Probability of the null hypothesis
1072 @rtype: float
1073 """
1074 if mu >= 1 or mu <= 0:
1075 raise ValueError("Sign test mu should be in [0,1] but got: %s"%mu)
1076 return _inversionComparisonTest(signTest, GradedPosetSequence.signTest, orderings, orderings2, seq,
1077 isGraded, hashableElements, measure, mu=mu, alternative=alternative)
1078
1082 """
1083 A Wilcoxon rank sum test based on the complement of the CDF (1-cdf).
1084 For each ordering in 'orderings', 1-cdf is calculated comparing seq
1085 against the permutation. This creates a list of probabilities of getting
1086 a worse match by chance (one probability for each comparison sequence).
1087 @param orderings: One or more comparison sequences (first sample)
1088 @type orderings: list of list of object or list of list of list of object
1089 @param orderings2: One or more comparison sequences (second sample)
1090 @type orderings2: list of list of object or list of list of list of object
1091 @param seq: Sequence to calculate the number of inversions
1092 @type seq: list of object or list of list of object
1093 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade)
1094 @type isGraded: bool
1095 @param hashableElements: If True, elements can be hashed (improves speed)
1096 @type hashableElements: bool
1097 @param cdfType: The type of CDF to calculate for the inversion counts, from CDF_TYPES set
1098 @type cdfType: str
1099 @param alternative: Alternate hypothesis for the test, from the TEST_HYPOTHESES set ('greater', 'less', 'two.sided')
1100 @type alternative: str
1101 @return: Probability of the null hypothesis
1102 @rtype: float
1103 """
1104 if cdfType not in CDF_TYPES:
1105 raise TypeError("Wilcoxon Test received invalid CDF Type. Must be one of: %s"%(CDF_TYPES,))
1106 if orderings2 is None and seq is None:
1107 return _inversionComparisonTest(wilcoxonSignedRankTest, GradedPosetSequence.wilcoxonTest,
1108 orderings, orderings2, seq, isGraded, hashableElements, cdfType,
1109 mu=0.5, alternative=alternative)
1110 else:
1111 return _inversionComparisonTest(wilcoxonSignedRankTest, GradedPosetSequence.wilcoxonTest,
1112 orderings, orderings2, seq, isGraded, hashableElements, cdfType,
1113 alternative=alternative)
1114
1115
1116 -def inversionPermutationMeanTest(orderings, orderings2, seq=None, isGraded=False, hashableElements=True,
1117 measure=ADAPTIVE_CDF, alternative=GREATER_THAN_HYPOTHESIS,
1118 pValue=0.95, iterations=100000, useStoppingRule=True, maxExactN=7):
1119 """
1120 Perform a Permutation or Monte Carlo Mean Difference Test on some measure
1121 (similarity or 1-CDF) for the match between the each sequence and the reference seq.
1122 This requires two samples, as these are shuffled for form the permutations.
1123 @param orderings: One or more comparison sequences (first sample)
1124 @type orderings: list of list of object or list of list of list of object
1125 @param orderings2: One or more comparison sequences (second sample)
1126 @type orderings2: list of list of object or list of list of list of object
1127 @param seq: Sequence to calculate the number of inversions
1128 @type seq: list of object or list of list of object
1129 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade)
1130 @type isGraded: bool
1131 @param hashableElements: If True, elements can be hashed (improves speed)
1132 @type hashableElements: bool
1133 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set
1134 @type measure: str
1135 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set
1136 @type alternative: str
1137 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination
1138 @type pValue: float
1139 @param iterations: The max number of iterations to run for Monte Carlo
1140 @type iterations: int
1141 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule
1142 @type useStoppingRule: bool
1143 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value.
1144 For values higher than this, use a Monte Carlo approximation.
1145 @type maxExactN: int
1146 @return: Probability of the null hypothesis
1147 @rtype: float
1148 """
1149 return _inversionComparisonTest(permutationMeanTest, GradedPosetSequence.permutationMeanTest,
1150 orderings, orderings2, seq, isGraded, hashableElements, measure,
1151 alternative=alternative, pValue=pValue, iterations=iterations,
1152 maxExactN=maxExactN)
1153
1154 -def inversionPermutationRankTest(orderings, orderings2, seq=None, isGraded=False, hashableElements=True,
1155 measure=ADAPTIVE_CDF, alternative=GREATER_THAN_HYPOTHESIS,
1156 pValue=0.95, iterations=100000, useStoppingRule=True, maxExactN=7):
1157 """
1158 Perform a Permutation or Monte Carlo Rank on some measure (similarity or 1-CDF)
1159 for the match between the each sequence and the reference seq.
1160 This requires two samples, as these are shuffled for form the permutations.
1161 @param orderings: One or more comparison sequences (first sample)
1162 @type orderings: list of list of object or list of list of list of object
1163 @param orderings2: One or more comparison sequences (second sample)
1164 @type orderings2: list of list of object or list of list of list of object
1165 @param seq: Sequence to calculate the number of inversions
1166 @type seq: list of object or list of list of object
1167 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade)
1168 @type isGraded: bool
1169 @param hashableElements: If True, elements can be hashed (improves speed)
1170 @type hashableElements: bool
1171 @param measure: The type of CDF to calculate for the inversion counts, from CDF_TYPES set
1172 @type measure: str
1173 @param alternative: Alternative hypothesis for the test, from StatisticalTests.TEST_HYPOTHESES set
1174 @type alternative: str
1175 @param pValue: The p-Value for the test to confirm, used for Monte Carlo early termination
1176 @type pValue: float
1177 @param useStoppingRule: If True, use version of MonteCarlo with an unbiased early stopping rule
1178 @type useStoppingRule: bool
1179 @param iterations: The max number of iterations to run for Monte Carlo
1180 @type iterations: int
1181 @param maxExactN: The largest N=len(x)+len(y) to calculate an exact test value.
1182 For values higher than this, use a Monte Carlo approximation.
1183 @type maxExactN: int
1184 @return: Probability of the null hypothesis
1185 @rtype: float
1186 """
1187 return _inversionComparisonTest(permutationRankTest, GradedPosetSequence.permutationRankTest,
1188 orderings, orderings2, seq, isGraded, hashableElements, measure,
1189 alternative=alternative, pValue=pValue, iterations=iterations,
1190 maxExactN=maxExactN)
1191
1195 """
1196 A hypothesis test for comparing the similarity of some orderings based on some
1197 test function (test) and its equivalent in the GradedPosetSequence class (classTest).
1198 @param test: A test function, in the form, f(orderings, orderings2, **kwds)
1199 @type test: callable
1200 @param classTest: An unbound method for the GradedPosetSequence class, in the form
1201 f(self, orderings, orderings2, isGraded, measure, **kwds)
1202 @type classTest: callable
1203 @param orderings: One or more comparison sequences (first sample)
1204 @type orderings: list of list of object or list of list of list of object
1205 @param orderings2: One or more comparison sequences (second sample)
1206 @type orderings2: list of list of object or list of list of list of object
1207 @param seq: Sequence to calculate the number of inversions
1208 @type seq: list of object or list of list of object
1209 @param isGraded: If True, both 'seq' and sequences in 'orderings' are nested (where nested elements share a grade)
1210 @type isGraded: bool
1211 @param hashableElements: If True, elements can be hashed (improves speed)
1212 @type hashableElements: bool
1213 @param measure: The type of CDF or similarity to calculate for the inversion counts
1214 @type measure: str
1215 @return: Probability of the null hypothesis
1216 @rtype: float
1217 """
1218 if seq is None:
1219 f = lambda x: GradedPosetSequence(x, isGraded=isGraded, hashableElements=hashableElements)
1220 if measure == SIMILARITY_METRIC:
1221 mFunct = lambda x: x.similarity(isGraded=isGraded)
1222 else:
1223 mFunct = lambda x: 1.0-x.cdf(isGraded=isGraded, cdfType=measure)
1224 vals = [mFunct(f(x)) for x in orderings]
1225 if orderings2 is None:
1226 return test(vals, **kwds)
1227 else:
1228 vals2 = [mFunct(f(x)) for x in orderings2]
1229 return test(vals, vals2, **kwds)
1230 else:
1231 if isinstance(seq, GradedPosetSequence):
1232 refSeq = seq
1233 else:
1234 refSeq = GradedPosetSequence(seq, isGraded=isGraded, hashableElements=hashableElements)
1235 if classTest is None:
1236 return refSeq._comparisonTest(test, orderings, orderings2, isGraded, measure, **kwds)
1237 else:
1238 return classTest(refSeq, orderings, orderings2, isGraded, measure, **kwds)
1239
1243 if measure == SIMILARITY_METRIC:
1244 mFunct = lambda x,y: x.similarity(y, isGraded=isGraded)
1245 else:
1246 mFunct = lambda x,y: 1.0-x.cdf(y, isGraded=isGraded, cdfType=measure)
1247 group1 = []
1248 crossGroup = []
1249 for i, x in enumerate(orderings):
1250 x = GradedPosetSequence(x, isGraded=isGraded, hashableElements=hashableElements)
1251 group1.extend([mFunct(x, orderings[j]) for j in xrange(i+1,len(orderings))])
1252 crossGroup.extend([mFunct(x, y) for y in orderings2])
1253 group2 = []
1254 for i, x in enumerate(orderings2):
1255 x = GradedPosetSequence(x, isGraded=isGraded, hashableElements=hashableElements)
1256 group2.extend([mFunct(x, orderings2[j]) for j in xrange(i+1,len(orderings2))])
1257
1282
1283 -def getValueIndex(value, sequence, isGraded=False, defaultGrade=None):
1284 """
1285 Get the index of the given value in the sequence
1286 @param value: Value to get the position for
1287 @type value: object
1288 @param sequence: Nested sequence which is used to determine sorting value of elements
1289 @type sequence: list of list of object
1290 @return: Sorting key for the value
1291 @rtype: int
1292 """
1293 if isGraded:
1294 for i, values in enumerate(sequence):
1295 isIterable = isinstance(values, VALID_GRADE_CONTAINERS)
1296 if (isIterable and value in values) or (not isIterable and value == values):
1297 return i
1298 return getDefaultIndex(value, i, defaultGrade)
1299 elif value in sequence:
1300 return sequence.index(value)
1301 else:
1302 return getDefaultIndex(value, len(sequence), defaultGrade)
1303
1305 """
1306 Get a default index for a missing element.
1307 Default grades are:
1308 LEFT_GRADE - Always -1, ranked before the lowest existing element(s)
1309 ZERO_GRADE - Always 0, ranked equal to the lowest element(s)
1310 MEAN_GRADE - Always half of the maximum grade
1311 MAX_GRADE - Always equal to the maximum grade, ranked equal to highest element(s)
1312 RIGHT_GRADE - Always maximum grade +1 than the maximum grade, after the highest element(s)
1313 @param maxIndex: Maximum possible grade for sequence
1314 @type maxIndex: int
1315 @param defaultGrade: The type of default grade to use, from DEFAULT_GRADES set
1316 @type defaultGrade: str
1317 @return: Ranking for a default element
1318 @rtype: int
1319 """
1320 if defaultGrade is None:
1321 raise MissingElementError("%s not a subset element in sequence"%(element,))
1322 elif defaultGrade == LEFT_GRADE:
1323 return -1
1324 elif defaultGrade == ZERO_GRADE:
1325 return 0
1326 elif defaultGrade == MEAN_GRADE:
1327 return maxIndex/2.0
1328 elif defaultGrade == MAX_GRADE:
1329 return maxIndex
1330 elif defaultGrade == RIGHT_GRADE:
1331 return maxIndex+1
1332 else:
1333 raise ValueError("Unknown value for defaultGrade, got: %s"%defaultGrade)
1334
1337 """
1338 Convert 'sequence' into a sequence of indices from the reference sequence, rather than objects
1339 Note: This retains the original sequence structure (so if the sequence was nested, it remains nested)
1340 @param sequence: Sequence to convert into indices
1341 @type sequence: list of list of object or list of object
1342 @param referenceSequence: Sequence with "true ordering" used to collect element indices
1343 @type referenceSequence: list of list of object or list of object
1344 @param isSeqNested: If True, sequence is nested (where nested elements share a grade)
1345 @type isSeqNested: bool
1346 @param isRefNested: If True, reference is nested (where nested elements share a grade)
1347 @type isRefNested: bool
1348 @param tieFunction: How to sort elements within the same grade (after applying indices), in form seq = f(seq)
1349 The 'sorted' function assumes they are incomparable and removes any inversions.
1350 By comparison, a reversed sort would apply an inversion penalty to elements in a grade.
1351 @type tieFunction: callable
1352 @param indexFunction: Function for calculating the index of an element, as f(element, referenceSequence, isRefNested)
1353 @type indexFunction: callable
1354 @return: The 'sequence' parameter tranformed into integer indices, based on the reference sequence ordering.
1355 @rtype: list of list of int or list of int
1356 """
1357 if isSeqNested:
1358 indexSeq = []
1359 for grade in sequence:
1360 if isinstance(grade, VALID_GRADE_CONTAINERS):
1361 indexSeq.append(tieFunction([indexFunction(element,referenceSequence,isRefNested) for element in grade]))
1362 else:
1363 indexSeq.append(indexFunction(grade,referenceSequence,isRefNested))
1364 return indexSeq
1365 else:
1366 return [indexFunction(element,referenceSequence,isRefNested) for element in sequence]
1367
1370 """
1371 Convert 'sequence' into a sequence of indices from the reference sequence,
1372 Note: This flattens the sequence, if it was nested, for inversion counting.
1373 rather than objects, then flatten.
1374 @param sequence: Sequence to convert into indices
1375 @type sequence: list of list of object or list of object
1376 @param referenceSequence: Sequence with "true ordering" used to collect element indices
1377 @type referenceSequence: list of list of object or list of object
1378 @param isSeqNested: If True, sequence is nested (where nested elements share a grade)
1379 @type isSeqNested: bool
1380 @param isRefNested: If True, reference is nested (where nested elements share a grade)
1381 @type isRefNested: bool
1382 @param indexFunction: Function for calculating the index of an element, as f(element, referenceSequence, isRefNested)
1383 @type indexFunction: callable
1384 @return: Flat sequence of indices, where an element's index is its grade in the reference sequence
1385 @rtype: list of int
1386 """
1387 if isSeqNested:
1388 indexSeq = []
1389 for grade in sequence:
1390 if isinstance(grade, VALID_GRADE_CONTAINERS):
1391 indexSeq.extend(tieFunction([indexFunction(element,referenceSequence,isRefNested) for element in grade]))
1392 else:
1393 indexSeq.append(indexFunction(grade,referenceSequence,isRefNested))
1394 return indexSeq
1395 else:
1396 return [indexFunction(element,referenceSequence,isRefNested) for element in sequence]
1397
1399 """
1400 Remove elements from a nested list, if they are part of the excluded elements.
1401 Returns a copy of the list, in all cases
1402 @param sequence: Standard sequence ([x,y,...]) or nested sequence ([[x,..], [y,..],..])
1403 @type sequence: list of object or list of list of object
1404 @param excludedElements: List of elements to be removed from the sequence
1405 @type excludedElements: list of object
1406 @return: Sequence (in same structure) with excluded elements removed
1407 @rtype: list of object or list of list of object
1408 """
1409 if isGraded:
1410 filtered = []
1411 for grade in sequence:
1412 if isinstance(grade, VALID_GRADE_CONTAINERS):
1413 filteredGrade = [x for x in grade if x not in excludedElements]
1414 if len(filteredGrade) > 0:
1415 filtered.append(filteredGrade)
1416 elif grade not in excludedElements:
1417 filtered.append(grade)
1418 return filtered
1419 else:
1420 return [x for x in sequence if x not in excludedElements]
1421