wallaroo.assay_config
import json
import math
from abc import ABC, abstractmethod
from collections import Counter
from datetime import datetime, timezone
from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, TypeVar, Union

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from wallaroo.assay import AssayAnalysis, AssayAnalysisList

from .wallaroo_ml_ops_api_client.api.assay import (
    assays_run_interactive,
    assays_run_interactive_baseline,
)
from .wallaroo_ml_ops_api_client.models.assays_run_interactive_baseline_json_body import (
    AssaysRunInteractiveBaselineJsonBody,
)
from .wallaroo_ml_ops_api_client.models.assays_run_interactive_baseline_response_200 import (
    AssaysRunInteractiveBaselineResponse200,
)
from .wallaroo_ml_ops_api_client.models.assays_run_interactive_json_body import (
    AssaysRunInteractiveJsonBody,
)

if TYPE_CHECKING:
    # Imports that happen below in methods to fix circular import dependency
    # issues need to also be specified here to satisfy mypy type checking.
    from wallaroo.client import Client

T = TypeVar("T")


def unwrap(v: Optional[T]) -> T:
    """Return ``v`` unchanged, or raise if it is ``None``.

    Exists mainly to narrow ``Optional[T]`` to ``T`` for type checkers.
    Only ``None`` counts as "missing": falsy values such as ``0``, ``""``
    or ``[]`` are legitimate values and are returned as-is.

    Raises:
        ValueError: if ``v`` is ``None``.
    """
    if v is not None:
        return v
    raise ValueError("Expected a value in forced unwrap")


class BaselineConfig(object):
    """Abstract base class for Baseline config objects. Currently
    only FixedBaseline is implemented though SlidingBaseline and
    others are planned."""

    def __init__(self):
        pass

    def to_json(self) -> str:
        """Serialize this config to an indented JSON string."""
        return json.dumps(self, indent=4, default=ConfigEncoder)


class FixedBaseline(BaselineConfig):
    """The FixedBaseline is calculated from the inferences from a
    specific time window."""

    def __init__(
        self, pipeline_name: str, model_name: str, start: datetime, end: datetime
    ):
        # The single attribute name ("Fixed") becomes the top-level JSON key
        # when the object is serialized through ConfigEncoder.
        self.Fixed = {
            "pipeline": pipeline_name,
            "model": model_name,
            "start_at": start.isoformat(),
            "end_at": end.isoformat(),
        }


class BaselineBuilder(ABC):
    """Interface for builders that produce a BaselineConfig."""

    @abstractmethod
    def build(self) -> BaselineConfig:
        """Create the baseline config object."""
        pass

    def to_json(self) -> str:
        """Serialize the builder's current state to an indented JSON string."""
        return json.dumps(self, indent=4, default=ConfigEncoder)


def ensure_tz(d: datetime) -> datetime:
    """Ensure the datetime is tz aware. If naive, assume it is in UTC.

    Aware datetimes are returned unchanged. Naive datetimes get UTC attached
    without shifting the wall-clock fields. (``astimezone`` is deliberately
    not used here: it would interpret a naive value as *system local* time
    and convert it, which contradicts "assume it is in UTC".)
    """
    if d.tzinfo is not None and d.utcoffset() is not None:
        return d
    return d.replace(tzinfo=timezone.utc)


class FixedBaselineBuilder(BaselineBuilder):
    """Helps to easily create the config object for a FixedBaseline."""

    def __init__(self, pipeline_name: str):
        self.pipeline_name = pipeline_name
        self.model_name: Optional[str] = None
        self.start: Optional[datetime] = None
        self.end: Optional[datetime] = None

    def add_model_name(self, model_name: str):
        """Specify the model to use in the baseline"""
        self.model_name = model_name
        return self

    def add_start(self, start: datetime):
        """Specify the start of the window for the baseline"""
        self.start = start
        return self

    def add_end(self, end: datetime):
        """Specify the end of the window for the baseline"""
        self.end = end
        return self

    def build(self) -> FixedBaseline:
        """Create the FixedBaseline object.

        Raises if the start, end or model name has not been set.
        """
        start = ensure_tz(unwrap(self.start))
        end = ensure_tz(unwrap(self.end))

        return FixedBaseline(self.pipeline_name, unwrap(self.model_name), start, end)


class SummarizerConfig(object):
    """The summarizer specifies how the bins of the baseline and
    window should be compared."""

    def __init__(self):
        pass

    def to_json(self) -> str:
        """Serialize this config to an indented JSON string."""
        return json.dumps(self, indent=4, default=ConfigEncoder)


class BinMode(str, Enum):
    """How should we calculate the bins.
    NONE - no bins. Only useful if we only care about the mean, median, etc.
    EQUAL - evenly spaced bins: (max - min) / num_bins
    QUANTILE - based on percentages. If num_bins is 5 then quintiles
    so bins are created at the 20%, 40%, 60%, 80% and 100% points.
    PROVIDED - user provides the edge points for the bins.
    """

    NONE = "None"
    EQUAL = "Equal"
    QUANTILE = "Quantile"
    PROVIDED = "Provided"


class Aggregation(str, Enum):
    """What we use to calculate the score.
    EDGES - distances between the edges.
    DENSITY - percentage of values that fall in each bin.
    CUMULATIVE - cumulative percentage that fall in the bins."""

    EDGES = "Edges"
    DENSITY = "Density"
    CUMULATIVE = "Cumulative"


class Metric(str, Enum):
    """How we calculate the score.
    MAXDIFF - maximum difference between corresponding bins.
    SUMDIFF - sum of differences between corresponding bins.
    PSI - Population Stability Index"""

    MAXDIFF = "MaxDiff"
    SUMDIFF = "SumDiff"
    PSI = "PSI"


class UnivariateContinousSummarizerConfig(SummarizerConfig):
    """The UnivariateContinousSummarizer analyzes one input or output feature
    (Univariate) at a time. Expects the values to be continuous or at least
    numerous enough to fall in various/all the bins."""

    def __init__(
        self,
        bin_mode: BinMode,
        aggregation: Aggregation,
        metric: Metric,
        num_bins: int,
        bin_weights: Optional[List[float]] = None,
        bin_width: Optional[float] = None,
        provided_edges: Optional[List[float]] = None,
        add_outlier_edges: bool = True,
    ):
        # Attribute order is the key order of the serialized JSON
        # (ConfigEncoder emits __dict__), so keep it stable.
        self.type = "UnivariateContinuous"
        self.bin_mode = bin_mode
        self.aggregation = aggregation
        self.metric = metric
        self.num_bins = num_bins
        self.bin_weights = bin_weights
        self.bin_width = bin_width
        self.provided_edges = provided_edges
        self.add_outlier_edges = add_outlier_edges


class SummarizerBuilder(ABC):
    """Interface for builders that produce a SummarizerConfig."""

    @abstractmethod
    def build(self) -> SummarizerConfig:
        """Create the summarizer config object."""
        pass


class UnivariateContinousSummarizerBuilder(SummarizerBuilder):
    """Builds the UnivariateContinousSummarizerConfig.

    Defaults: quantile bins, density aggregation, PSI metric, 5 bins,
    no weights, no provided edges, outlier edges added.
    """

    def __init__(self):
        self.bin_mode = BinMode.QUANTILE
        self.aggregation = Aggregation.DENSITY
        self.metric = Metric.PSI
        self.num_bins = 5
        self.bin_weights: Optional[List[float]] = None
        self.bin_width: Optional[float] = None
        self.provided_edges: Optional[List[float]] = None
        self.add_outlier_edges = True

    def build(self) -> UnivariateContinousSummarizerConfig:
        """Validate the bin mode / edges combination and create the config.

        Raises:
            ValueError: if edges are missing for BinMode.PROVIDED, or present
                for any other bin mode.
        """
        if self.bin_mode == BinMode.PROVIDED:
            if self.provided_edges is None:
                raise ValueError("Edges must be provided with BinMode.PROVIDED")
        else:
            if self.provided_edges is not None:
                raise ValueError(
                    f"Edges may not be provided with bin mode {self.bin_mode}"
                )

        return UnivariateContinousSummarizerConfig(
            self.bin_mode,
            self.aggregation,
            self.metric,
            self.num_bins,
            self.bin_weights,
            self.bin_width,
            self.provided_edges,
            self.add_outlier_edges,
        )

    def add_bin_mode(self, bin_mode: BinMode, edges: Optional[List[float]] = None):
        """Sets the binning mode. If BinMode.PROVIDED is specified a list of edges
        is also required."""
        if bin_mode == BinMode.PROVIDED:
            if edges is None:
                raise ValueError("Edges must be provided with BinMode.PROVIDED")

        self.bin_mode = bin_mode
        # Also runs for edges=None, which clears previously provided edges.
        self.add_bin_edges(edges)
        return self

    def add_num_bins(self, num_bins: int):
        """Sets the number of bins. If weights or edges have been previously
        set and do not fit the new number of bins they must be set to None
        first.

        Raises:
            ValueError: if existing weights or edges are incompatible with
                ``num_bins``.
        """

        if num_bins != self.num_bins and self.bin_weights is not None:
            if num_bins + 2 != len(self.bin_weights):
                msg = (
                    f"({len(self.bin_weights)}) bin weights have already been set. "
                    + "Please set them to None before changing the number of bins."
                )
                raise ValueError(msg)

        if num_bins != self.num_bins and self.provided_edges is not None:
            if not (
                len(self.provided_edges) == num_bins
                or len(self.provided_edges) == num_bins + 1
            ):
                msg = (
                    f"({len(self.provided_edges)}) bin edges have already been set. "
                    + "Please set them to None before changing the number of bins."
                )
                raise ValueError(msg)

        self.num_bins = num_bins
        return self

    def add_bin_weights(self, weights: Union[List[float], None]):
        """Specifies the weighting to be given to the bins. The number of weights
        must be 2 larger than the number of bins to accommodate outliers smaller
        and outliers larger than values seen in the baseline.
        The passed in values can be whole or real numbers and do not need to add
        up to 1 or any other specific value as they will be normalized during the
        score calculation phase.
        The weights passed in can be none to remove previously specified weights
        and to allow changing of the number of bins."""

        if weights is not None:
            if self.num_bins + 2 != len(weights):
                msg = (
                    f"The number of weights ({len(weights)}) "
                    + f"must be 2 more ({self.num_bins + 2}) than the "
                    + f"number of bins ({self.num_bins}) to allow for the "
                    + "left and right outlier bins."
                )
                raise ValueError(msg)
        self.bin_weights = weights
        return self

    def add_metric(self, metric: Metric):
        """Sets the metric mode."""
        self.metric = metric
        return self

    def add_aggregation(self, aggregation: Aggregation):
        """Sets the aggregation style."""
        self.aggregation = aggregation
        return self

    def add_bin_edges(self, edges: Union[List[float], None]):
        """Specifies the right hand side (max value) of the bins. The number
        of edges must be equal to or one more than the number of bins. When
        equal to the number of bins the edge for the left outlier bin is
        calculated from the baseline. When an additional edge (one more than
        number of bins) that first (lower) value is used as the max value for
        the left outlier bin. The max value for the right hand outlier bin is
        always Float MAX.

        The edges are stored sorted ascending; passing None clears them.
        """

        if edges is not None:
            if not (len(edges) == self.num_bins or len(edges) == self.num_bins + 1):
                msg = (
                    f"The number of edges ({len(edges)}) "
                    + f"must be equal to ({self.num_bins}) or one more "
                    + f"({self.num_bins + 1 }) than the number of bins to account "
                    + "for the left outlier bin."
                )
                raise ValueError(msg)
            edges = sorted(edges)

        self.provided_edges = edges
        return self


class WindowConfig(object):
    """Configures a window to be compared against the baseline."""

    def __init__(
        self,
        pipeline_name: str,
        model_name: str,
        width: str,
        start: Optional[datetime] = None,
        interval: Optional[str] = None,
    ):
        self.pipeline = pipeline_name
        self.model = model_name
        self.width = width
        self.start = start
        self.interval = interval

    def to_json(self) -> str:
        """Serialize this config to an indented JSON string."""
        return json.dumps(self, indent=4, default=ConfigEncoder)


class WindowBuilder(object):
    """Helps build a WindowConfig. model and width are required but there are no
    good default values for them because they depend on the baseline. We leave it
    up to the assay builder to configure the window correctly after it is created.
    """

    def __init__(self, pipeline_name: str):
        self.pipeline = pipeline_name
        self.model: Optional[str] = None
        self.width: Optional[str] = "24 hours"
        self.start: Optional[datetime] = None
        self.interval: Optional[str] = None

    def add_model_name(self, model_name: str):
        """The model name (model_id) that the window should analyze."""
        self.model = model_name
        return self

    def _duration_kw_to_str(self, **kwargs) -> str:
        """Turn exactly one duration keyword into a string like ``"3 hours"``.

        Accepts the singular or plural of minute/hour/day/week; the result
        always uses the plural unit name.

        Raises:
            Exception: if none, or more than one, duration keyword is given.
        """
        interval_names = ["minute", "hour", "day", "week"]
        duration_str = None
        kw_count = 0

        for interval_name in interval_names:
            plural = interval_name + "s"

            for kw in [interval_name, plural]:
                if kw in kwargs:
                    duration_str = f"{kwargs[kw]} {plural}"
                    kw_count += 1

        if kw_count == 0:
            raise Exception(
                "Please specify one of 'minutes', 'hours', 'days' or 'weeks' keyword args"
            )

        elif kw_count > 1:
            raise Exception(
                "Please specify only one of 'minutes', 'hours', 'days' or 'weeks' keyword args"
            )
        else:
            return unwrap(duration_str)

    def add_width(self, **kwargs: int):
        """The width of the window to use when collecting data for analysis."""
        self.width = self._duration_kw_to_str(**kwargs)
        return self

    def add_interval(self, **kwargs: int):
        """The interval setting for the window, given with the same duration
        keywords as add_width."""
        # NOTE(review): presumably the spacing between consecutive windows;
        # confirm against the assay service.
        self.interval = self._duration_kw_to_str(**kwargs)
        return self

    def add_start(self, start: datetime):
        """Specify when the first window starts."""
        self.start = start
        return self

    def build(self) -> WindowConfig:
        """Create the WindowConfig. Raises if the model has not been set."""
        start = ensure_tz(self.start) if self.start else None

        return WindowConfig(
            self.pipeline,
            unwrap(self.model),
            unwrap(self.width),
            start,
            self.interval,
        )


def ConfigEncoder(o):
    """Used to format datetimes as we need when encoding to JSON.

    Passed as ``default=`` to ``json.dumps``: datetimes become ISO-8601
    strings, any other object is encoded as its ``__dict__``.
    """
    if isinstance(o, datetime):
        return o.isoformat()
    else:
        return o.__dict__


class AssayConfig(object):
    """Configuration for an Assay record."""

    def __init__(
        self,
        client: Optional["Client"],
        name: str,
        pipeline_id: int,
        pipeline_name: str,
        active: bool,
        status: str,
        iopath: str,
        baseline: BaselineConfig,
        window: WindowConfig,
        summarizer: SummarizerConfig,
        warning_threshold: Optional[float],
        alert_threshold: float,
        run_until: Optional[datetime],
        workspace_id: Optional[int],
    ):
        self.client = client
        self.name = name
        self.pipeline_id = pipeline_id
        self.pipeline_name = pipeline_name
        self.active = active
        self.status = status
        self.iopath = iopath
        self.baseline = baseline
        self.window = window
        self.summarizer = summarizer
        self.warning_threshold = warning_threshold
        self.alert_threshold = alert_threshold
        self.run_until = run_until
        self.workspace_id = workspace_id

    def to_json(self) -> str:
        """Serialize this config to JSON, leaving out the client handle."""
        payload = self.__dict__.copy()
        payload.pop("client", None)
        payload.pop("model_insights_url", None)
        return json.dumps(payload, indent=4, default=ConfigEncoder)

    def interactive_run(self) -> AssayAnalysisList:
        """Runs this assay interactively. The assay is not saved to the database
        nor are analysis records saved to a Plateau topic. Useful for exploring
        pipeline inference data and experimenting with thresholds."""

        client = unwrap(self.client)
        payload = {
            **json.loads(self.to_json()),
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        mlops_client = client.mlops()
        # Interactive runs can be slow; allow up to five minutes.
        mlops_client.timeout = 5 * 60
        ret = assays_run_interactive.sync(
            client=mlops_client,
            json_body=AssaysRunInteractiveJsonBody.from_dict(payload),
        )

        analysis_list = []
        if ret is not None:
            # Anything other than a list is an error response carrying `msg`.
            if not isinstance(ret, list):
                raise Exception(ret.msg)

            analysis_list = [AssayAnalysis(ar.to_dict()) for ar in ret]

        return AssayAnalysisList(analysis_list)

    def interactive_baseline_run(self) -> Optional[AssayAnalysis]:
        """Runs only the baseline of this assay interactively.

        Returns the baseline analysis, or None if the service returned nothing.
        Raises if the service returned an error response.
        """

        client = unwrap(self.client)
        payload = {
            **json.loads(self.to_json()),
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        ret = assays_run_interactive_baseline.sync(
            client=client.mlops(),
            json_body=AssaysRunInteractiveBaselineJsonBody.from_dict(payload),
        )

        if ret is not None:
            if not isinstance(ret, AssaysRunInteractiveBaselineResponse200):
                raise Exception(ret.msg)

            aa = ret.to_dict()
            return AssayAnalysis(aa)

        return None

    def interactive_input_run(
        self, inferences: List[Dict], labels: Optional[List[str]]
    ) -> AssayAnalysisList:
        """Analyzes the inputs given to create an interactive run for each feature
        column. The assay is not saved to the database nor are analysis records saved
        to a Plateau topic. Useful for exploring inputs for possible causes when a
        difference is detected in the output.

        Each inference is read at ``inf["original_data"]["tensor"]``. A summary
        line per column is printed. ``self.iopath`` is pointed at each input
        column in turn while running and restored afterwards. An empty
        ``inferences`` list yields an empty result.
        """

        all_assays = []
        if not inferences:
            return AssayAnalysisList(all_assays)

        inference = inferences[0]

        print("input column distinct_vals label largest_pct")
        # TODO extend this to work for any input shape
        inputs = inference["original_data"]["tensor"]
        # The loop below repoints self.iopath; remember the configured value
        # so this exploratory run does not permanently change the assay.
        original_iopath = self.iopath
        try:
            for idx0, row in enumerate(inputs):
                if labels and len(row) != len(labels):
                    print(
                        f"Labels are not the same len {len(labels)} as inputs {len(row)}"
                    )
                for idx1, _ in enumerate(row):
                    values = [
                        inf["original_data"]["tensor"][idx0][idx1]
                        for inf in inferences
                    ]
                    counter = Counter(values)
                    largest_pct = max(c / len(values) for c in counter.values())
                    distinct_values = len(counter)
                    # Tolerate a label list shorter than the row (already
                    # warned about above) instead of raising IndexError.
                    label = labels[idx1] if labels and idx1 < len(labels) else ""
                    # TODO: Rule of thumb may need better way to distinguish
                    msg = (
                        "*** May not be continuous feature"
                        if distinct_values < 5 or largest_pct > 0.90
                        else ""
                    )
                    print(
                        f"{idx0:5} {idx1:5} {distinct_values:14} {label:15} {largest_pct:0.4f} {msg}"
                    )

                    self.iopath = f"inputs {idx0} {idx1}"

                    assays = self.interactive_run()
                    all_assays.extend(assays.raw)
        finally:
            self.iopath = original_iopath

        return AssayAnalysisList(all_assays)


class AssayBuilder(object):
    """Helps build an AssayConfig"""

    def __init__(
        self,
        client: Optional["Client"],
        name: str,
        pipeline_id: int,
        pipeline_name: str,
        model_name: str,
        baseline_start: datetime,
        baseline_end: datetime,
    ):
        self.client = client
        self.name = name
        self.pipeline_id = pipeline_id
        self.pipeline_name: str = pipeline_name
        self.active = True
        self.status = "created"
        self.iopath: str = "output 0 0"
        self.baseline: Optional[BaselineConfig] = None
        self.window: Optional[WindowConfig] = None
        self.summarizer: Optional[SummarizerConfig] = None
        self.warning_threshold: Optional[float] = None
        self.alert_threshold: float = 0.25
        self.run_until: Optional[datetime] = None
        self.workspace_id = (
            None if self.client is None else self.client.get_current_workspace().id()
        )

        self.baseline_builder = (
            FixedBaselineBuilder(self.pipeline_name)
            .add_model_name(model_name)
            .add_start(baseline_start)
            .add_end(baseline_end)
        )
        self.window_builder_ = WindowBuilder(self.pipeline_name).add_model_name(
            model_name
        )

        self.summarizer_builder = UnivariateContinousSummarizerBuilder()

        # Lazily fetched and cached by baseline_dataframe().
        self._baseline_df: Optional[pd.DataFrame] = None

    def baseline_dataframe(self):
        """Return the baseline-window inference dataframe, fetching it from the
        client on first use and caching it afterwards."""
        if self._baseline_df is None:
            client = unwrap(self.client)
            self._baseline_df = client.get_pipeline_inference_dataframe(
                client.get_topic_name(self.pipeline_id),
                unwrap(self.baseline_builder.start),
                unwrap(self.baseline_builder.end),
                self.baseline_builder.model_name,
            )
        return self._baseline_df

    def baseline_histogram(
        self, bins: Optional[Union[str, int]] = None, log_scale: bool = False
    ):
        """Plot a histogram of the baseline values for the current iopath."""

        df = self.baseline_dataframe()

        n_bins = calc_bins(df.shape[0], bins)

        # The dataframe column is the iopath with spaces replaced by "_".
        col_name = self.iopath.replace(" ", "_")

        # type inference for the bins param to histplot is incorrect: str vs str|int.
        sns.histplot(data=df, x=col_name, bins=n_bins, log_scale=log_scale).set(  # type: ignore
            title=f"Baseline '{self.iopath}' {self.baseline_builder.start} - {self.baseline_builder.end}"
        )
        plt.show()

    def baseline_kde(self, log_scale: bool = False):
        """Plot a kernel density estimate of the baseline values."""
        df = self.baseline_dataframe()

        col_name = self.iopath.replace(" ", "_")

        sns.kdeplot(data=df, x=col_name, log_scale=log_scale).set(
            title=f"Baseline '{self.iopath}' {self.baseline_builder.start} - {self.baseline_builder.end}"
        )
        plt.grid()
        plt.show()

    def baseline_ecdf(self, log_scale: bool = False):
        """Plot the empirical cumulative distribution of the baseline values."""
        df = self.baseline_dataframe()

        col_name = self.iopath.replace(" ", "_")

        sns.ecdfplot(data=df, x=col_name, log_scale=log_scale).set(
            title=f"Baseline '{self.iopath}' {self.baseline_builder.start} - {self.baseline_builder.end}"
        )
        plt.grid()
        plt.show()

    def build(self) -> AssayConfig:
        """Build the baseline, window and summarizer from their builders and
        assemble the AssayConfig."""
        # NOTE(review): these assignments overwrite anything set through
        # add_baseline / add_window / add_summarizer, so those setters have no
        # effect on the built config. Confirm whether that is intended.
        self.baseline = self.baseline_builder.build()
        self.window = self.window_builder_.build()
        self.summarizer = self.summarizer_builder.build()

        run_until = ensure_tz(self.run_until) if self.run_until else None

        return AssayConfig(
            self.client,
            self.name,
            self.pipeline_id,
            self.pipeline_name,
            self.active,
            self.status,
            self.iopath,
            unwrap(self.baseline),
            unwrap(self.window),
            unwrap(self.summarizer),
            self.warning_threshold,
            self.alert_threshold,
            run_until,
            self.workspace_id,
        )

    def upload(self) -> int:
        """Build the config and upload it through the client.

        Raises:
            RuntimeError: if this builder was created without a client.
        """
        config = self.build()

        if self.client:
            res = self.client.upload_assay(config)  # type: ignore
            return res
        raise RuntimeError(
            "Assay config was created for standalone and may only be used to generate configuration"
        )

    def add_name(self, name: str):
        """Specify the assay name"""
        self.name = name
        return self

    def add_active(self, active: bool):
        """Specify if the assay is active or not"""
        self.active = active
        return self

    def add_iopath(self, iopath: str):
        """Specify what the assay should analyze. Should start with input or output and have
        indexes (zero based) into row and column: For example 'input 0 1' specifies the second
        column of the first input."""

        iopath = iopath.strip()
        assert iopath.lower().startswith(
            ("input", "output")
        ), "iopath must start with 'input' or 'output'"
        self.iopath = iopath
        self._baseline_df = None
        return self

    def fixed_baseline_builder(self):
        """Creates a new fixed baseline builder, installs it on this assay
        builder and returns it."""

        bb = FixedBaselineBuilder(unwrap(self.pipeline_name))
        self.baseline_builder = bb
        return bb

    def add_baseline(self, baseline: BaselineConfig):
        """Adds a specific baseline created elsewhere."""
        self.baseline = baseline
        self._baseline_df = None
        return self

    def window_builder(self):
        """Returns this assay builders window builder."""
        return self.window_builder_

    def add_window(self, window: WindowConfig):
        """Adds a window created elsewhere."""
        self.window = window
        return self

    def univariate_continuous_summarizer(self) -> UnivariateContinousSummarizerBuilder:
        """Creates and adds an UCS to this assay builder."""
        ucsb = UnivariateContinousSummarizerBuilder()
        self.summarizer_builder = ucsb
        return ucsb

    def add_summarizer(self, summarizer: SummarizerConfig):
        """Adds the summarizer created elsewhere to this builder."""
        self.summarizer = summarizer
        return self

    def add_warning_threshold(self, warning_threshold: float):
        """Specify the warning threshold for this assay."""
        self.warning_threshold = warning_threshold
        return self

    def add_alert_threshold(self, alert_threshold: float):
        """Specify the alert threshold for this assay."""
        self.alert_threshold = alert_threshold
        return self

    def add_run_until(self, run_until: datetime):
        """How long should this assay run. Primarily useful for
        interactive runs to limit the number of analysis."""
        self.run_until = run_until
        return self


def calc_bins(num_samples: int, bins: Optional[Union[str, int]]) -> Union[str, int]:
    """If the user specifies a number of bins or a strategy for calculating
    it use that. Else use the min of the square root of the sample count or 50,
    but never fewer than one bin."""

    if bins is None:
        return max(1, min(int(math.sqrt(num_samples)), 50))
    else:
        return bins
def unwrap(v: Optional[T]) -> T:
    """Return ``v`` unchanged, or raise if it is ``None``.

    Exists mainly to narrow ``Optional[T]`` to ``T`` for type checkers.
    Only ``None`` counts as "missing": falsy values such as ``0``, ``""``
    or ``[]`` are legitimate values and are returned as-is.

    Raises:
        ValueError: if ``v`` is ``None``.
    """
    if v is not None:
        return v
    raise ValueError("Expected a value in forced unwrap")
Simple function to placate pylance
class BaselineConfig(object):
    """Common parent for baseline configuration objects.

    FixedBaseline is the only implementation so far; SlidingBaseline and
    other variants are planned.
    """

    def __init__(self):
        """Nothing to initialise; subclasses set their own attributes."""

    def to_json(self) -> str:
        """Render this object as a 4-space-indented JSON document."""
        rendered = json.dumps(self, indent=4, default=ConfigEncoder)
        return rendered
Abstract base class for Baseline config objects. Currently only FixedBaseline is implemented though SlidingBaseline and others are planned.
class FixedBaseline(BaselineConfig):
    """Baseline calculated from the inferences in one specific time window."""

    def __init__(
        self, pipeline_name: str, model_name: str, start: datetime, end: datetime
    ):
        """Record pipeline, model and the ISO-formatted window bounds under
        the single ``Fixed`` attribute."""
        fixed = {"pipeline": pipeline_name, "model": model_name}
        fixed["start_at"] = start.isoformat()
        fixed["end_at"] = end.isoformat()
        self.Fixed = fixed
The FixedBaseline is calculated from the inferences from a specific time window.
Inherited Members
class BaselineBuilder(ABC):
    """Interface for builders that produce a BaselineConfig."""

    @abstractmethod
    def build(self) -> BaselineConfig:
        """Create the baseline config; concrete builders must override this."""

    def to_json(self) -> str:
        """Render the builder's current state as 4-space-indented JSON."""
        rendered = json.dumps(self, indent=4, default=ConfigEncoder)
        return rendered
Helper class that provides a standard way to create an ABC using inheritance.
def ensure_tz(d: datetime) -> datetime:
    """Ensure the datetime is tz aware. If naive, assume it is in UTC.

    Aware datetimes are returned unchanged. Naive datetimes get UTC attached
    without shifting the wall-clock fields. (``astimezone`` is deliberately
    not used here: it would interpret a naive value as *system local* time
    and convert it, which contradicts "assume it is in UTC".)
    """
    # A datetime is aware only if it has a tzinfo that yields an offset.
    if d.tzinfo is not None and d.utcoffset() is not None:
        return d
    return d.replace(tzinfo=timezone.utc)
Ensure the datetime is tz-aware. If it is naive, assume it is in UTC.
class FixedBaselineBuilder(BaselineBuilder):
    """Fluent builder that assembles the config object for a FixedBaseline."""

    def __init__(self, pipeline_name: str):
        """Start with only the pipeline known; model and window are unset."""
        self.pipeline_name = pipeline_name
        self.model_name: Optional[str] = None
        self.start: Optional[datetime] = None
        self.end: Optional[datetime] = None

    def add_model_name(self, model_name: str):
        """Set the model the baseline is computed for; returns the builder."""
        self.model_name = model_name
        return self

    def add_start(self, start: datetime):
        """Set where the baseline window begins; returns the builder."""
        self.start = start
        return self

    def add_end(self, end: datetime):
        """Set where the baseline window finishes; returns the builder."""
        self.end = end
        return self

    def build(self) -> FixedBaseline:
        """Produce the FixedBaseline from the collected settings.

        Start, end and model name are force-unwrapped, so a missing one
        raises; the window bounds are made timezone aware first.
        """
        window_start = ensure_tz(unwrap(self.start))
        window_end = ensure_tz(unwrap(self.end))
        model = unwrap(self.model_name)

        return FixedBaseline(self.pipeline_name, model, window_start, window_end)
Helps to easily create the config object for a FixedBaseline.
def add_model_name(self, model_name: str):
    """Set the model the baseline is computed for and return the builder
    so calls can be chained."""
    self.model_name = model_name
    return self
Specify the model to use in the baseline
def add_start(self, start: datetime):
    """Set where the baseline window begins and return the builder so
    calls can be chained."""
    self.start = start
    return self
Specify the start of the window for the baseline
def add_end(self, end: datetime):
    """Set where the baseline window finishes and return the builder so
    calls can be chained."""
    self.end = end
    return self
Specify the end of the window for the baseline
def build(self) -> FixedBaseline:
    """Produce the FixedBaseline from the collected settings.

    Start, end and model name are force-unwrapped, so a missing one raises;
    the window bounds are made timezone aware first.
    """
    window_start = ensure_tz(unwrap(self.start))
    window_end = ensure_tz(unwrap(self.end))
    model = unwrap(self.model_name)

    return FixedBaseline(self.pipeline_name, model, window_start, window_end)
Create the FixedBaseline object.
Inherited Members
class SummarizerConfig(object):
    """Describes how the baseline's bins and a window's bins are compared."""

    def __init__(self):
        """Nothing to initialise; subclasses set their own attributes."""

    def to_json(self) -> str:
        """Render this object as a 4-space-indented JSON document."""
        rendered = json.dumps(self, indent=4, default=ConfigEncoder)
        return rendered
The summarizer specifies how the bins of the baseline and window should be compared.
class BinMode(str, Enum):
    """Strategy used to calculate the bins.

    NONE - no bins; only useful when just the mean, median, etc. matter.
    EQUAL - evenly spaced bins of width (max - min) / num_bins.
    QUANTILE - percentage based; with num_bins of 5 these are quintiles,
        so bins are created at the 20%, 40%, 60%, 80% and 100% points.
    PROVIDED - the user supplies the edge points of the bins.
    """

    NONE = "None"
    EQUAL = "Equal"
    QUANTILE = "Quantile"
    PROVIDED = "Provided"
How the bins should be calculated. NONE - no bins; only useful if we only care about the mean, median, etc. EQUAL - evenly spaced bins of width (max - min) / num_bins. QUANTILE - based on percentages; if num_bins is 5 then quintiles are used, so bins are created at the 20%, 40%, 60%, 80% and 100% points. PROVIDED - the user provides the edge points for the bins.
Inherited Members
- enum.Enum
- name
- value
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- count
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format
- format_map
- maketrans
class Aggregation(str, Enum):
    """Quantity that is aggregated per bin to calculate the score.

    EDGES - distances between the edges.
    DENSITY - percentage of values that fall in each bin.
    CUMULATIVE - cumulative percentage that fall in the bins.
    """

    EDGES = "Edges"
    DENSITY = "Density"
    CUMULATIVE = "Cumulative"
What we use to calculate the score. EDGES - distances between the edges. DENSITY - percentage of values that fall in each bin. CUMULATIVE - cumulative percentage that fall in the bins.
Inherited Members
- enum.Enum
- name
- value
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- count
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format
- format_map
- maketrans
class Metric(str, Enum):
    """Formula that turns the per-bin comparison into a score.

    MAXDIFF - maximum difference between corresponding bins.
    SUMDIFF - sum of differences between corresponding bins.
    PSI - Population Stability Index.
    """

    MAXDIFF = "MaxDiff"
    SUMDIFF = "SumDiff"
    PSI = "PSI"
How we calculate the score. MAXDIFF - maximum difference between corresponding bins. SUMDIFF - sum of differences between corresponding bins. PSI - Population Stability Index
Inherited Members
- enum.Enum
- name
- value
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- count
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format
- format_map
- maketrans
class UnivariateContinousSummarizerConfig(SummarizerConfig):
    """Summarizer settings for analyzing a single input or output feature
    (univariate) at a time. The values are expected to be continuous, or at
    least numerous enough to fall in various/all the bins."""

    def __init__(
        self,
        bin_mode: BinMode,
        aggregation: Aggregation,
        metric: Metric,
        num_bins: int,
        bin_weights: Optional[List[float]] = None,
        bin_width: Optional[float] = None,
        provided_edges: Optional[List[float]] = None,
        add_outlier_edges: bool = True,
    ):
        """Store every setting as a same-named attribute, plus a fixed
        ``type`` tag."""
        # Assignment order is kept as-is: it is the attribute (__dict__) order.
        self.type = "UnivariateContinuous"
        self.bin_mode = bin_mode
        self.aggregation = aggregation
        self.metric = metric
        self.num_bins = num_bins
        self.bin_weights = bin_weights
        self.bin_width = bin_width
        self.provided_edges = provided_edges
        self.add_outlier_edges = add_outlier_edges
The UnivariateContinousSummarizer analyzes one input or output feature (Univariate) at a time. Expects the values to be continuous or at least numerous enough to fall in various/all the bins.
def __init__(
    self,
    bin_mode: BinMode,
    aggregation: Aggregation,
    metric: Metric,
    num_bins: int,
    bin_weights: Optional[List[float]] = None,
    bin_width: Optional[float] = None,
    provided_edges: Optional[List[float]] = None,
    add_outlier_edges: bool = True,
):
    """Store the given summarizer settings as attributes of the same name."""
    # Fixed tag stored alongside the settings.
    self.type = "UnivariateContinuous"
    # Targets are assigned left to right, so attribute order is unchanged.
    self.bin_mode, self.aggregation, self.metric = bin_mode, aggregation, metric
    self.num_bins, self.bin_weights, self.bin_width = (
        num_bins,
        bin_weights,
        bin_width,
    )
    self.provided_edges, self.add_outlier_edges = provided_edges, add_outlier_edges
Inherited Members
class SummarizerBuilder(ABC):
    """Abstract base for builders that produce a SummarizerConfig."""

    @abstractmethod
    def build(self) -> SummarizerConfig:
        """Create the summarizer configuration; subclasses must implement."""
Helper class that provides a standard way to create an ABC using inheritance.
class UnivariateContinousSummarizerBuilder(SummarizerBuilder):
    """Fluent builder for a UnivariateContinousSummarizerConfig.

    A new builder uses quantile binning, density aggregation, the PSI metric
    and 5 bins, with no bin weights, bin width or provided edges. Every
    ``add_*`` method returns the builder so calls can be chained.
    """

    def __init__(self):
        self.bin_mode = BinMode.QUANTILE
        self.aggregation = Aggregation.DENSITY
        self.metric = Metric.PSI
        self.num_bins = 5
        self.bin_weights: Optional[List[float]] = None
        self.bin_width: Optional[float] = None
        self.provided_edges: Optional[List[float]] = None
        self.add_outlier_edges = True

    def build(self) -> UnivariateContinousSummarizerConfig:
        """Create the summarizer config from the current settings.

        Raises:
            ValueError: if the bin mode is BinMode.PROVIDED but no edges are
                set, or edges are set with any other bin mode.
        """
        if self.bin_mode == BinMode.PROVIDED:
            if self.provided_edges is None:
                raise ValueError("Edges must be provided with BinMode.PROVIDED")
        elif self.provided_edges is not None:
            raise ValueError(
                f"Edges may not be provided with bin mode {self.bin_mode}"
            )

        return UnivariateContinousSummarizerConfig(
            self.bin_mode,
            self.aggregation,
            self.metric,
            self.num_bins,
            self.bin_weights,
            self.bin_width,
            self.provided_edges,
            self.add_outlier_edges,
        )

    def add_bin_mode(self, bin_mode: BinMode, edges: Optional[List[float]] = None):
        """Sets the binning mode. If BinMode.PROVIDED is specified a list of
        edges is also required.

        The previously stored edges are always replaced by ``edges`` (which
        may be None).

        Raises:
            ValueError: if BinMode.PROVIDED is given without edges, or the
                number of edges does not fit the current number of bins.
        """
        if bin_mode == BinMode.PROVIDED and edges is None:
            raise ValueError("Edges must be provided with BinMode.PROVIDED")

        # Validate and store the edges first so that a rejected edge list
        # does not leave the builder with a changed bin mode.
        self.add_bin_edges(edges)
        self.bin_mode = bin_mode
        return self

    def add_num_bins(self, num_bins: int):
        """Sets the number of bins.

        If weights or edges have been set previously and do not fit the new
        number of bins, they must be set to None first.

        Raises:
            ValueError: if existing weights or edges conflict with ``num_bins``.
        """
        changing = num_bins != self.num_bins

        if changing and self.bin_weights is not None:
            if num_bins + 2 != len(self.bin_weights):
                raise ValueError(
                    f"({len(self.bin_weights)}) bin weights have already been set. "
                    "Please set them to None before changing the number of bins."
                )

        if changing and self.provided_edges is not None:
            if len(self.provided_edges) not in (num_bins, num_bins + 1):
                raise ValueError(
                    f"({len(self.provided_edges)}) bin edges have already been set. "
                    "Please set them to None before changing the number of bins."
                )

        self.num_bins = num_bins
        return self

    def add_bin_weights(self, weights: Union[List[float], None]):
        """Specifies the weighting to be given to the bins. The number of
        weights must be 2 larger than the number of bins to accommodate
        outliers smaller and outliers larger than values seen in the baseline.
        The passed in values can be whole or real numbers and do not need to
        add up to 1 or any other specific value as they will be normalized
        during the score calculation phase.
        The weights passed in can be None to remove previously specified
        weights and to allow changing of the number of bins.

        Raises:
            ValueError: if the number of weights is not ``num_bins + 2``.
        """
        if weights is not None:
            if self.num_bins + 2 != len(weights):
                raise ValueError(
                    f"The number of weights ({len(weights)}) "
                    f"must be 2 more ({self.num_bins + 2}) than the "
                    f"number of bins ({self.num_bins}) to allow for the "
                    "left and right outlier bins."
                )
        self.bin_weights = weights
        return self

    def add_metric(self, metric: Metric):
        """Sets the metric mode."""
        self.metric = metric
        return self

    def add_aggregation(self, aggregation: Aggregation):
        """Sets the aggregation style."""
        self.aggregation = aggregation
        return self

    def add_bin_edges(self, edges: Union[List[float], None]):
        """Specifies the right hand side (max value) of the bins. The number
        of edges must be equal to or one more than the number of bins. When
        equal to the number of bins the edge for the left outlier bin is
        calculated from the baseline. When an additional edge is given (one
        more than the number of bins) the first (lowest) value is used as the
        max value for the left outlier bin. The max value for the right hand
        outlier bin is always Float MAX.

        The edges are stored sorted in ascending order; None clears them.

        Raises:
            ValueError: if the number of edges is neither ``num_bins`` nor
                ``num_bins + 1``.
        """
        if edges is not None:
            if len(edges) not in (self.num_bins, self.num_bins + 1):
                raise ValueError(
                    f"The number of edges ({len(edges)}) "
                    f"must be equal to ({self.num_bins}) or one more "
                    f"({self.num_bins + 1}) than the number of bins to account "
                    "for the left outlier bin."
                )
            edges = sorted(edges)

        self.provided_edges = edges
        return self
Builds the UnivariateContinousSummarizerConfig.
def __init__(self):
    """Start from the default summarizer settings."""
    # Binning / scoring defaults.
    self.bin_mode = BinMode.QUANTILE
    self.aggregation = Aggregation.DENSITY
    self.metric = Metric.PSI
    self.num_bins = 5
    # Optional settings; None means "not specified".
    self.bin_weights: Optional[List[float]] = None
    self.bin_width: Optional[float] = None
    self.provided_edges: Optional[List[float]] = None
    self.add_outlier_edges = True
def build(self) -> UnivariateContinousSummarizerConfig:
    """Create the summarizer config from the builder's current settings.

    Raises:
        ValueError: if the bin mode is BinMode.PROVIDED but no edges are set,
            or edges are set with any other bin mode.
    """
    if self.bin_mode == BinMode.PROVIDED:
        if self.provided_edges is None:
            raise ValueError("Edges must be provided with BinMode.PROVIDED")
    elif self.provided_edges is not None:
        raise ValueError(f"Edges may not be provided with bin mode {self.bin_mode}")

    # Returned directly; the original bound the result to a local named
    # ``sum``, shadowing the builtin.
    return UnivariateContinousSummarizerConfig(
        self.bin_mode,
        self.aggregation,
        self.metric,
        self.num_bins,
        self.bin_weights,
        self.bin_width,
        self.provided_edges,
        self.add_outlier_edges,
    )
def add_bin_mode(self, bin_mode: BinMode, edges: Optional[List[float]] = None):
    """Sets the binning mode. If BinMode.PROVIDED is specified a list of edges
    is also required.

    The previously stored edges are always replaced by ``edges`` (which may
    be None).

    Raises:
        ValueError: if BinMode.PROVIDED is given without edges, or
            ``add_bin_edges`` rejects the edges.
    """
    if bin_mode == BinMode.PROVIDED and edges is None:
        raise ValueError("Edges must be provided with BinMode.PROVIDED")

    # Validate and store the edges first so that a rejected edge list does
    # not leave the builder with a changed bin mode.
    self.add_bin_edges(edges)
    self.bin_mode = bin_mode
    return self
Sets the binning mode. If BinMode.PROVIDED is specified a list of edges is also required.
def add_num_bins(self, num_bins: int):
    """Sets the number of bins.

    If weights or edges have been set previously and do not fit the new
    number of bins, they must be set to None first.

    Raises:
        ValueError: if existing weights or edges conflict with ``num_bins``.
    """
    changing = num_bins != self.num_bins

    if changing and self.bin_weights is not None:
        # Weights include the two outlier bins, hence the + 2.
        if num_bins + 2 != len(self.bin_weights):
            raise ValueError(
                f"({len(self.bin_weights)}) bin weights have already been set. "
                "Please set them to None before changing the number of bins."
            )

    if changing and self.provided_edges is not None:
        if len(self.provided_edges) not in (num_bins, num_bins + 1):
            raise ValueError(
                f"({len(self.provided_edges)}) bin edges have already been set. "
                "Please set them to None before changing the number of bins."
            )

    self.num_bins = num_bins
    return self
Sets the number of bins. If weights have been previously set they must be set to none to allow changing the number of bins.
def add_bin_weights(self, weights: Union[List[float], None]):
    """Set (or clear) the weights applied to the bins.

    There must be two more weights than bins, to accommodate outliers smaller
    and outliers larger than the values seen in the baseline. The weights may
    be whole or real numbers and need not add up to 1 or any other specific
    value, as they are normalized during the score calculation phase.
    Passing None removes previously specified weights, which also allows the
    number of bins to be changed.
    """
    if weights is not None:
        expected = self.num_bins + 2
        if len(weights) != expected:
            raise ValueError(
                f"The number of weights ({len(weights)}) "
                f"must be 2 more ({expected}) than the "
                f"number of bins ({self.num_bins}) to allow for the "
                "left and right outlier bins."
            )

    self.bin_weights = weights
    return self
Specifies the weighting to be given to the bins. The number of weights must be 2 larger than the number of bins to accommodate outliers smaller and outliers larger than values seen in the baseline. The passed in values can be whole or real numbers and do not need to add up to 1 or any other specific value as they will be normalized during the score calculation phase. The weights passed in can be None to remove previously specified weights and to allow changing of the number of bins.
def add_metric(self, metric: Metric):
    """Store the metric to score with and return the builder for chaining."""
    self.metric = metric
    return self
Sets the metric mode.
def add_aggregation(self, aggregation: Aggregation):
    """Store the aggregation style and return the builder for chaining."""
    self.aggregation = aggregation
    return self
Sets the aggregation style.
def add_bin_edges(self, edges: Union[List[float], None]):
    """Set (or clear) the right hand side (max value) of each bin.

    The number of edges must equal the number of bins or be one more. With
    exactly one edge per bin, the edge for the left outlier bin is calculated
    from the baseline. With one additional edge, the first (lowest) value is
    used as the max value for the left outlier bin. The max value for the
    right hand outlier bin is always Float MAX.

    The edges are stored sorted in ascending order; None clears them.
    """
    if edges is None:
        self.provided_edges = None
        return self

    if len(edges) not in (self.num_bins, self.num_bins + 1):
        raise ValueError(
            f"The number of edges ({len(edges)}) "
            f"must be equal to ({self.num_bins}) or one more "
            f"({self.num_bins + 1}) than the number of bins to account "
            "for the left outlier bin."
        )

    self.provided_edges = sorted(edges)
    return self
Specifies the right hand side (max value) of the bins. The number of edges must be equal to or one more than the number of bins. When equal to the number of bins, the edge for the left outlier bin is calculated from the baseline. When an additional edge is given (one more than the number of bins), the first (lowest) value is used as the max value for the left outlier bin. The max value for the right hand outlier bin is always Float MAX.
class WindowConfig(object):
    """Describes a window of data that is compared against the baseline."""

    def __init__(
        self,
        pipeline_name: str,
        model_name: str,
        width: str,
        start: Optional[datetime] = None,
        interval: Optional[str] = None,
    ):
        """Store the window settings; note the shorter attribute names
        ``pipeline`` and ``model``."""
        # Targets are assigned left to right, so attribute order is unchanged.
        self.pipeline, self.model = pipeline_name, model_name
        self.width, self.start, self.interval = width, start, interval

    def to_json(self) -> str:
        """Return this config as an indented JSON string, encoding nested
        values with ConfigEncoder."""
        encoded = json.dumps(self, indent=4, default=ConfigEncoder)
        return encoded
Configures a window to be compared against the baseline.
def __init__(
    self,
    pipeline_name: str,
    model_name: str,
    width: str,
    start: Optional[datetime] = None,
    interval: Optional[str] = None,
):
    """Store the window settings; note the shorter attribute names
    ``pipeline`` and ``model``."""
    # Targets are assigned left to right, so attribute order is unchanged.
    self.pipeline, self.model = pipeline_name, model_name
    self.width, self.start, self.interval = width, start, interval
class WindowBuilder(object):
    """Helps build a WindowConfig. model and width are required but there are
    no good default values for them because they depend on the baseline. We
    leave it up to the assay builder to configure the window correctly after
    it is created.
    """

    def __init__(self, pipeline_name: str):
        self.pipeline = pipeline_name
        self.model: Optional[str] = None
        self.width: Optional[str] = "24 hours"
        self.start: Optional[datetime] = None
        self.interval: Optional[str] = None

    def add_model_name(self, model_name: str):
        """The model name (model_id) that the window should analyze."""
        self.model = model_name
        return self

    def _duration_kw_to_str(self, **kwargs) -> str:
        """Turn exactly one duration keyword into a string such as "12 hours".

        Accepts ``minute(s)``, ``hour(s)``, ``day(s)`` or ``week(s)``; the
        unit in the result is always plural.

        Raises:
            ValueError: if no duration keyword or more than one is given.
        """
        durations = [
            f"{kwargs[kw]} {unit}s"
            for unit in ("minute", "hour", "day", "week")
            for kw in (unit, unit + "s")
            if kw in kwargs
        ]

        # ValueError is still an Exception, so callers that caught the
        # previous bare Exception keep working.
        if not durations:
            raise ValueError(
                "Please specify one of 'minutes', 'hours', 'days' or 'weeks' keyword args"
            )
        if len(durations) > 1:
            raise ValueError(
                "Please specify only one of 'minutes', 'hours', 'days' or 'weeks' keyword args"
            )
        return durations[0]

    def add_width(self, **kwargs: int):
        """The width of the window to use when collecting data for analysis."""
        self.width = self._duration_kw_to_str(**kwargs)
        return self

    def add_interval(self, **kwargs: int):
        """The interval setting of the window, given with the same keyword
        args as ``add_width``.

        NOTE(review): presumably how often a new window is analyzed; confirm
        against the assay service."""
        self.interval = self._duration_kw_to_str(**kwargs)
        return self

    def add_start(self, start: datetime):
        """The start of the window; a naive datetime is made timezone aware
        by ``build``."""
        self.start = start
        return self

    def build(self) -> WindowConfig:
        """Create the WindowConfig; raises if model or width is unset."""
        start = ensure_tz(self.start) if self.start else None

        return WindowConfig(
            self.pipeline,
            unwrap(self.model),
            unwrap(self.width),
            start,
            self.interval,
        )
Helps build a WindowConfig. model and width are required but there are no good default values for them because they depend on the baseline. We leave it up to the assay builder to configure the window correctly after it is created.
def add_model_name(self, model_name: str):
    """Set the name (model_id) of the model this window analyzes; returns
    the builder for chaining."""
    self.model = model_name
    return self
The model name (model_id) that the window should analyze.
def add_width(self, **kwargs: int):
    """Set the width of the window used when collecting data for analysis.

    The keyword args are converted by ``_duration_kw_to_str``; returns the
    builder for chaining."""
    width = self._duration_kw_to_str(**kwargs)
    self.width = width
    return self
The width of the window to use when collecting data for analysis.
def add_interval(self, **kwargs: int):
    """Set the interval of the window from a duration keyword argument.

    The keyword args are converted by ``_duration_kw_to_str``; returns the
    builder for chaining.

    NOTE(review): presumably how often a new window is analyzed; confirm
    against the assay service."""
    interval = self._duration_kw_to_str(**kwargs)
    self.interval = interval
    return self
The interval setting of the window (presumably how often a new window is analyzed), given with the same keyword arguments as add_width.
def ConfigEncoder(o):
    """``default`` hook for ``json.dumps`` used by the config classes.

    Datetimes are encoded as ISO 8601 strings; any other object is encoded
    as its attribute dictionary.

    Raises:
        TypeError: if ``o`` has no ``__dict__``. A ``default`` hook is
            expected to raise TypeError for values it cannot encode; the
            bare attribute access used to leak an AttributeError instead.
    """
    if isinstance(o, datetime):
        return o.isoformat()

    try:
        return o.__dict__
    except AttributeError:
        raise TypeError(
            f"Object of type {type(o).__name__} is not JSON serializable"
        ) from None
Used to format datetimes as we need when encoding to JSON
class AssayConfig(object):
    """Configuration for an Assay record."""

    def __init__(
        self,
        client: Optional["Client"],
        name: str,
        pipeline_id: int,
        pipeline_name: str,
        active: bool,
        status: str,
        iopath: str,
        baseline: BaselineConfig,
        window: WindowConfig,
        summarizer: SummarizerConfig,
        warning_threshold: Optional[float],
        alert_threshold: float,
        run_until: Optional[datetime],
        workspace_id: Optional[int],
    ):
        self.client = client
        self.name = name
        self.pipeline_id = pipeline_id
        self.pipeline_name = pipeline_name
        self.active = active
        self.status = status
        self.iopath = iopath
        self.baseline = baseline
        self.window = window
        self.summarizer = summarizer
        self.warning_threshold = warning_threshold
        self.alert_threshold = alert_threshold
        self.run_until = run_until
        self.workspace_id = workspace_id

    def to_json(self) -> str:
        """Return the config as indented JSON, without the ``client`` and
        ``model_insights_url`` entries."""
        payload = self.__dict__.copy()
        payload.pop("client", None)
        payload.pop("model_insights_url", None)
        return json.dumps(payload, indent=4, default=ConfigEncoder)

    def interactive_run(self) -> AssayAnalysisList:
        """Runs this assay interactively. The assay is not saved to the
        database nor are analysis records saved to a Plateau topic. Useful for
        exploring pipeline inference data and experimenting with thresholds.

        Raises:
            Exception: if no client is set, or the service returns something
                other than a list of analyses (its ``msg`` is used).
        """
        client = unwrap(self.client)
        payload = {
            **json.loads(self.to_json()),
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        mlops_client = client.mlops()
        # Interactive runs get a longer timeout (value is 5 * 60).
        mlops_client.timeout = 5 * 60
        ret = assays_run_interactive.sync(
            client=mlops_client,
            json_body=AssaysRunInteractiveJsonBody.from_dict(payload),
        )

        analysis_list = []
        if ret is not None:
            if not isinstance(ret, List):
                raise Exception(ret.msg)

            analysis_list = [AssayAnalysis(ar.to_dict()) for ar in ret]

        return AssayAnalysisList(analysis_list)

    def interactive_baseline_run(self) -> Optional[AssayAnalysis]:
        """Runs the interactive baseline endpoint for this config.

        Returns the resulting AssayAnalysis, or None when the service returns
        nothing.

        Raises:
            Exception: if no client is set, or the response is not a
                successful baseline response (its ``msg`` is used).
        """
        client = unwrap(self.client)
        payload = {
            **json.loads(self.to_json()),
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        ret = assays_run_interactive_baseline.sync(
            client=client.mlops(),
            json_body=AssaysRunInteractiveBaselineJsonBody.from_dict(payload),
        )

        if ret is not None:
            if not isinstance(ret, AssaysRunInteractiveBaselineResponse200):
                raise Exception(ret.msg)

            aa = ret.to_dict()
            return AssayAnalysis(aa)

        return None

    def interactive_input_run(
        self, inferences: List[Dict], labels: Optional[List[str]] = None
    ) -> AssayAnalysisList:
        """Analyzes the inputs given to create an interactive run for each
        feature column. The assay is not saved to the database nor are
        analysis records saved to a Plateau topic. Useful for exploring inputs
        for possible causes when a difference is detected in the output.

        Args:
            inferences: inference records; values are read from
                ``record["original_data"]["tensor"][row][column]`` and the
                first record determines the rows and columns visited.
            labels: optional column labels used in the printed summary.

        Returns:
            The analyses of all columns combined; empty when ``inferences``
            is empty.

        ``self.iopath`` is pointed at each column in turn while the runs are
        made and restored to its previous value afterwards.
        """
        all_assays = []
        if not inferences:
            return AssayAnalysisList(all_assays)

        print("input column distinct_vals label largest_pct")
        # TODO extend this to work for any input shape
        inputs = inferences[0]["original_data"]["tensor"]
        original_iopath = self.iopath
        try:
            for idx0, row in enumerate(inputs):
                if labels and len(row) != len(labels):
                    # Report the length of the row actually compared; the
                    # record is only read through "original_data" here.
                    print(
                        f"Labels are not the same len {len(labels)} as inputs {len(row)}"
                    )
                for idx1 in range(len(row)):
                    values = [
                        inf["original_data"]["tensor"][idx0][idx1]
                        for inf in inferences
                    ]
                    counter = Counter(values)
                    largest_pct = max(counter.values()) / len(values)
                    distinct_values = len(counter)
                    # Too few labels must not abort the analysis.
                    label = labels[idx1] if labels and idx1 < len(labels) else ""
                    # TODO: Rule of thumb may need better way to distinguish
                    msg = (
                        "*** May not be continuous feature"
                        if distinct_values < 5 or largest_pct > 0.90
                        else ""
                    )
                    print(
                        f"{idx0:5} {idx1:5} {distinct_values:14} {label:15} {largest_pct:0.4f} {msg}"
                    )

                    self.iopath = f"inputs {idx0} {idx1}"
                    all_assays.extend(self.interactive_run().raw)
        finally:
            # Do not leave the config pointing at the last input column.
            self.iopath = original_iopath

        return AssayAnalysisList(all_assays)
Configuration for an Assay record.
def __init__(
    self,
    client: Optional["Client"],
    name: str,
    pipeline_id: int,
    pipeline_name: str,
    active: bool,
    status: str,
    iopath: str,
    baseline: BaselineConfig,
    window: WindowConfig,
    summarizer: SummarizerConfig,
    warning_threshold: Optional[float],
    alert_threshold: float,
    run_until: Optional[datetime],
    workspace_id: Optional[int],
):
    """Store every argument as an attribute of the same name."""
    # Targets are assigned left to right, so attribute order is unchanged.
    self.client, self.name = client, name
    self.pipeline_id, self.pipeline_name = pipeline_id, pipeline_name
    self.active, self.status, self.iopath = active, status, iopath
    self.baseline, self.window, self.summarizer = baseline, window, summarizer
    self.warning_threshold, self.alert_threshold = warning_threshold, alert_threshold
    self.run_until, self.workspace_id = run_until, workspace_id
def interactive_run(self) -> AssayAnalysisList:
    """Run this assay interactively and return the resulting analyses.

    The assay is not saved to the database nor are analysis records saved to
    a Plateau topic. Useful for exploring pipeline inference data and
    experimenting with thresholds.
    """
    client = unwrap(self.client)

    # The request body is this config's JSON plus a creation timestamp.
    payload = json.loads(self.to_json())
    payload["created_at"] = datetime.now(timezone.utc).isoformat()

    mlops_client = client.mlops()
    mlops_client.timeout = 5 * 60
    ret = assays_run_interactive.sync(
        client=mlops_client,
        json_body=AssaysRunInteractiveJsonBody.from_dict(payload),
    )

    if ret is None:
        return AssayAnalysisList([])
    if not isinstance(ret, List):
        # Anything other than a list is reported through its ``msg``.
        raise Exception(ret.msg)
    return AssayAnalysisList([AssayAnalysis(ar.to_dict()) for ar in ret])
Runs this assay interactively. The assay is not saved to the database nor are analysis records saved to a Plateau topic. Useful for exploring pipeline inference data and experimenting with thresholds.
def interactive_baseline_run(self) -> Optional[AssayAnalysis]:
    """Run the interactive baseline endpoint for this config.

    Returns the resulting AssayAnalysis, or None when the service returns
    nothing. Raises when no client is set or the response is not a
    successful baseline response.
    """
    client = unwrap(self.client)

    # The request body is this config's JSON plus a creation timestamp.
    payload = json.loads(self.to_json())
    payload["created_at"] = datetime.now(timezone.utc).isoformat()

    ret = assays_run_interactive_baseline.sync(
        client=client.mlops(),
        json_body=AssaysRunInteractiveBaselineJsonBody.from_dict(payload),
    )

    if ret is None:
        return None
    if not isinstance(ret, AssaysRunInteractiveBaselineResponse200):
        raise Exception(ret.msg)
    return AssayAnalysis(ret.to_dict())
def interactive_input_run(
    self, inferences: List[Dict], labels: Optional[List[str]] = None
) -> AssayAnalysisList:
    """Analyzes the inputs given to create an interactive run for each feature
    column. The assay is not saved to the database nor are analysis records
    saved to a Plateau topic. Useful for exploring inputs for possible causes
    when a difference is detected in the output.

    Args:
        inferences: inference records; values are read from
            ``record["original_data"]["tensor"][row][column]`` and the first
            record determines the rows and columns visited.
        labels: optional column labels used in the printed summary.

    Returns:
        The analyses of all columns combined; empty when ``inferences`` is
        empty.

    ``self.iopath`` is pointed at each column in turn while the runs are made
    and restored to its previous value afterwards.
    """
    all_assays = []
    if not inferences:
        return AssayAnalysisList(all_assays)

    print("input column distinct_vals label largest_pct")
    # TODO extend this to work for any input shape
    inputs = inferences[0]["original_data"]["tensor"]
    original_iopath = self.iopath
    try:
        for idx0, row in enumerate(inputs):
            if labels and len(row) != len(labels):
                # Report the length of the row actually compared; the record
                # is only read through "original_data" here.
                print(
                    f"Labels are not the same len {len(labels)} as inputs {len(row)}"
                )
            for idx1 in range(len(row)):
                values = [
                    inf["original_data"]["tensor"][idx0][idx1] for inf in inferences
                ]
                counter = Counter(values)
                largest_pct = max(counter.values()) / len(values)
                distinct_values = len(counter)
                # Too few labels must not abort the analysis.
                label = labels[idx1] if labels and idx1 < len(labels) else ""
                # TODO: Rule of thumb may need better way to distinguish
                msg = (
                    "*** May not be continuous feature"
                    if distinct_values < 5 or largest_pct > 0.90
                    else ""
                )
                print(
                    f"{idx0:5} {idx1:5} {distinct_values:14} {label:15} {largest_pct:0.4f} {msg}"
                )

                self.iopath = f"inputs {idx0} {idx1}"
                all_assays.extend(self.interactive_run().raw)
    finally:
        # Do not leave the config pointing at the last input column.
        self.iopath = original_iopath

    return AssayAnalysisList(all_assays)
Analyzes the inputs given to create an interactive run for each feature column. The assay is not saved to the database nor are analysis records saved to a Plateau topic. Useful for exploring inputs for possible causes when a difference is detected in the output.
class AssayBuilder(object):
    """Helps build an AssayConfig.

    Baseline, window and summarizer are produced by the builders held in
    ``baseline_builder``, ``window_builder_`` and ``summarizer_builder``.
    The ``add_*`` methods return the builder so calls can be chained.
    """

    def __init__(
        self,
        client: Optional["Client"],
        name: str,
        pipeline_id: int,
        pipeline_name: str,
        model_name: str,
        baseline_start: datetime,
        baseline_end: datetime,
    ):
        self.client = client
        self.name = name
        self.pipeline_id = pipeline_id
        self.pipeline_name: str = pipeline_name
        self.active = True
        self.status = "created"
        self.iopath: str = "output 0 0"
        self.baseline: Optional[BaselineConfig] = None
        self.window: Optional[WindowConfig] = None
        self.summarizer: Optional[SummarizerConfig] = None
        self.warning_threshold: Optional[float] = None
        self.alert_threshold: float = 0.25
        self.run_until: Optional[datetime] = None
        self.workspace_id = (
            None if self.client is None else self.client.get_current_workspace().id()
        )

        self.baseline_builder = (
            FixedBaselineBuilder(self.pipeline_name)
            .add_model_name(model_name)
            .add_start(baseline_start)
            .add_end(baseline_end)
        )
        self.window_builder_ = WindowBuilder(self.pipeline_name).add_model_name(
            model_name
        )

        self.summarizer_builder = UnivariateContinousSummarizerBuilder()

        # Cache for baseline_dataframe(); None means "not fetched yet".
        self._baseline_df: Optional[pd.DataFrame] = None

    def _baseline_title(self) -> str:
        """Title shared by the baseline plots."""
        return (
            f"Baseline '{self.iopath}' "
            f"{self.baseline_builder.start} - {self.baseline_builder.end}"
        )

    def baseline_dataframe(self):
        """Return the baseline inference dataframe, fetching it through the
        client on first use and caching it afterwards."""
        if self._baseline_df is None:
            client = unwrap(self.client)
            self._baseline_df = client.get_pipeline_inference_dataframe(
                client.get_topic_name(self.pipeline_id),
                unwrap(self.baseline_builder.start),
                unwrap(self.baseline_builder.end),
                self.baseline_builder.model_name,
            )
        return self._baseline_df

    def baseline_histogram(
        self, bins: Optional[Union[str, int]] = None, log_scale: bool = False
    ):
        """Show a histogram of the baseline data for the current iopath."""
        df = self.baseline_dataframe()

        n_bins = calc_bins(df.shape[0], bins)

        # The dataframe column is the iopath with spaces replaced by "_".
        col_name = self.iopath.replace(" ", "_")

        # type inference for the bins param to histplot is incorrect: str vs str|int.
        sns.histplot(data=df, x=col_name, bins=n_bins, log_scale=log_scale).set(  # type: ignore
            title=self._baseline_title()
        )
        plt.show()

    def baseline_kde(self, log_scale: bool = False):
        """Show a kernel density estimate plot of the baseline data for the
        current iopath."""
        df = self.baseline_dataframe()

        col_name = self.iopath.replace(" ", "_")

        sns.kdeplot(data=df, x=col_name, log_scale=log_scale).set(
            title=self._baseline_title()
        )
        plt.grid()
        plt.show()

    def baseline_ecdf(self, log_scale: bool = False):
        """Show an empirical cumulative distribution plot of the baseline
        data for the current iopath."""
        df = self.baseline_dataframe()

        col_name = self.iopath.replace(" ", "_")

        sns.ecdfplot(data=df, x=col_name, log_scale=log_scale).set(
            title=self._baseline_title()
        )
        plt.grid()
        plt.show()

    def build(self) -> AssayConfig:
        """Create the AssayConfig from the current settings.

        NOTE(review): baseline, window and summarizer are always rebuilt
        from their builders here, so values given to add_baseline,
        add_window or add_summarizer are overwritten. Confirm whether that
        is intended.
        """
        self.baseline = self.baseline_builder.build()
        self.window = self.window_builder_.build()
        self.summarizer = self.summarizer_builder.build()

        run_until = ensure_tz(self.run_until) if self.run_until else None

        return AssayConfig(
            self.client,
            self.name,
            self.pipeline_id,
            self.pipeline_name,
            self.active,
            self.status,
            self.iopath,
            unwrap(self.baseline),
            unwrap(self.window),
            unwrap(self.summarizer),
            self.warning_threshold,
            self.alert_threshold,
            run_until,
            self.workspace_id,
        )

    def upload(self) -> int:
        """Build the config and upload it through the client.

        Raises:
            RuntimeError: if this builder has no client.
        """
        config = self.build()

        if self.client:
            res = self.client.upload_assay(config)  # type: ignore
            return res
        raise RuntimeError(
            "Assay config was created for standalone and may only be used to generate configuration"
        )

    def add_name(self, name: str):
        """Specify the assay name"""
        self.name = name
        return self

    def add_active(self, active: bool):
        """Specify if the assay is active or not"""
        self.active = active
        return self

    def add_iopath(self, iopath: str):
        """Specify what the assay should analyze. Should start with input or
        output and have indexes (zero based) into row and column: For example
        'input 0 1' specifies the second column of the first input.

        Raises:
            ValueError: if ``iopath`` does not start with "input" or
                "output" (case-insensitive, surrounding whitespace ignored).
        """
        iopath = iopath.strip()
        # Raised explicitly: an ``assert`` is skipped under ``python -O``.
        if not iopath.lower().startswith(("input", "output")):
            raise ValueError(
                f"iopath must start with 'input' or 'output', got {iopath!r}"
            )
        self.iopath = iopath
        self._baseline_df = None
        return self

    def fixed_baseline_builder(self):
        """Creates a fixed baseline builder for this assay builder, replacing
        the current one, and returns it."""
        bb = FixedBaselineBuilder(unwrap(self.pipeline_name))
        self.baseline_builder = bb
        # The cached dataframe belonged to the replaced builder's window.
        self._baseline_df = None
        return bb

    def add_baseline(self, baseline: BaselineConfig):
        """Adds a specific baseline created elsewhere."""
        self.baseline = baseline
        self._baseline_df = None
        return self

    def window_builder(self):
        """Returns this assay builder's window builder."""
        return self.window_builder_

    def add_window(self, window: WindowConfig):
        """Adds a window created elsewhere."""
        self.window = window
        return self

    def univariate_continuous_summarizer(self) -> UnivariateContinousSummarizerBuilder:
        """Creates and adds an UCS to this assay builder."""
        ucsb = UnivariateContinousSummarizerBuilder()
        self.summarizer_builder = ucsb
        return ucsb

    def add_summarizer(self, summarizer: SummarizerConfig):
        """Adds the summarizer created elsewhere to this builder."""
        self.summarizer = summarizer
        return self

    def add_warning_threshold(self, warning_threshold: float):
        """Specify the warning threshold for this assay."""
        self.warning_threshold = warning_threshold
        return self

    def add_alert_threshold(self, alert_threshold: float):
        """Specify the alert threshold for this assay."""
        self.alert_threshold = alert_threshold
        return self

    def add_run_until(self, run_until: datetime):
        """How long should this assay run. Primarily useful for
        interactive runs to limit the number of analysis."""
        self.run_until = run_until
        return self
Helps build an AssayConfig
def __init__(
    self,
    client: Optional["Client"],
    name: str,
    pipeline_id: int,
    pipeline_name: str,
    model_name: str,
    baseline_start: datetime,
    baseline_end: datetime,
):
    """Set up the assay defaults and the baseline, window and summarizer
    builders for the given pipeline and model."""
    self.client = client
    self.name = name
    self.pipeline_id = pipeline_id
    self.pipeline_name: str = pipeline_name

    # Defaults that the add_* methods can override.
    self.active = True
    self.status = "created"
    self.iopath: str = "output 0 0"
    self.baseline: Optional[BaselineConfig] = None
    self.window: Optional[WindowConfig] = None
    self.summarizer: Optional[SummarizerConfig] = None
    self.warning_threshold: Optional[float] = None
    self.alert_threshold: float = 0.25
    self.run_until: Optional[datetime] = None

    # Without a client there is no workspace to look up.
    if self.client is None:
        self.workspace_id = None
    else:
        self.workspace_id = self.client.get_current_workspace().id()

    baseline_builder = FixedBaselineBuilder(self.pipeline_name)
    baseline_builder = baseline_builder.add_model_name(model_name)
    baseline_builder = baseline_builder.add_start(baseline_start)
    self.baseline_builder = baseline_builder.add_end(baseline_end)

    window_builder = WindowBuilder(self.pipeline_name)
    self.window_builder_ = window_builder.add_model_name(model_name)

    self.summarizer_builder = UnivariateContinousSummarizerBuilder()

    # Cache for baseline_dataframe(); None means "not fetched yet".
    self._baseline_df: Optional[pd.DataFrame] = None
def baseline_dataframe(self):
    """Return the baseline inference dataframe, fetching it through the
    client on first use and caching it afterwards."""
    if self._baseline_df is not None:
        return self._baseline_df

    client = unwrap(self.client)
    builder = self.baseline_builder
    self._baseline_df = client.get_pipeline_inference_dataframe(
        client.get_topic_name(self.pipeline_id),
        unwrap(builder.start),
        unwrap(builder.end),
        builder.model_name,
    )
    return self._baseline_df
def baseline_histogram(
    self, bins: Optional[Union[str, int]] = None, log_scale: bool = False
):
    """Show a histogram of the baseline data for the current iopath."""
    df = self.baseline_dataframe()
    n_bins = calc_bins(df.shape[0], bins)

    # The dataframe column is the iopath with spaces replaced by "_".
    col_name = self.iopath.replace(" ", "_")
    title = f"Baseline '{self.iopath}' {self.baseline_builder.start} - {self.baseline_builder.end}"

    # type inference for the bins param to histplot is incorrect: str vs str|int.
    axes = sns.histplot(data=df, x=col_name, bins=n_bins, log_scale=log_scale)  # type: ignore
    axes.set(title=title)
    plt.show()
def baseline_kde(self, log_scale: bool = False):
    """Show a kernel density estimate plot of the baseline data.

    Plots the column matching the current iopath (spaces replaced by
    underscores), adds a grid and displays the figure. ``log_scale`` is
    passed straight through to seaborn.
    """
    frame = self.baseline_dataframe()
    column = self.iopath.replace(" ", "_")
    title = f"Baseline '{self.iopath}' {self.baseline_builder.start} - {self.baseline_builder.end}"

    axes = sns.kdeplot(data=frame, x=column, log_scale=log_scale)
    axes.set(title=title)
    plt.grid()
    plt.show()
def baseline_ecdf(self, log_scale: bool = False):
    """Show an empirical cumulative distribution plot of the baseline data.

    Plots the column matching the current iopath (spaces replaced by
    underscores), adds a grid and displays the figure. ``log_scale`` is
    passed straight through to seaborn.
    """
    frame = self.baseline_dataframe()
    column = self.iopath.replace(" ", "_")
    title = f"Baseline '{self.iopath}' {self.baseline_builder.start} - {self.baseline_builder.end}"

    axes = sns.ecdfplot(data=frame, x=column, log_scale=log_scale)
    axes.set(title=title)
    plt.grid()
    plt.show()
def build(self) -> AssayConfig:
    """Create the AssayConfig from the current builder state.

    The baseline, window and summarizer configs are always regenerated
    from their respective builders here and stored on this instance,
    replacing whatever was previously held in those attributes. A
    ``run_until`` value, when set, is passed through ``ensure_tz`` first.
    """
    self.baseline = self.baseline_builder.build()
    self.window = self.window_builder_.build()
    self.summarizer = self.summarizer_builder.build()

    run_until = None
    if self.run_until:
        run_until = ensure_tz(self.run_until)

    baseline = unwrap(self.baseline)
    window = unwrap(self.window)
    summarizer = unwrap(self.summarizer)

    return AssayConfig(
        self.client,
        self.name,
        self.pipeline_id,
        self.pipeline_name,
        self.active,
        self.status,
        self.iopath,
        baseline,
        window,
        summarizer,
        self.warning_threshold,
        self.alert_threshold,
        run_until,
        self.workspace_id,
    )
def add_name(self, name: str):
    """Set the name of the assay.

    Returns this builder so calls can be chained.
    """
    self.name = name
    return self
Specify the assay name
def add_active(self, active: bool):
    """Set whether the assay is active.

    Returns this builder so calls can be chained.
    """
    self.active = active
    return self
Specify if the assay is active or not
def add_iopath(self, iopath: str):
    """Specify what the assay should analyze.

    The path should start with ``input`` or ``output`` (case-insensitive)
    and have zero-based indexes into row and column. For example
    ``'input 0 1'`` specifies the second column of the first input.
    Surrounding whitespace is stripped before the value is stored.

    Returns this builder so calls can be chained.

    Raises:
        AssertionError: if the path does not start with ``input`` or
            ``output``. The check is an explicit raise rather than an
            ``assert`` statement so it still runs under ``python -O``;
            the exception type is kept for existing callers.
    """
    iopath = iopath.strip()
    if not iopath.lower().startswith(("input", "output")):
        raise AssertionError(
            f"iopath must start with 'input' or 'output', got {iopath!r}"
        )
    self.iopath = iopath
    # Drop any cached baseline dataframe so it is fetched again on next use.
    self._baseline_df = None
    return self
Specify what the assay should analyze. Should start with input or output and have indexes (zero based) into row and column: For example 'input 0 1' specifies the second column of the first input.
def fixed_baseline_builder(self):
    """Create a fresh fixed baseline builder and install it on this assay builder.

    The new builder is created with only the pipeline name; it replaces
    the current baseline builder and is returned so the caller can
    configure it.
    """
    # NOTE(review): unlike add_baseline, this does not clear the cached
    # baseline dataframe -- confirm whether it should.
    fresh = FixedBaselineBuilder(unwrap(self.pipeline_name))
    self.baseline_builder = fresh
    return fresh
Creates a fixed baseline builder for this assay builder.
def add_baseline(self, baseline: BaselineConfig):
    """Store a baseline config that was created elsewhere.

    Also clears the cached baseline dataframe. Returns this builder so
    calls can be chained.
    """
    # NOTE(review): build() reassigns self.baseline from the baseline
    # builder, so a value stored here is replaced at build time -- confirm
    # this is intended.
    self.baseline = baseline
    self._baseline_df = None
    return self
Adds a specific baseline created elsewhere.
def window_builder(self):
    """Return the window builder owned by this assay builder.

    The same builder object is returned on every call, so changes made
    through it are visible to this assay builder.
    """
    return self.window_builder_
Returns this assay builder's window builder.
def add_window(self, window: WindowConfig):
    """Store a window config that was created elsewhere.

    Returns this builder so calls can be chained.
    """
    # NOTE(review): build() reassigns self.window from the window builder,
    # so a value stored here is replaced at build time -- confirm this is
    # intended.
    self.window = window
    return self
Adds a window created elsewhere.
def univariate_continuous_summarizer(self) -> UnivariateContinousSummarizerBuilder:
    """Create a new univariate continuous summarizer builder for this assay builder.

    The new builder replaces the current summarizer builder and is
    returned so the caller can configure it.
    """
    fresh = UnivariateContinousSummarizerBuilder()
    self.summarizer_builder = fresh
    return fresh
Creates a new univariate continuous summarizer (UCS) builder and sets it on this assay builder.
def add_summarizer(self, summarizer: SummarizerConfig):
    """Store a summarizer config that was created elsewhere.

    Returns this builder so calls can be chained.
    """
    # NOTE(review): build() reassigns self.summarizer from the summarizer
    # builder, so a value stored here is replaced at build time -- confirm
    # this is intended.
    self.summarizer = summarizer
    return self
Adds the summarizer created elsewhere to this builder.
def add_warning_threshold(self, warning_threshold: float):
    """Set the warning threshold for this assay.

    Returns this builder so calls can be chained.
    """
    self.warning_threshold = warning_threshold
    return self
Specify the warning threshold for this assay.
def add_alert_threshold(self, alert_threshold: float):
    """Set the alert threshold for this assay.

    Returns this builder so calls can be chained.
    """
    self.alert_threshold = alert_threshold
    return self
Specify the alert threshold for this assay.
def add_run_until(self, run_until: datetime):
    """Set how long this assay should run.

    Primarily useful for interactive runs to limit the number of
    analyses. Returns this builder so calls can be chained.
    """
    self.run_until = run_until
    return self
How long this assay should run. Primarily useful for interactive runs to limit the number of analyses.
def calc_bins(num_samples: int, bins: Optional[Union[str, int]]) -> Union[str, int]:
    """Choose the ``bins`` argument for a histogram.

    If the user specifies a number of bins or a strategy for calculating
    it, that value is returned unchanged. Otherwise the integer square
    root of ``num_samples`` is used, capped at 50 and never less than 1,
    so that an empty sample still yields a valid (positive) bin count.
    """
    if bins is not None:
        return bins

    # A bin count of zero is never valid, so clamp the lower end to 1.
    return max(1, min(int(math.sqrt(num_samples)), 50))
If the user specifies a number of bins or a strategy for calculating it, use that. Otherwise use the minimum of the square root of the number of samples and 50.