mgplot.summary_plot

summary_plot.py: Produce a summary plot for the data in a given DataFrame. The data is normalised to z-scores and scaled.

  1"""
  2summary_plot.py:
  3Produce a summary plot for the data in a given DataFrame.
  4The data is normalised to z-scores and scaled.
  5"""
  6
  7# --- imports
  8# system imports
  9from typing import Any
 10
 11# from collections.abc import Sequence
 12
 13# analytic third-party imports
 14from numpy import ndarray, array
 15from matplotlib.pyplot import Axes, subplots
 16from pandas import DataFrame, Period
 17
 18# local imports
 19from mgplot.settings import DataT
 20from mgplot.finalise_plot import make_legend
 21from mgplot.utilities import constrain_data, check_clean_timeseries
 22from mgplot.kw_type_checking import (
 23    report_kwargs,
 24    ExpectedTypeDict,
 25    validate_expected,
 26    validate_kwargs,
 27)
 28
 29
 30# --- constants
 31ZSCORES = "zscores"
 32ZSCALED = "zscaled"
 33
 34SUMMARY_KW_TYPES: ExpectedTypeDict = {
 35    "verbose": bool,
 36    "middle": float,
 37    "plot_type": str,
 38    "plot_from": (int, Period, type(None)),
 39    "legend": (type(None), bool, dict, (str, object)),
 40}
 41validate_expected(SUMMARY_KW_TYPES, "summary_plot")
 42
 43
 44# --- functions
 45def _calc_quantiles(middle: float) -> ndarray:
 46    """Calculate the quantiles for the middle of the data."""
 47    return array([(1 - middle) / 2.0, 1 - (1 - middle) / 2.0])
 48
 49
 50def _calculate_z(
 51    original: DataFrame,  # only contains the data points of interest
 52    middle: float,  # middle proportion of data to highlight (eg. 0.8)
 53    verbose: bool = False,  # print the summary data
 54) -> tuple[DataFrame, DataFrame]:
 55    """Calculate z-scores, scaled z-scores and middle quantiles.
 56    Return z_scores, z_scaled, q (which are the quantiles for the
 57    start/end of the middle proportion of data to highlight)."""
 58
 59    # calculate z-scores, scaled scores and middle quantiles
 60    z_scores: DataFrame = (original - original.mean()) / original.std()
 61    z_scaled: DataFrame = (
 62        # scale z-scores between -1 and +1
 63        (((z_scores - z_scores.min()) / (z_scores.max() - z_scores.min())) - 0.5)
 64        * 2
 65    )
 66    q_middle = _calc_quantiles(middle)
 67
 68    if verbose:
 69        frame = DataFrame(
 70            {
 71                "count": original.count(),
 72                "mean": original.mean(),
 73                "median": original.median(),
 74                "min shaded": original.quantile(q=q_middle[0]),
 75                "max shaded": original.quantile(q=q_middle[1]),
 76                "z-scores": z_scores.iloc[-1],
 77                "scaled": z_scaled.iloc[-1],
 78            }
 79        )
 80        print(frame)
 81
 82    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting
 83
 84
 85def _plot_middle_bars(
 86    adjusted: DataFrame,
 87    middle: float,
 88    kwargs: dict[str, Any],  # must be a dictionary, not a splat
 89) -> Axes:
 90    """Plot the middle (typically 80%) of the data as a bar.
 91    Note: also sets the x-axis limits in kwargs.
 92    Return the matplotlib Axes object."""
 93
 94    q = _calc_quantiles(middle)
 95    lo_hi: DataFrame = adjusted.quantile(q=q).T  # get the middle section of data
 96    span = 1.15
 97    space = 0.2
 98    low = min(adjusted.iloc[-1].min(), lo_hi.min().min(), -span) - space
 99    high = max(adjusted.iloc[-1].max(), lo_hi.max().max(), span) + space
100    kwargs["xlim"] = (low, high)  # remember the x-axis limits
101    _fig, ax = subplots()
102    ax.barh(
103        y=lo_hi.index,
104        width=lo_hi[q[1]] - lo_hi[q[0]],
105        left=lo_hi[q[0]],
106        color="#bbbbbb",
107        label=f"Middle {middle*100:0.0f}% of prints",
108    )
109    return ax
110
111
def _plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int,
) -> None:
    """Add the latest datapoints to the summary plot.

    Args:
    - ax: the Axes to draw on.
    - original: the unadjusted data; its last row supplies the label text.
    - adjusted: the normalised data; its last row supplies the x positions.
      It is expected to have the same columns, in the same order, as
      original, because labels are placed by column position.
    - f_size: font size for the value labels.
    """

    latest_row = adjusted.index[-1]
    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")
    # NOTE: f_size is deliberately not reassigned here. An earlier hard-coded
    # `f_size = 10` silently discarded whatever size the caller asked for.
    for col_num, col_name in enumerate(original.columns):
        ax.text(
            x=adjusted.at[latest_row, col_name],
            y=col_num,
            s=f"{original.at[latest_row, col_name]:.1f}",  # label shows the raw value
            ha="center",
            va="center",
            size=f_size,
        )
132
133
def _label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Annotate a scaled plot with guide lines, medians and extremes.

    Only acts when plot_type is ZSCALED. It then draws dashed guide lines
    at -1 and +1, marks the median of each adjusted column, and writes each
    original column's minimum and maximum at the left and right x-limits.
    The x-limits are read from kwargs["xlim"], which must already be set."""

    original, adjusted = data
    low, high = kwargs["xlim"]  # looked up for every plot type
    if plot_type != ZSCALED:
        return

    # dashed guides at the ends of the scaled range
    for edge in (-1, 1):
        ax.axvline(edge, color="#555555", linewidth=0.5, linestyle="--")

    ax.scatter(
        adjusted.median(),
        adjusted.columns,
        color="darkorchid",
        marker="x",
        s=5,
        label="Median",
    )

    shared = {"va": "center", "size": f_size}
    for col_num, col_name in enumerate(original.columns):
        column = original[col_name]
        # the padding space keeps the text clear of the plot edge
        ax.text(low, col_num, f" {column.min():.1f}", ha="left", **shared)
        ax.text(high, col_num, f"{column.max():.1f} ", ha="right", **shared)
173
174
def _horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Build the complete summary chart and return its Axes.

    Draws the middle-of-data bars, overlays the latest observations and,
    for scaled plots, labels the extremes."""

    # kwargs is passed as a real dictionary (not a splat) because the
    # bar-plotting step records the x-axis limits in it, and both the
    # labelling step and the caller read them back afterwards.
    font_size = 10
    axes = _plot_middle_bars(adjusted, middle, kwargs)
    _plot_latest_datapoint(
        ax=axes, original=original, adjusted=adjusted, f_size=font_size
    )
    _label_extremes(axes, (original, adjusted), plot_type, font_size, kwargs)
    return axes
196
197
198# public
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    For every column, a grey horizontal bar marks the middle proportion of
    the normalised observations, and the latest observation is overlaid as
    a point labelled with its original (un-normalised) value.

    Args:
    - data: DataFrame containing the data to summarise. The column names
      are used as labels for the plot.
    - kwargs: additional arguments for the plot, including:
        - plot_from: int | Period | None
        - verbose: if True, print the summary data.
        - middle: proportion of data to highlight (default is 0.8).
        - plot_type: "zscores" (the default) or "zscaled".
        - legend: legend settings handed to make_legend(); by default
          the legend is placed below the x-axis label.

    Raises:
    - TypeError: if data is not a pandas DataFrame.
    - ValueError: if plot_type is not one of the two supported values.

    Returns Axes.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)
    if plot_type not in (ZSCORES, ZSCALED):
        # Previously any other string fell through to the scaled data but
        # without its guide lines and extreme labels; fail loudly instead.
        raise ValueError(
            f"plot_type must be '{ZSCORES}' or '{ZSCALED}', not {plot_type!r}"
        )
    kwargs["legend"] = kwargs.get(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    # "xlim" was stored in kwargs while the bars were drawn
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax
ZSCORES = 'zscores'
ZSCALED = 'zscaled'
SUMMARY_KW_TYPES: mgplot.kw_type_checking.ExpectedTypeDict = {'verbose': <class 'bool'>, 'middle': <class 'float'>, 'plot_type': <class 'str'>, 'plot_from': (<class 'int'>, <class 'pandas._libs.tslibs.period.Period'>, <class 'NoneType'>), 'legend': (<class 'NoneType'>, <class 'bool'>, <class 'dict'>, (<class 'str'>, <class 'object'>))}
def summary_plot(data: ~DataT, **kwargs) -> matplotlib.axes._axes.Axes:
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    For every column, a grey horizontal bar marks the middle proportion of
    the normalised observations, and the latest observation is overlaid as
    a point labelled with its original (un-normalised) value.

    Args:
    - data: DataFrame containing the data to summarise. The column names
      are used as labels for the plot.
    - kwargs: additional arguments for the plot, including:
        - plot_from: int | Period | None
        - verbose: if True, print the summary data.
        - middle: proportion of data to highlight (default is 0.8).
        - plot_type: "zscores" (the default) or "zscaled".
        - legend: legend settings handed to make_legend(); by default
          the legend is placed below the x-axis label.

    Raises:
    - TypeError: if data is not a pandas DataFrame.
    - ValueError: if plot_type is not one of the two supported values.

    Returns Axes.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)
    if plot_type not in (ZSCORES, ZSCALED):
        # Previously any other string fell through to the scaled data but
        # without its guide lines and extreme labels; fail loudly instead.
        raise ValueError(
            f"plot_type must be '{ZSCORES}' or '{ZSCALED}', not {plot_type!r}"
        )
    kwargs["legend"] = kwargs.get(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    # "xlim" was stored in kwargs while the bars were drawn
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax

Plot a summary of historical data for a given DataFrame.

Args:

  • data: DataFrame containing the data to summarise. The column names are used as labels for the plot.
  • kwargs: additional arguments for the plot, including:
    • plot_from: int | Period | None
    • verbose: if True, print the summary data.
    • middle: proportion of data to highlight (default is 0.8).
    • plot_type: which plot to generate, either "zscores" (the default) or "zscaled".

Returns Axes.