Source code for hifis_surveyval.plotting.matplotlib_plotter

# hifis-surveyval
# Framework to help developing analysis scripts for the HIFIS Software survey.
#
# SPDX-FileCopyrightText: 2021 HIFIS Software <support@hifis.net>
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
This module provides functionality to plot survey data.

This module is called by survey analysis scripts and is a helper module
which processes `pandas` data-frames as input and transforms them into
informative plots.
The actual plotting is done by utilizing a separate plotting library called
`matplotlib`.
"""
import logging
import math
from inspect import FrameInfo, getmodulename, stack
from pathlib import Path
from textwrap import wrap
from typing import List, Optional, Tuple

from matplotlib import colors, pyplot, rcParams
from pandas import DataFrame

from hifis_surveyval.plotting.plotter import Plotter
from hifis_surveyval.plotting.supported_output_format import (
    SupportedOutputFormat,
)


class MatplotlibPlotter(Plotter):
    """Provides standardized plotting functionalities with matplotlib."""

    def _output_pyplot_image(self, output_file_stem: str = "") -> None:
        """
        Render pyplot images depending on output settings.

        Use this after constructing a pyplot plot to display the plot or
        render it into an image depending on the application settings.
        (Basically instead of pyplot.show() or pyplot.savefig())

        Args:
            output_file_stem (str):
                The stem of the desired filename (without extension).
                Defaults to an empty string which will prompt the automatic
                generation of a file name from the module that requested
                the plot.
        """
        if self.OUTPUT_FORMAT == SupportedOutputFormat.SCREEN:
            pyplot.show()
            pyplot.close()
            return

        if not output_file_stem:
            # Auto-generate the file stem from the name of the module that
            # requested the plot, assuming this is a survey evaluation
            # script.
            # This method is called by the plot_* methods of this very
            # module, so the frame directly above belongs to this file as
            # well. Walk up the stack until the first frame that lives in a
            # different file; that one is the actual caller.
            frames: List[FrameInfo] = stack()
            own_file: str = frames[0].filename
            calling_module_frame: FrameInfo = next(
                (frame for frame in frames[1:] if frame.filename != own_file),
                frames[-1],
            )
            # getmodulename() yields None for file names that do not look
            # like a Python module (e.g. interactive input), hence the
            # generic fallback stem.
            calling_module_name: Optional[str] = getmodulename(
                calling_module_frame.filename
            )
            output_file_stem = calling_module_name or "plot"

        file_ending: str = self.OUTPUT_FORMAT.name.lower()
        file_name: str = f"{output_file_stem}.{file_ending}"
        output_path: Path = self.ANALYSIS_OUTPUT_PATH / file_name

        if output_path.exists():
            logging.warning(f"Overriding existing output file {output_path}")

        pyplot.savefig(f"{output_path}")
        pyplot.close()

    @classmethod
    def _set_figure_size(cls, width: float, height: float) -> None:
        """
        Set the figure size so the main area fits into the desired space.

        The size of the current figure is scaled up by the fraction that the
        subplot area occupies within the figure, so that the subplot area
        itself ends up with the requested dimensions.

        Args:
            width (float):
                Desired plotting space width in inches.
            height (float):
                Desired plotting space height in inches.
        """
        figure = pyplot.gcf()
        subplot_parameters = figure.subplotpars

        # The subplot parameters are given as fractions of the figure size
        horizontal_fraction: float = (
            subplot_parameters.right - subplot_parameters.left
        )
        vertical_fraction: float = (
            subplot_parameters.top - subplot_parameters.bottom
        )

        figure.set_size_inches(
            float(width) / horizontal_fraction,
            float(height) / vertical_fraction,
        )

    @classmethod
    def _customize_figure_size(
        cls, figure_size: Optional[Tuple[float, float]]
    ) -> None:
        """
        Set custom figure size if figure size is given.

        Args:
            figure_size (Optional[Tuple[float, float]]):
                Tuple with two entries representing custom width and height
                of the plotting area. The figure size is left untouched if
                this is empty, None or does not have exactly two entries.
        """
        if figure_size and len(figure_size) == 2:
            cls._set_figure_size(figure_size[0], figure_size[1])

    def plot_bar_chart(
        self,
        data_frame: DataFrame,
        plot_file_name: str = "",
        show_legend: bool = True,
        show_value_labels: bool = True,
        round_value_labels_to_decimals: int = 0,
        **kwargs,
    ) -> None:
        r"""
        Plot given data-frame as a (stacked) bar chart.

        A pandas DataFrame is used as input data from which a (stacked) bar
        chart is generated. This DataFrame need be structured in a particular
        way. The actual data values are taken column by column and plotted as
        bars for each index entry. In case of a normal bar chart this means
        each sequence of bars / column is put next to each other while each
        sequence is grouped by the series / rows and labeled accordingly in
        the legend. By contrast in case of a stacked bar chart each sequence
        of bars / column is stacked on top of the previous instead of put
        next to each other and labeled accordingly in the legend.
        The index names are used to label the ticks along the x-axis, while
        the column names are used as labels in the legend.

        Args:
            data_frame (DataFrame):
                All data needed for this function to plot a stacked bar chart
                is encapsulated in this DataFrame.
            plot_file_name (str):
                Optional file name which is used to store the plot to a file.
                If this argument is an empty string (Default), a suitable
                file name is auto-generated.
            show_legend (bool):
                Used to control whether a legend is included in the plot or
                not. (Default: True)
            show_value_labels (bool):
                Enable or disable labels to show the values of each bar.
                (Default: True)
            round_value_labels_to_decimals (int):
                Round label values to the number of decimals. (Default: 0)
            \*\*kwargs:
                stacked (bool):
                    Prompts the generation of a stacked bar chart instead of
                    bars being grouped side-by-side. (Default: False)
                plot_title (str):
                    The title text for the plot. (Default: "")
                x_axis_label (str):
                    The label for the x-axis. (Default: "")
                x_label_rotation (int):
                    Allows to rotate the x-axis labels for better
                    readability. Value is given in degrees. (Default: 0)
                y_axis_label (str):
                    The label for the y-axis. (Default: "")
                legend_location (str):
                    Specifies positioning of the plot's legend.
                    (Default: "best")
                    See Also: matplotlib.axes.Axes.legend(loc)
                legend_anchor (BboxBase):
                    Allows to specify an anchor point for the plot's legend.
                    (Default: None)
                    See Also: matplotlib.axes.Axes.legend(bbox_to_anchor)
                ylim (Set[float]):
                    Allows to specify the maximum and minimum values of the
                    y axis. (Default: None)
                    See Also: matplotlib.axes.Axes.set_ylim
                figure_size (Tuple[float, float]):
                    Width and height of the plotting area in inches.
                    (Default: 0.25 inches of width per row of the data frame
                    and a height of 5 inches.)

        Raises:
            NotImplementedError:
                If the data frame has more columns than the color palette
                has colors.
        """
        rcParams.update({"figure.autolayout": True})

        # Color map Generation:
        # 1. Pick a suitable predefined colormap. Chose one with light colors
        # so the value labels can stand out by darkening them.
        base_color_map = pyplot.get_cmap("Pastel1")
        color_count = len(base_color_map.colors)
        column_count = len(data_frame.columns)

        if column_count > color_count:
            raise NotImplementedError(
                f"Attempt to plot a bar chart "
                f"with more than {color_count} columns per row. "
                f"Color palette has not enough colors for all of them. "
                f"(Is bar chart a fitting diagram type here?) "
                f"(Would transposing the data frame help?)"
            )

        # 2. Reduce the colormap to have only as much colors as there are
        # columns so each columns color index matches the column index
        # (If the colormap were larger one would have to do linear
        # interpolation to obtain the proper color.)
        color_map = colors.ListedColormap(
            list(base_color_map.colors[:column_count])
        )
        # This new colormap is handed to the graph and used in the value
        # labels

        plot_stacked: bool = kwargs.get("stacked", False)
        x_rotation: int = kwargs.get("x_label_rotation", 0)

        data_frame.plot(kind="bar", stacked=plot_stacked, cmap=color_map)

        axes = pyplot.gca()
        axes.set_title(kwargs.get("plot_title", ""))
        axes.set_xlabel(kwargs.get("x_axis_label", ""))
        axes.set_ylabel(kwargs.get("y_axis_label", ""))
        axes.set_xticklabels(
            data_frame.index.values,
            rotation=x_rotation,
            ha="right" if x_rotation else "center",
            rotation_mode="anchor",
        )

        if "ylim" in kwargs:
            axes.set_ylim(kwargs.get("ylim"))

        if show_legend:
            axes.legend(
                data_frame.columns.values,
                loc=kwargs.get("legend_location", "best"),
                bbox_to_anchor=kwargs.get("legend_anchor", None),
            )
        else:
            # There may be no legend to remove, in which case get_legend()
            # returns None.
            legend = axes.get_legend()
            if legend is not None:
                legend.remove()

        if show_value_labels:
            self._add_bar_chart_value_labels(
                data_frame,
                color_map,
                plot_stacked,
                round_value_labels_to_decimals,
            )

        # Use the custom figure size if given, otherwise derive the size of
        # the plotting area from the amount of rows to plot.
        default_width = len(data_frame.index) * 0.25
        default_height = 5
        self._customize_figure_size(
            kwargs.get("figure_size", (default_width, default_height))
        )

        self._output_pyplot_image(plot_file_name)

    @staticmethod
    def _round_value_label(value: float, decimals: int):
        """
        Round a value for usage as value label in a bar chart.

        Args:
            value (float):
                The value to be rounded.
            decimals (int):
                The number of decimals to round to.

        Returns:
            The rounded value. It is converted to an integer if it is
            rounded to 0 decimals so no trailing ".0" shows up in the label.
        """
        rounded = round(value, decimals)
        return int(rounded) if decimals == 0 else rounded

    @classmethod
    def _add_bar_chart_value_labels(
        cls,
        data_frame: DataFrame,
        color_map: colors.Colormap,
        plot_stacked: bool,
        round_value_labels_to_decimals: int = 0,
    ) -> None:
        """
        Add value labels to a bar chart.

        This is a helper method and not supposed to be called on its own.
        The labels are added to the current axes of pyplot.

        Args:
            data_frame (DataFrame):
                The data frame providing the data for the chart.
            color_map (Colormap):
                The color map used by the bar chart.
            plot_stacked (bool):
                Whether the chart is a stacked bar chart or not.
            round_value_labels_to_decimals (int):
                Round label values to the number of decimals. (Default: 0)
        """
        default_font_size = rcParams["font.size"]
        axes = pyplot.gca()

        # Loop over the data and annotate the actual values
        column_count: int = len(data_frame.columns)
        row_count: int = len(data_frame.index)

        if plot_stacked:
            sums = data_frame.sum(axis=1)
            minimum_value = sums.min()
            maximum_value = sums.max()

            if minimum_value > 0:
                minimum_value = 0  # The lower boundary of the plotting area

            y_range = maximum_value - minimum_value

            # The minimum height until which a label can be included
            # directly in the bar as fraction of the plot height
            min_include_height = y_range / 15

            for row in range(row_count):
                row_sum = 0

                for column in range(column_count):
                    value = data_frame.iloc[row, column]

                    # Skip values that can not be plotted in a stacked bar
                    # chart
                    if value is None or value == 0 or math.isnan(value):
                        continue

                    color = color_map.colors[column]
                    # Darken the color by setting each component to 50%
                    color_dark = [0.5 * component for component in color]

                    bar_center_y = row_sum + value / 2

                    # If the value fits inside the bar plot it directly in
                    # the center, otherwise move it to the outside and add a
                    # line as indicator.
                    if value > min_include_height:
                        text_x = row
                        text_y = bar_center_y
                    else:
                        # The label has to go outside the bar
                        # The following numbers come from eyeballing and
                        # implicit knowledge how the x-axis is organized.
                        # See also below in the non-stacked section.

                        # The text goes to the left of the bar if the column
                        # index is an odd number, otherwise the text is
                        # offset to the right.
                        # 0.375 is the middle of the white space between the
                        # section borders (relative coordinates -0.5 … +0.5)
                        # and bar borders (relative coordinates
                        # -0.25 … +0.25)
                        text_left: bool = bool(column % 2)
                        text_x_offset: float = -0.375 if text_left else 0.375
                        line_x_overhang: float = -0.1 if text_left else 0.1

                        # Move the text_y up a bit so it does not overlap the
                        # indicator line
                        text_y = bar_center_y + min_include_height / 4
                        text_x = row + text_x_offset

                        # Plot the indicator line
                        pyplot.plot(
                            [text_x + line_x_overhang, row],
                            [bar_center_y, bar_center_y],
                            color=color,
                        )

                    # Values with more than 2 digits get displayed with
                    # smaller font size to fit them better
                    axes.text(
                        text_x,
                        text_y,
                        cls._round_value_label(
                            value, round_value_labels_to_decimals
                        ),
                        ha="center",
                        va="center",
                        color=color_dark,
                        size=default_font_size if value < 100 else "smaller",
                    )

                    # The next bar of this row is stacked on top
                    row_sum += value
        else:
            maximum_value = data_frame.max().max()
            minimum_value = data_frame.min().min()

            # If not stacked, all bar texts share the same y-component
            # This has to account for the possible range of values
            # and negative values.
            # An offset from the x-axis of 1/10th of the most extreme value
            # was chosen for the text labels by iterative eyeballing.
            if abs(maximum_value) >= abs(minimum_value):
                text_y = maximum_value / 10
            else:
                text_y = minimum_value / 10

            # For the x-offset calculation, negative values mean left,
            # positive values mean right.

            # The x-axis is subdivided into <row_count> equal sections
            # Each section has by definition a width of 1 unit
            # The bar area starts at 0.25 units to the left
            # and ends at 0.25 units to the right
            bar_area_offset = 0.25
            bar_area_width = 0.5

            # In case of non-stacked plots, the bar area is equally
            # distributed across <column_count> bars and each label is offset
            # by a half bar width within the bar to center it
            bar_width = bar_area_width / column_count
            bar_offset = bar_width / 2

            for row in range(row_count):
                for column in range(column_count):
                    value = data_frame.iloc[row, column]

                    # Missing values have no bar and can not be rounded,
                    # so there is nothing to label.
                    if value is None or math.isnan(value):
                        continue

                    # Darken the color by setting each component to 50%
                    color_dark = [
                        0.5 * component
                        for component in color_map.colors[column]
                    ]

                    # Within each bar area the bar for each column starts at:
                    column_offset = bar_width * column

                    # And thus, the final x-component is distributed around
                    # <row>, which indicates the center of the section.
                    # The bar_area_offset gives the leftmost point of the bar
                    # area, the column_offset yields the leftmost point of
                    # the current column within the bar area and from there
                    # out, the bar_offset gives the center of the bar. So:
                    text_x = row - bar_area_offset + column_offset + bar_offset

                    # Values with more than 2 digits get displayed with
                    # smaller font size to fit them better
                    axes.text(
                        text_x,
                        text_y,
                        cls._round_value_label(
                            value, round_value_labels_to_decimals
                        ),
                        ha="center",
                        va="center",
                        color=color_dark,
                        size=default_font_size if value < 100 else "smaller",
                    )

    def plot_box_chart(
        self,
        data_frame: Optional[DataFrame] = None,
        data_frames: Optional[List[DataFrame]] = None,
        plot_file_name: str = "",
        **kwargs,
    ) -> None:
        r"""
        Generate a box chart from the provided data.

        Each column in the frame corresponds to one box with whiskers.

        Args:
            data_frame (Optional[DataFrame]):
                A singular data frame. Syntactic sugar for using
                `data_frame=x` instead of `data_frames=[x]`.
                If both arguments are given, this frame is appended to the
                list of frames. Empty frames are ignored.
            data_frames (Optional[List[DataFrame]]):
                A list of data frames. The columns with the same column index
                across all the frames are grouped together.
            plot_file_name (str):
                Optional file name which is used to store the plot to a file.
                If this argument is an empty string (Default), a suitable
                file name is auto-generated.
            \*\*kwargs:
                plot_title (str):
                    The title text for the plot. (Default: "")
                x_axis_label (str):
                    The label for the x-axis. (Default: "")
                x_label_rotation (int):
                    Allows to rotate the x-axis labels for better
                    readability. Value is given in degrees. (Default: 0)
                y_axis_label (str):
                    The label for the y-axis. (Default: "")
                figure_size (Tuple[float, float]):
                    Width and height of the plotting area in inches.
                    (Default: 0.25 inches of width per column and frame and
                    a height of 5 inches.)
        """
        rcParams.update({"figure.autolayout": True})
        x_rotation: int = kwargs.get("x_label_rotation", 0)

        # Create a new holder for the data frames to not manipulate the
        # list handed in by the caller
        _data_frames: List[DataFrame] = (
            data_frames.copy() if data_frames else []
        )

        if (data_frame is not None) and (not data_frame.empty):
            _data_frames.append(data_frame.copy())

        if not _data_frames:
            logging.warning(
                f"Attempt to create box plot {plot_file_name} without "
                f"data. Skipping this plot."
            )
            return

        column_count: int = max(len(frame.columns) for frame in _data_frames)
        frame_count: int = len(_data_frames)

        _, axes = pyplot.subplots()

        axes.set_title(kwargs.get("plot_title", ""))
        axes.set_xlabel(kwargs.get("x_axis_label", ""))
        axes.set_ylabel(kwargs.get("y_axis_label", ""))
        axes.grid(True, axis="y", which="major", color="lightgray")

        x_tick_labels: List[str] = []

        for column_index in range(column_count):
            group_position = column_index * frame_count
            # Counts the boxes already drawn within the current group
            data_frame_counter: int = 0

            for frame in _data_frames:
                # Handle frames with fewer columns
                if len(frame.columns) <= column_index:
                    continue

                # Using the factor 0.75 to keep plots of one group closer
                # together
                position = group_position + data_frame_counter * 0.75
                x_tick_labels.append(frame.columns[column_index])

                column_frame = frame.iloc[:, column_index].dropna()
                plot = axes.boxplot(
                    column_frame,
                    positions=[position],
                    widths=0.5,
                    patch_artist=True,
                )

                # Fill the box background so the boxes overlay the grid
                # lines
                for patch in plot["boxes"]:
                    patch.set_facecolor("wheat")

                data_frame_counter += 1

        axes.set_xticklabels(
            x_tick_labels,
            rotation=x_rotation,
            ha="right" if x_rotation else "center",
            rotation_mode="anchor",
        )

        # Use the custom figure size if given, otherwise derive the size of
        # the plotting area from the amount of boxes to plot.
        default_width = column_count * frame_count * 0.25
        default_height = 5
        self._customize_figure_size(
            kwargs.get("figure_size", (default_width, default_height))
        )

        self._output_pyplot_image(plot_file_name)

    def plot_matrix_chart(
        self,
        data_frame: DataFrame,
        plot_file_name: str = "",
        invert_colors: bool = False,
        value_label_decimals: int = 2,
        **kwargs,
    ) -> None:
        r"""
        Plot given data frame as matrix chart.

        Args:
            data_frame (DataFrame):
                The data frame to plot.
            plot_file_name (str):
                (Optional) The file name stem for the output file.
            invert_colors (bool):
                (Optional) Use an inverted color scheme for plotting.
                This is recommended for plotting data that represents the
                absence of something. Defaults to False.
            value_label_decimals (int):
                Round label values to the number of decimals. (Default: 2)
            \*\*kwargs:
                plot_title (str):
                    The title text for the plot. (Default: "")
                x_axis_label (str):
                    The label for the x-axis. (Default: "")
                x_label_rotation (int):
                    Allows to rotate the x-axis labels for better
                    readability. Value is given in degrees. (Default: 0)
                y_axis_label (str):
                    The label for the y-axis. (Default: "")
                figure_size (Tuple[float, float]):
                    Width and height of the plotting area in inches.
                    (Default: 0.35 inches of width per column and 0.5 inches
                    of height per row of the data frame.)
        """
        rcParams.update({"figure.autolayout": True})
        color_map_name = "Blues_r" if invert_colors else "Blues"
        color_map = pyplot.get_cmap(color_map_name)

        column_count: int = len(data_frame.columns)
        row_count: int = len(data_frame.index)

        # Break long labels into multiple lines of at most 20 characters.
        # Column labels are not necessarily strings, so convert them first.
        x_tick_labels = [
            "\n".join(wrap(str(label), 20))
            for label in data_frame.columns.values
        ]
        x_rotation: int = kwargs.get("x_label_rotation", 0)

        _, axes = pyplot.subplots()
        axes.imshow(data_frame, aspect="auto", cmap=color_map)

        axes.set_title(kwargs.get("plot_title", ""))
        axes.set_xlabel(kwargs.get("x_axis_label", ""))
        axes.set_ylabel(kwargs.get("y_axis_label", ""))
        axes.set_xticks(range(column_count))
        axes.set_yticks(range(row_count))
        axes.set_xticklabels(
            x_tick_labels,
            rotation=x_rotation,
            ha="right" if x_rotation else "center",
            rotation_mode="anchor",
        )
        axes.set_yticklabels(data_frame.index.values)

        # Loop over the data and annotate the actual values.
        # Cells on the dark end of the color map get a white label instead
        # of a black one to keep it readable.
        upper_limit = data_frame.max().max()
        threshold = 0.25 * upper_limit if invert_colors else 0.75 * upper_limit

        for i in range(row_count):
            for j in range(column_count):
                value = round(data_frame.iloc[i, j], value_label_decimals)
                switch_color = bool(
                    (value < threshold)
                    if invert_colors
                    else (value > threshold)
                )
                axes.text(
                    j,
                    i,
                    value,
                    ha="center",
                    va="center",
                    color="white" if switch_color else "black",
                )

        # Use the custom figure size if given, otherwise derive the size of
        # the plotting area from the dimensions of the data frame.
        default_width = column_count * 0.35
        default_height = row_count * 0.5
        self._customize_figure_size(
            kwargs.get("figure_size", (default_width, default_height))
        )

        self._output_pyplot_image(plot_file_name)