# Source code for pimmslearn.plotting

from __future__ import annotations

import logging
import pathlib
from functools import partial

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

import pimmslearn.pandas
from pimmslearn.plotting import data, defaults, errors, plotly
from pimmslearn.plotting.errors import plot_rolling_error

# Global plotting defaults; these run once, as a side effect of importing
# this module.
seaborn.set_style("whitegrid")

plt.rcParams.update(
    {
        # Wide default canvas; smaller alternatives tried were [4, 2], [4, 3].
        "figure.figsize": [16.0, 7.0],
        # Font type 42 for PDF/PS output (TrueType according to the
        # matplotlib rcParams documentation).
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "figure.dpi": 147,
    }
)

logger = logging.getLogger(__name__)

__all__ = [
    # re-exported submodules and helpers
    "plotly",
    "data",
    "defaults",
    "errors",
    "plot_rolling_error",
    # defined in this file
    "savefig",
    "select_xticks",
    "select_dates",
    "make_large_descriptors",
    "plot_feat_counts",
    "plot_cutoffs",
]


# Extensions that are recognised as an already-present figure suffix in the
# ``name`` passed to `_savefig` (and therefore stripped before saving).
_FIGURE_SUFFIXES = frozenset(
    {".png", ".pdf", ".svg", ".eps", ".ps", ".jpg", ".jpeg", ".tif", ".tiff"}
)


def _savefig(
    fig,
    name,
    folder: pathlib.Path | str = ".",
    pdf=True,
    dpi=300,  # matplotlib's own default would be 'figure'
    tight_layout=True,
):
    """Save matplotlib Figure (having method `savefig`) as png and pdf.

    Parameters
    ----------
    fig :
        Object offering ``savefig(path, dpi=...)`` and, if `tight_layout`
        is set, ``tight_layout()``.
    name : str or path-like
        File name of the figure, optionally including sub-folders relative
        to `folder`. A trailing figure extension (e.g. ``.png``) is ignored;
        any other dot in the name is kept as part of the file name.
    folder : pathlib.Path or str, optional
        Base folder, by default the current working directory.
    pdf : bool, optional
        Additionally write a ``.pdf`` next to the ``.png``, by default True.
    dpi : int, optional
        Resolution passed on to ``fig.savefig``, by default 300.
    tight_layout : bool, optional
        Call ``fig.tight_layout()`` before saving, by default True.
    """
    fname = pathlib.Path(folder) / name
    # `name` may itself specify folders, so create the final parent folder.
    fname.parent.mkdir(exist_ok=True, parents=True)
    # Only strip a real figure extension. `Path.with_suffix` alone would treat
    # everything after the last dot as the suffix and e.g. truncate
    # "hist_cutoff_0.25" to "hist_cutoff_0" before appending ".png".
    base = fname
    if fname.suffix.lower() in _FIGURE_SUFFIXES:
        base = fname.with_suffix("")
    if tight_layout:
        fig.tight_layout()
    fig.savefig(base.with_name(f"{base.name}.png"), dpi=dpi)
    if pdf:
        fig.savefig(base.with_name(f"{base.name}.pdf"), dpi=dpi)
    logger.info(f"Saved Figures to {fname}")


savefig = _savefig


def select_xticks(ax: matplotlib.axes.Axes, max_ticks: int = 50) -> list:
    """Limit the number of xticks displayed.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes object to manipulate
    max_ticks : int, optional
        maximum number of set ticks on x-axis, by default 50

    Returns
    -------
    list
        If ticks were thinned out: whatever ``ax.set_xticks`` returns for the
        reduced selection. Otherwise the unchanged result of
        ``ax.get_xticks()``.
    """
    x_ticks = ax.get_xticks()
    # Ceiling division: with floor division every n-th tick for n = len // max
    # can still leave more than `max_ticks` ticks (e.g. 149 ticks, max 50
    # -> step 2 -> 75 ticks), and nothing is thinned between max and 2 * max.
    offset = -(-len(x_ticks) // max_ticks)
    if offset > 1:
        return ax.set_xticks(x_ticks[::offset])
    return x_ticks
def select_dates(date_series: pd.Series, max_ticks=30) -> np.ndarray:
    """Get unique dates (single days) for selection in pd.plot.line with
    xticks argument.

    Parameters
    ----------
    date_series : pd.Series
        datetime series to use (values, not index)
    max_ticks : int, optional
        maximum number of unique ticks to select, by default 30

    Returns
    -------
    np.ndarray
        Unique dates of the series (``date_series.dt.date.unique()``), thinned
        out to every n-th entry if there are more than `max_ticks` of them.
    """
    xticks = date_series.dt.date.unique()
    # Ceiling division so that the selection never exceeds `max_ticks`
    # (floor division would e.g. keep 34 of 100 dates for max_ticks=30).
    offset = -(-len(xticks) // max_ticks)
    if offset > 1:
        return xticks[::offset]
    return xticks
def make_large_descriptors(size="xx-large"):
    """Helper function to have very large titles, labels and tick texts for
    matplotlib plots per default.

    Parameters
    ----------
    size : str, optional
        fontsize or allowed category. Change default if necessary,
        default 'xx-large'
    """
    plt.rcParams.update(
        {
            k: size
            for k in [
                "xtick.labelsize",
                "ytick.labelsize",
                "axes.titlesize",
                "axes.labelsize",
                "legend.fontsize",
                "legend.title_fontsize",
            ]
        }
    )


set_font_sizes = make_large_descriptors


def add_prop_as_second_yaxis(
    ax: matplotlib.axes.Axes, n_samples: int, format_str: str = "{x:,.3f}"
) -> matplotlib.axes.Axes:
    """Add proportion as second axis. Try to align cleverly

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes for which you want to add a second y-axis
    n_samples : int
        Number of total samples (to normalize against)
    format_str : str, optional
        Format string for the tick labels of the second axis,
        by default '{x:,.3f}'

    Returns
    -------
    matplotlib.axes.Axes
        Second layover twin Axes with right-hand side y-axis
    """
    ax2 = ax.twinx()
    lower, upper = ax.get_ybound()
    n_min, n_max = np.round((lower, upper))
    logger.info(f"{n_min = }, {n_max = }")
    # The y-bounds of the count axis expressed as proportion of all samples.
    lower_prop = lower / n_samples
    upper_prop = upper / n_samples
    logger.info(f"{lower_prop = }, {upper_prop = }")
    ax2.set_ybound(lower_prop, upper_prop)
    # Re-use the tick positions of the count axis, leaving out the first and
    # the last one.
    _ = ax2.set_yticks(ax.get_yticks()[1:-1] / n_samples)
    ax2.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str))
    return ax2


def add_height_to_barplot(ax, size=5, rotated=False):
    """Annotate each bar in ``ax.patches`` with its height.

    The height is written with two decimals at the top of the bar. Bars with
    a height of zero (or NaN) are labelled "NA" at height 0.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes holding the bars; its ``annotate`` method is called once per
        bar and is left untouched otherwise.
    size : int, optional
        Font size of the labels; also used to derive the label offset in
        points, by default 5
    rotated : bool, optional
        Write the labels rotated by 90 degrees, by default False

    Returns
    -------
    matplotlib.axes.Axes
        The Axes passed in.
    """
    # Keyword arguments shared by all labels. They are kept local instead of
    # overwriting `ax.annotate` with a partial, which would leak the changed
    # defaults into every later `ax.annotate` call on this Axes.
    style = dict(ha="center", size=size, textcoords="offset points")
    if rotated:
        style.update(xytext=(1, int(size / 3)), rotation=90, va="bottom")
    else:
        style.update(xytext=(0, int(size / 2)), rotation=0, va="center")

    for bar in ax.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        if not height or np.isnan(height):
            ax.annotate(text="NA", xy=(x_center, 0.0), **style)
            continue
        ax.annotate(text=format(height, ".2f"), xy=(x_center, height), **style)
    return ax


def add_text_to_barplot(ax, text, size=5):
    """Write one text per bar, rotated by 90 degrees, below the bar's top.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes holding the bars (``ax.patches``).
    text : iterable of str
        Labels, paired with ``ax.patches`` in order. Bars with a height of
        zero are skipped (their label is not used).
    size : int, optional
        Font size of the labels, by default 5

    Returns
    -------
    matplotlib.axes.Axes
        The Axes passed in.
    """
    for bar, text_ in zip(ax.patches, text):
        logger.debug(f"{bar = }, {text_ = }, {bar.get_height() = }")
        if not bar.get_height():
            continue
        ax.annotate(
            text=text_,
            xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
            xytext=(1, -5),
            rotation=90,
            ha="center",
            va="top",
            size=size,
            textcoords="offset points",
        )
    return ax


def format_large_numbers(
    ax: matplotlib.axes.Axes, format_str: str = "{x:,.0f}"
) -> matplotlib.axes.Axes:
    """Format large integer numbers to be read more easily.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes which labels should be manipulated.
    format_str : str, optional
        Default float format string, by default '{x:,.0f}'

    Returns
    -------
    matplotlib.axes.Axes
        The Axes passed in, with the formatter set on both its x- and y-axis.
    """
    ax.xaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str))
    ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str))
    return ax
def plot_feat_counts(
    df_counts: pd.DataFrame,
    feat_name: str,
    n_samples: int,
    ax=None,
    figsize=(15, 10),
    count_col="counts",
    **kwargs,
):
    """Plot the counts column and add the proportion as a second y-axis.

    Parameters
    ----------
    df_counts : pd.DataFrame
        DataFrame holding the counts in column `count_col`.
    feat_name : str
        Name of the feature type, used in the x-label and the title.
    n_samples : int
        Total number of samples, used in the title and to compute the
        proportions of the second y-axis.
    ax : matplotlib.axes.Axes, optional
        Axes to plot on, passed on to the pandas plot call, by default None
    figsize : tuple, optional
        Figure size passed on to the pandas plot call, by default (15, 10)
    count_col : str, optional
        Column of `df_counts` to plot, by default 'counts'
    **kwargs
        Further keyword arguments for the pandas plot call; they take
        precedence over the default ylabel, xlabel and title.

    Returns
    -------
    matplotlib.axes.Axes
        Axes with the counts (left-hand side y-axis).
    """
    n_features = len(df_counts)
    plot_kwargs = {
        "ylabel": "count",
        "xlabel": f"{feat_name} ordered by completeness",
        "title": f"Count and proportion of {n_features:,d} {feat_name}s over {n_samples:,d} samples",
        **kwargs,  # caller-supplied values win over the defaults above
    }
    counts = df_counts[count_col]
    ax = counts.plot(figsize=figsize, grid=True, ax=ax, **plot_kwargs)
    # Second axis is built explicitly so that its bounds follow the count axis.
    twin_ax = add_prop_as_second_yaxis(ax=ax, n_samples=n_samples)
    twin_ax.set_ylabel("proportion")
    return format_large_numbers(ax=ax)
def plot_counts(
    df_counts: pd.DataFrame,
    n_samples,
    feat_col_name: str = "count",
    feature_name=None,
    ax=None,
    prop_feat=0.25,
    min_feat_prop=0.01,
    **kwargs,
):
    """Plot counts based on get_df_counts.

    Draws the counts using `plot_feat_counts`, marks the `prop_feat` cutoff
    with a red vertical line and an annotation, and limits the x-axis based
    on `min_feat_prop`.

    Note: a column "prop" (counts divided by `n_samples`) is added to
    `df_counts` in place.

    Parameters
    ----------
    df_counts : pd.DataFrame
        DataFrame holding the counts in column `feat_col_name`.
    n_samples : int
        Total number of samples (to normalize against).
    feat_col_name : str, optional
        Column with the counts, by default 'count'
    feature_name : str, optional
        Name used in labels and title; `feat_col_name` if None.
    ax : matplotlib.axes.Axes, optional
        Axes to plot on, by default None
    prop_feat : float, optional
        Proportion at which the cutoff is marked, by default 0.25
    min_feat_prop : float, optional
        Proportion used to find the upper limit of the x-axis, by default 0.01
    **kwargs
        Passed on to `plot_feat_counts`.

    Returns
    -------
    matplotlib.axes.Axes
        Axes with the counts.
    """
    if feature_name is None:
        feature_name = feat_col_name
    ax = plot_feat_counts(
        df_counts,
        feat_name=feature_name,
        n_samples=n_samples,
        count_col=feat_col_name,
        ax=ax,
        **kwargs,
    )
    df_counts["prop"] = df_counts[feat_col_name] / n_samples
    # NOTE(review): presumably the last index label whose proportion still
    # reaches `prop` -- confirm against pimmslearn.pandas.
    n_feat_cutoff = pimmslearn.pandas.get_last_index_matching_proportion(
        df_counts=df_counts, prop=prop_feat, prop_col="prop"
    )
    n_samples_cutoff = df_counts.loc[n_feat_cutoff, feat_col_name]
    logger.info(f"{n_feat_cutoff = }, {n_samples_cutoff = }")
    x_lim_max = pimmslearn.pandas.get_last_index_matching_proportion(
        df_counts, min_feat_prop, prop_col="prop"
    )
    logger.info(f"{x_lim_max = }")
    ax.set_xlim(-1, x_lim_max)
    ax.axvline(n_feat_cutoff, c="red")
    ax.annotate(
        # ':g' avoids float artefacts in the label such as
        # "28.999999999999996% cutoff" for prop_feat=0.29.
        f"{prop_feat * 100:g}% cutoff",
        xy=(n_feat_cutoff, n_samples_cutoff),
        xytext=(n_feat_cutoff + 0.1 * x_lim_max, n_samples_cutoff),
        fontsize=16,
        arrowprops=dict(facecolor="black", shrink=0.05),
    )
    return ax
def plot_cutoffs(
    df: pd.DataFrame,
    feat_completness_over_samples: int = None,
    min_feat_in_sample: int = None,
) -> tuple[matplotlib.figure.Figure, np.array[matplotlib.axes.Axes]]:
    """Plot the number of non-missing values per column and per row.

    The left panel shows the sorted non-missing counts per column (features),
    the right panel the sorted non-missing counts per row (samples). Each
    panel optionally gets a horizontal line marking a cutoff.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame in wide data format.
    feat_completness_over_samples : int, optional
        horizontal line to plot as cutoff for features, by default None
    min_feat_in_sample : int, optional
        horizontal line to plot as cutoff for samples, by default None

    Returns
    -------
    tuple[matplotlib.figure.Figure, np.array[matplotlib.axes.Axes]]
        Figure and its two Axes as returned by ``plt.subplots(1, 2)``.
    """
    is_present = df.notna()
    fig, axes = plt.subplots(1, 2)
    # (Axes, axis summed over, y-label, x-label, optional cutoff) per panel
    panels = (
        (axes[0], 0, "count samples", "feature name", feat_completness_over_samples),
        (axes[1], 1, "count features", "sample name", min_feat_in_sample),
    )
    for panel_ax, sum_axis, y_label, x_label, cutoff in panels:
        sorted_counts = is_present.sum(axis=sum_axis).sort_values()
        sorted_counts.plot(rot=90, ax=panel_ax, ylabel=y_label, xlabel=x_label)
        if cutoff is not None:
            panel_ax.axhline(cutoff)
    return fig, axes
def only_every_x_ticks(ax, x=2, axis=None):
    """Keep only every x-th tick.

    Both axes are thinned out if `axis` is None; only the x-axis for
    ``axis=0`` and only the y-axis for ``axis=1``. Any other value raises a
    ValueError. Returns the Axes passed in.
    """
    if axis is None:
        selected = ("x", "y")
    elif axis == 0:
        selected = ("x",)
    elif axis == 1:
        selected = ("y",)
    else:
        raise ValueError(f"axis must be 0 or 1, got {axis}")
    for which in selected:
        current = getattr(ax, f"get_{which}ticks")()
        getattr(ax, f"set_{which}ticks")(current[::x])
    return ax


def use_first_n_chars_in_labels(ax, x=2):
    """Shorten the tick labels of both axes to their first `x` characters.

    Returns the Axes passed in.
    """

    def _shortened(tick_labels):
        return [tick_label.get_text()[:x] for tick_label in tick_labels]

    ax.set_xticklabels(_shortened(ax.get_xticklabels()))
    ax.set_yticklabels(_shortened(ax.get_yticklabels()))
    return ax


def split_xticklabels(ax, PG_SEPARATOR=";"):
    """Replace each x tick label by its part before the first PG_SEPARATOR.

    Nothing is changed if PG_SEPARATOR is None. Returns the Axes passed in.
    """
    if PG_SEPARATOR is None:
        return ax
    first_parts = []
    for tick_label in ax.get_xticklabels():
        first_parts.append(tick_label.get_text().split(PG_SEPARATOR)[0])
    ax.set_xticklabels(first_parts)
    return ax