Source code for pimmslearn.imputation

"""
Reduce number of missing values of DDA massspectromety.

Imputation can be down by column.


"""

import logging
from typing import Dict, Tuple

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


RANDOMSEED = 123


[docs] def impute_shifted_normal( df_wide: pd.DataFrame, mean_shift: float = 1.8, std_shrinkage: float = 0.3, completeness: float = 0.6, axis=1, seed=RANDOMSEED, ) -> pd.Series: """Get replacements for missing values. Parameters ---------- df_wide : pd.DataFrame DataFrame in wide format, contains missing mean_shift : float, optional shift mean of feature by factor of standard deviations, by default 1.8 std_shrinkage : float, optional shrinks standard deviation by facotr, by default 0.3 axis: int, optional axis along which to impute, by default 1 (i.e. mean and std per row) Returns ------- pd.Series Series of imputed values in long format. """ # add check if there ar e NaNs or inf in data? see tests # np.isinf(df_wide).values.sum() if axis == 1: min_N = int(len(df_wide) * completeness) selected = df_wide.dropna(axis=1, thresh=min_N) elif axis == 0: min_M = int(df_wide.shape[1] * completeness) selected = df_wide.dropna(axis=0, thresh=min_M) else: raise ValueError( "Please specify axis as 0 or 1, for axis along which to impute." ) logger.info( f"Meand and standard deviation based on seleted data of shape {selected.shape}" ) mean = selected.mean(axis=axis) std = selected.std(axis=axis) mean_shifted = mean - (std * mean_shift) std_shrinked = std * std_shrinkage # rng=np.random.default_rng(seed=seed) # rng.normal() np.random.seed(seed) N, M = df_wide.shape if axis == 1: imputed_shifted_normal = pd.DataFrame( np.random.normal(mean_shifted, std_shrinked, size=(M, N)), index=df_wide.columns, columns=df_wide.index, ) imputed_shifted_normal = imputed_shifted_normal.T else: imputed_shifted_normal = pd.DataFrame( np.random.normal(mean_shifted, std_shrinked, size=(N, M)), index=df_wide.index, columns=df_wide.columns, ) imputed_shifted_normal = imputed_shifted_normal[df_wide.isna()].stack() return imputed_shifted_normal
[docs] def compute_moments_shift( observed: pd.Series, imputed: pd.Series, names: Tuple[str, str] = ("observed", "imputed"), ) -> Dict[str, float]: """Summary of overall shift of mean and std. dev. of predictions for a imputation method.""" name_obs, name_model = names data = { name: {"mean": series.mean(), "std": series.std()} for series, name in zip([observed, imputed], names) } observed, imputed = data[name_obs], data[name_model] data[name_model]["mean shift (in std)"] = ( observed["mean"] - imputed["mean"] ) / observed["std"] data[name_model]["std shrinkage"] = imputed["std"] / observed["std"] return data
[docs] def stats_by_level( series: pd.Series, index_level: int = 0, min_count: int = 5 ) -> pd.Series: """Count, mean and std. dev. by index level.""" agg = series.groupby(level=index_level).agg(["count", "mean", "std"]) agg = agg.loc[agg["count"] > min_count] return agg.mean()