Source code for pimmslearn.pandas.missing_data

"""Functionality related to analyzing missing values in a pandas DataFrame."""

from __future__ import annotations

import math
from pathlib import Path
from typing import Union

import pandas as pd


def percent_missing(df: pd.DataFrame) -> float:
    """Proportion of missing values in a DataFrame.

    Despite the function name, the result is a fraction between 0 and 1,
    not a value scaled to 100.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with data.

    Returns
    -------
    float
        Number of missing cells divided by the total number of cells.
    """
    # Column-wise NA counts, then summed over all columns.
    n_missing = df.isna().sum().sum()
    n_cells = math.prod(df.shape)
    return n_missing / n_cells
def percent_non_missing(df: pd.DataFrame) -> float:
    """Proportion of non-missing values in a DataFrame.

    Despite the function name, the result is a fraction between 0 and 1,
    not a value scaled to 100.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with data.

    Returns
    -------
    float
        Number of observed (non-NA) cells divided by the total number of
        cells.
    """
    # Column-wise counts of observed values, then summed over all columns.
    n_observed = df.notna().sum().sum()
    n_cells = math.prod(df.shape)
    return n_observed / n_cells
def list_files(folder=".") -> list[str]:
    """List the direct entries of a folder as POSIX-style path strings.

    Parameters
    ----------
    folder : str or path-like, optional
        Folder whose entries are listed, by default the current directory.

    Returns
    -------
    list[str]
        One string per entry yielded by ``Path(folder).iterdir()``, with
        forward slashes as separators. The order is whatever ``iterdir``
        yields.
    """
    entries = []
    for entry in Path(folder).iterdir():
        entries.append(entry.as_posix())
    return entries
def get_record(data: pd.DataFrame, columns_sample=False) -> dict:
    """Get summary record of data.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to summarise.
    columns_sample : bool, optional
        If true, ``N`` is taken from the number of columns and ``M`` from
        the number of rows; otherwise ``N`` is the number of rows and ``M``
        the number of columns. By default False.

    Returns
    -------
    dict
        Keys ``N``, ``M``, ``N_obs`` (observed cells), ``N_mis`` (missing
        cells) as ints and ``missing`` (share of missing cells) as float.
    """
    n_rows, n_cols = data.shape
    n_first, n_second = (n_cols, n_rows) if columns_sample else (n_rows, n_cols)

    observed = data.notna().sum().sum()
    # Everything that is not observed counts as missing.
    absent = n_first * n_second - observed
    share_missing = absent / (observed + absent)

    # Cast to built-in types so the record holds plain Python numbers.
    return {
        "N": int(n_first),
        "M": int(n_second),
        "N_obs": int(observed),
        "N_mis": int(absent),
        "missing": float(share_missing),
    }
def decompose_NAs(
    data: pd.DataFrame, level: Union[int, str], label: str = "summary"
) -> pd.DataFrame:
    """Decompose missing values by a level into real and indirectly imputed ones.

    Rows are grouped by the given index level. Within a group, a sample
    (column) in which *every* row of the group is missing contributes real
    missing values (one per row of the group). All other missing values of
    the group, i.e. those in samples where at least one row of the group was
    observed, are counted as indirectly imputed: in MS-based proteomics data
    they would be filled by the mean (or median) of the observed values of
    the group if such an aggregate is used for imputation.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame with samples in columns and features in rows.
    level : Union[int, str]
        Index level to group by. Examples: Protein groups, peptides or
        precursors in MS data.
    label : str, optional
        Index label of the single-row DataFrame returned, by default
        'summary'.

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame (row label ``label``) with the columns
        ``total_obs``, ``total_MVs``, ``real_MVs``,
        ``indirectly_imputed_MVs``, ``real_MVs_ratio``,
        ``indirectly_imputed_MVs_ratio`` and ``total_MVs_ratio``.
        Ratios whose denominator is zero are reported as missing.

    Raises
    ------
    AssertionError
        If the per-group counts do not add up to the total number of
        missing values in ``data``.
    """
    # Compute the NA mask once and reuse it for the totals and every group.
    is_missing = data.isna()
    total_mvs = int(is_missing.sum().sum())
    total_obs = int(data.size) - total_mvs

    real_mvs = 0
    ii_mvs = 0
    for _, group_mask in is_missing.groupby(level=level):
        n_rows = len(group_mask)  # normally 1 to 3 rows per group
        group_mvs = int(group_mask.sum().sum())
        # Samples in which no row of the group was observed. For a
        # single-row group this equals the row's own missing values, so no
        # special case is needed.
        group_real = int(group_mask.all(axis=0).sum()) * n_rows
        real_mvs += group_real
        ii_mvs += group_mvs - group_real

    assert total_mvs == real_mvs + ii_mvs, (
        "Missing values counted per group do not add up to the total number "
        "of missing values; check that every row belongs to a group of the "
        "given level."
    )

    # Undefined ratios are reported as NaN explicitly instead of relying on
    # a numpy 0/0 division and its RuntimeWarning.
    nan = float("nan")
    n_cells = int(data.size)
    summary = {
        "total_obs": total_obs,
        "total_MVs": total_mvs,
        "real_MVs": real_mvs,
        "indirectly_imputed_MVs": ii_mvs,
        "real_MVs_ratio": real_mvs / total_mvs if total_mvs else nan,
        "indirectly_imputed_MVs_ratio": ii_mvs / total_mvs if total_mvs else nan,
        "total_MVs_ratio": total_mvs / n_cells if n_cells else nan,
    }
    return pd.Series(summary).to_frame(name=label).T.convert_dtypes()