Source code for pimmslearn.pandas.missing_data
"""Functionality related to analyzing missing values in a pandas DataFrame."""
from __future__ import annotations
import math
from pathlib import Path
from typing import Union
import pandas as pd
[docs]
def percent_missing(df: pd.DataFrame) -> float:
    """Proportion of missing values among all cells of a DataFrame.

    Despite the function name, the result is a fraction between 0 and 1,
    not a percentage between 0 and 100.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with data.

    Returns
    -------
    float
        Number of missing cells divided by the total number of cells.
        ``nan`` if the DataFrame has no cells (zero rows or zero columns).
    """
    n_cells = math.prod(df.shape)
    if n_cells == 0:
        # No cells at all: the proportion is undefined. Return NaN explicitly
        # instead of relying on a 0 / 0 division.
        return math.nan
    return float(df.isna().sum().sum() / n_cells)
[docs]
def percent_non_missing(df: pd.DataFrame) -> float:
    """Proportion of non-missing (observed) values among all cells of a DataFrame.

    Despite the function name, the result is a fraction between 0 and 1,
    not a percentage between 0 and 100.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with data.

    Returns
    -------
    float
        Number of non-missing cells divided by the total number of cells.
        ``nan`` if the DataFrame has no cells (zero rows or zero columns).
    """
    n_cells = math.prod(df.shape)
    if n_cells == 0:
        # No cells at all: the proportion is undefined. Return NaN explicitly
        # instead of relying on a 0 / 0 division.
        return math.nan
    return float(df.notna().sum().sum() / n_cells)
[docs]
def list_files(folder=".") -> list[str]:
    """List the entries directly inside a folder as POSIX-style path strings.

    Parameters
    ----------
    folder : path-like, optional
        Folder to list, by default the current working directory.

    Returns
    -------
    list[str]
        One path per entry yielded by ``Path(folder).iterdir()``, formatted
        with forward slashes. Sub-directories are included as well as files;
        the listing is not recursive.
    """
    directory = Path(folder)
    return list(map(Path.as_posix, directory.iterdir()))
[docs]
def get_record(data: pd.DataFrame, columns_sample=False) -> dict:
    """Summarize the dimensions and the missingness of a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to summarize.
    columns_sample : bool, optional
        Controls which axis is reported as ``N`` and which as ``M``.
        If False (default), ``N`` is the number of rows and ``M`` the number
        of columns; if True, the two are swapped.
        NOTE(review): the name suggests N counts samples and M features -
        confirm against callers.

    Returns
    -------
    dict
        Keys ``N``, ``M``, ``N_obs`` (non-missing cells), ``N_mis`` (missing
        cells, computed as ``N * M - N_obs``) as ints and ``missing`` (share of
        missing cells) as float.
    """
    n_rows, n_cols = data.shape
    # Orientation only affects the labels N and M, not the counts below.
    N, M = (n_cols, n_rows) if columns_sample else (n_rows, n_cols)
    n_observed = data.notna().sum().sum()
    n_missing = N * M - n_observed
    share_missing = n_missing / (n_observed + n_missing)
    return {
        "N": int(N),
        "M": int(M),
        "N_obs": int(n_observed),
        "N_mis": int(n_missing),
        "missing": float(share_missing),
    }
[docs]
def decompose_NAs(
    data: pd.DataFrame, level: Union[int, str], label: str = "summary"
) -> pd.DataFrame:
    """Decompose missing values by an index level into real and indirectly imputed ones.

    Rows of ``data`` are grouped by the index level ``level``. Within a group, the
    missing values of a column (sample) count as *real* missing values (RMVs) if
    every row of the group is missing in that column. All other missing values of
    the group are *indirectly imputed* missing values (IIMVs): at least one row of
    the group was observed in that column, so in MS-based proteomics data they
    would be imputed by the mean (or median) of the observed values in the group
    if the mean (or median) is used for imputation.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame with samples in columns and features in rows.
    level : Union[int, str]
        Index level to group by. Examples: Protein groups, peptides or precursors
        in MS data.
    label : str, optional
        Index label of the single row of the returned DataFrame,
        by default 'summary'.

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame (row index ``label``) with the columns ``total_obs``,
        ``total_MVs``, ``real_MVs``, ``indirectly_imputed_MVs``,
        ``real_MVs_ratio``, ``indirectly_imputed_MVs_ratio`` (both relative to
        ``total_MVs``) and ``total_MVs_ratio`` (relative to the number of cells).
        A ratio whose denominator is zero is reported as missing.

    Raises
    ------
    AssertionError
        If the per-group counts do not add up to the total number of missing
        values, i.e. the groups do not cover every row of ``data`` exactly once.
    """
    is_na = data.isna()
    total_mvs = is_na.sum().sum()
    n_cells = data.size

    real_mvs = 0
    ii_mvs = 0
    for _, group_na in is_na.groupby(level=level):
        n_rows = len(group_na)  # normally 1 to 3 features per group
        # A column missing in all rows of the group contributes n_rows real MVs.
        # For a single-row group this makes every missing value a real one, so no
        # special case is needed.
        group_real = group_na.all(axis=0).sum() * n_rows
        real_mvs += group_real
        # Remaining MVs sit in columns where at least one row was observed.
        ii_mvs += group_na.sum().sum() - group_real

    # Sanity check: each missing value must be attributed exactly once.
    assert total_mvs == real_mvs + ii_mvs, (
        "Missing values per group do not sum up to the total number of missing values."
    )

    nan = float("nan")
    # Ratios are undefined without missing values (or without any cells); report
    # NaN explicitly instead of dividing zero by zero.
    real_ratio = real_mvs / total_mvs if total_mvs else nan
    ii_ratio = ii_mvs / total_mvs if total_mvs else nan
    total_ratio = total_mvs / n_cells if n_cells else nan

    summary = pd.Series(
        {
            "total_obs": n_cells - total_mvs,
            "total_MVs": total_mvs,
            "real_MVs": real_mvs,
            "indirectly_imputed_MVs": ii_mvs,
            "real_MVs_ratio": real_ratio,
            "indirectly_imputed_MVs_ratio": ii_ratio,
            "total_MVs_ratio": total_ratio,
        }
    )
    # Transpose so the statistics become columns of a single row named `label`.
    return summary.to_frame(name=label).T.convert_dtypes()