Source code for pimmslearn.data_handling

"""
Functionality to handle protein and peptide datasets.
"""

import numpy as np
import pandas as pd



[docs]
def coverage(X: pd.DataFrame, coverage_col: float, coverage_row: float):
    """Select proteins by column depending on their coverage.
    Of these selected proteins, where the rows have a certain number of overall proteins.
    """
    mask_col = X.isnull().mean() <= 1 - coverage_col
    df = X.loc[:, mask_col]
    mask_row = df.isnull().mean(axis=1) <= 1 - coverage_row
    df = df.loc[mask_row, :]
    return df




[docs]
def compute_stats_missing(
    X: pd.DataFrame,
    col_no_missing: str = "no_missing",
    col_no_identified: str = "no_identified",
    col_prop_samples: str = "prop_samples",
) -> pd.DataFrame:
    """Dataset of repeated samples indicating if an observation
    has the variables observed or missing x in {0,1}"""
    if X.index.name:
        index_col = X.index.name
    else:
        index_col = "INDEX"
    sample_stats = X.index.to_frame(index=False).reset_index()
    sample_stats.columns = ["SampleID_int", index_col]
    sample_stats.set_index(index_col, inplace=True)

    sample_stats[col_no_identified] = X.sum(axis=1)
    sample_stats[col_no_missing] = (X == 0).sum(axis=1)

    assert all(
        sample_stats[[col_no_identified, col_no_missing]].sum(axis=1) == X.shape[1]
    )
    sample_stats = sample_stats.sort_values(by=col_no_identified, ascending=False)
    sample_stats[col_prop_samples] = np.array(range(1, len(sample_stats) + 1)) / len(
        sample_stats
    )
    return sample_stats




[docs]
def get_sorted_not_missing(X: pd.DataFrame) -> pd.DataFrame:
    """Return a Dataframe with missing values. Order columns by degree of completness
    over columns from variables least to most shared among observations."""
    X = X.notna().astype(int)
    return X[X.mean().sort_values().index]