Source code for pimmslearn.data_handling

"""
Functionality to handle protein and peptide datasets.
"""

import numpy as np
import pandas as pd


def coverage(
    X: pd.DataFrame, coverage_col: float, coverage_row: float
) -> pd.DataFrame:
    """Filter columns, then rows, by their fraction of non-missing values.

    First, only columns whose share of non-missing entries is at least
    ``coverage_col`` are kept. Then, among the retained columns, only rows
    whose share of non-missing entries is at least ``coverage_row`` are kept.

    Parameters
    ----------
    X : pd.DataFrame
        Data where missing values are encoded as NaN/None.
    coverage_col : float
        Minimum fraction of non-missing values required per column.
    coverage_row : float
        Minimum fraction of non-missing values required per row, computed
        on the columns that passed the column filter.

    Returns
    -------
    pd.DataFrame
        The subset of ``X`` passing both filters.
    """
    # Compare the observed fraction directly against the threshold instead of
    # the missing fraction against ``1 - threshold``: the subtraction is not
    # exact in floating point (1 - 0.9 == 0.09999999999999998 < 0.1), which
    # dropped columns/rows sitting exactly on the requested coverage.
    mask_col = X.notna().mean() >= coverage_col
    df = X.loc[:, mask_col]
    mask_row = df.notna().mean(axis=1) >= coverage_row
    return df.loc[mask_row, :]
def compute_stats_missing(
    X: pd.DataFrame,
    col_no_missing: str = "no_missing",
    col_no_identified: str = "no_identified",
    col_prop_samples: str = "prop_samples",
) -> pd.DataFrame:
    """Summarise per-row counts of observed and missing entries.

    ``X`` is expected to be an indicator table with entries in {0, 1}
    (1 = observed, 0 = missing); an assertion checks that the ones and
    zeros of every row add up to the number of columns.

    Parameters
    ----------
    X : pd.DataFrame
        Indicator table, one row per sample.
    col_no_missing : str
        Name of the output column holding the count of zeros per row.
    col_no_identified : str
        Name of the output column holding the row sums (count of ones).
    col_prop_samples : str
        Name of the output column holding ``rank / number_of_rows`` after
        sorting by the identified count in descending order.

    Returns
    -------
    pd.DataFrame
        Indexed like ``X`` (index named after ``X.index.name`` or
        ``"INDEX"`` if that name is falsy), with an integer position column
        ``"SampleID_int"`` plus the three statistics columns, sorted by the
        number of identified entries (descending).
    """
    index_col = X.index.name or "INDEX"

    # Keep the original integer position of every row next to its label.
    stats = X.index.to_frame(index=False).reset_index()
    stats.columns = ["SampleID_int", index_col]
    stats = stats.set_index(index_col)

    stats[col_no_identified] = X.sum(axis=1)
    stats[col_no_missing] = (X == 0).sum(axis=1)

    # Ones and zeros must account for every column, i.e. X is binary.
    n_features = X.shape[1]
    row_totals = stats[[col_no_identified, col_no_missing]].sum(axis=1)
    assert (row_totals == n_features).all()

    stats = stats.sort_values(by=col_no_identified, ascending=False)
    n_rows = len(stats)
    stats[col_prop_samples] = np.arange(1, n_rows + 1) / n_rows
    return stats
def get_sorted_not_missing(X: pd.DataFrame) -> pd.DataFrame:
    """Return a 0/1 indicator table of non-missing entries with sorted columns.

    Each entry of ``X`` is replaced by 1 if it is not missing and by 0
    otherwise. Columns are then ordered by their mean (the share of rows in
    which the column is observed), ascending, i.e. from the variables least
    shared among observations to the most shared ones.
    """
    observed = (~X.isna()).astype(int)
    share_observed = observed.mean()
    column_order = share_observed.sort_values().index
    return observed[column_order]