Source code for pimmslearn.analyzers.diff_analysis

from __future__ import annotations

import logging
from collections import namedtuple

import pandas as pd

# getLogger() called without a name returns the root logger, so records
# emitted by this module go through whatever handlers the root logger has.
logger = logging.getLogger()


# Pair of absolute thresholds returned by select_raw_data:
#   feat_completness_over_samples: value passed as `thresh` when dropping
#       features (columns), i.e. int(N * data_completeness).
#   min_feat_in_sample: value passed as `thresh` when dropping samples
#       (rows), i.e. int(M * frac_protein_groups).
# NOTE: the spelling "completness" is part of the public field name.
Cutoffs = namedtuple("Cutoffs", "feat_completness_over_samples min_feat_in_sample")


def select_raw_data(
    df: pd.DataFrame, data_completeness: float, frac_protein_groups: float
) -> tuple[pd.DataFrame, Cutoffs]:
    """Drop sparse features (columns) first, then sparse samples (rows).

    Parameters
    ----------
    df : pd.DataFrame
        Data with samples in rows and features in columns.
    data_completeness : float
        Fraction of the initial number of samples. A feature is kept only if
        it has at least ``int(N * data_completeness)`` non-missing values.
    frac_protein_groups : float
        Fraction of the number of features remaining after the feature
        selection. A sample is kept only if it has at least
        ``int(M * frac_protein_groups)`` non-missing values.

    Returns
    -------
    tuple[pd.DataFrame, Cutoffs]
        The filtered data and the two absolute thresholds that were applied
        (minimum samples per feature, minimum features per sample).
    """
    shape_fmt = "%s: N samples: %d, M feat: %d"

    n_samples, n_feat = df.shape
    logger.info(shape_fmt, "Initially", n_samples, n_feat)

    # Step 1: features must be quantified in a minimum number of samples.
    # int() truncates, so the effective threshold is rounded down.
    min_sample_for_feat = int(n_samples * data_completeness)
    df = df.dropna(axis=1, thresh=min_sample_for_feat)
    logger.info(
        "Dropped features quantified in less than %d samples.",
        min_sample_for_feat,
    )

    # Step 2: the sample threshold is relative to the features that survived
    # step 1, not to the initial number of features.
    n_samples, n_feat = df.shape
    logger.info(shape_fmt, "After feat selection", n_samples, n_feat)
    min_feat_per_sample = int(n_feat * frac_protein_groups)
    logger.info(
        "Min No. of Protein-Groups in single sample: %d", min_feat_per_sample
    )
    df = df.dropna(axis=0, thresh=min_feat_per_sample)

    logger.info(shape_fmt, "Finally", *df.shape)
    return df, Cutoffs(min_sample_for_feat, min_feat_per_sample)
def select_feat(df_qc: pd.DataFrame, threshold: float = 0.4, axis: int = 0):
    """Select labels whose coefficient of variation is below ``threshold``.

    The coefficient of variation is computed as standard deviation divided
    by mean along ``axis`` (``0``: one value per column, ``1``: one value
    per row).

    Parameters
    ----------
    df_qc : pd.DataFrame
        Measurements for which the variation is assessed.
    threshold : float, optional
        Exclusive upper bound for the coefficient of variation.
    axis : int, optional
        Axis along which standard deviation and mean are computed.

    Returns
    -------
    pd.Index
        Labels with a coefficient of variation strictly below ``threshold``.
        Labels whose ratio is NaN never satisfy the comparison and are
        therefore not returned.
    """
    spread = df_qc.std(axis=axis)
    center = df_qc.mean(axis=axis)
    coeff_of_variation = spread / center
    selected = coeff_of_variation[coeff_of_variation < threshold]
    return selected.index