vaep.analyzers package#

General classes formalizing an experiment.

class vaep.analyzers.Analysis[source]#

Bases: SimpleNamespace

Submodules#

vaep.analyzers.analyzers module#

A collection of Analyzers to perform certain types of analysis.

class vaep.analyzers.analyzers.AnalyzePeptides(data: DataFrame, is_log_transformed: bool = False, is_wide_format: bool = True, ind_unstack: str = '')[source]#

Bases: SimpleNamespace

Namespace for current analysis

df#

Current eagerly loaded data, in wide format only: samples as index, features in columns.

Type:

pandas.DataFrame

stats#

Some statistics of certain aspects. Normally each will be a DataFrame.

Type:

types.SimpleNamespace

Many more attributes are set dynamically depending on the concrete analysis.
calculate_PCs(new_df, is_wide=True)[source]#
describe_peptides(sample_n: Optional[int] = None)[source]#
property df_long#
property df_wide#
property fname_stub#
classmethod from_csv(fname: str, nrows: Optional[int] = None, index_col: Union[int, str, List] = 'Sample ID', verify_fname: bool = False, usecols=None, **kwargs)[source]#
classmethod from_pickle(fname: str, index_col: Union[int, str, List] = 'Sample ID', verify_fname: bool = False, usecols=None, **kwargs)[source]#
get_PCA(n_components=2, imputer=<class 'sklearn.impute._base.SimpleImputer'>)[source]#
get_consecutive_dates(n_samples, seed=42)[source]#

Select n consecutive samples using a seed.

Updates the original DataFrame attribute: df

get_dectection_limit()[source]#

Compute information on detection limit in dataset.

Returns:

Information on detection limit

Return type:

str

get_prop_not_na()[source]#

Get the proportion of non-NA values for each sample.

log_transform(log_fct: ufunc)[source]#

Log transform data in-place.

Parameters:

log_fct (np.ufunc) – Numpy log-function

Raises:

Exception – if data has been previously log-transformed.

plot_pca()[source]#

Create a principal component plot with three heatmaps showing instrument, degree of non-NA data, and sample by date.

to_long_format(colname_values: str = 'intensity', index_name: str = 'Sample ID', inplace: str = False) DataFrame[source]#

Convert the data from wide format to long format.

Parameters:
  • colname_values (str, optional) – New column name for values in matrix, by default ‘intensity’

  • index_name (str, optional) – Name of column to assign as index (based on long-data format), by default ‘Sample ID’

  • inplace (bool, optional) – Assign result to df_long (False), or to df (True) attribute, by default False

Returns:

Data in long-format as DataFrame

Return type:

pd.DataFrame

to_wide_format(columns: str = 'Sample ID', name_values: str = 'intensity', inplace: bool = False) DataFrame[source]#

Convert the data from long format to wide format.

Parameters:
  • columns (str, optional) – Index level to be shown as columns, by default ‘Sample ID’

  • name_values (str, optional) – Column in long-data format to be used as values, by default ‘intensity’

  • inplace (bool, optional) – Assign result to df_wide (False), or to df (True) attribute, by default False

Returns:

Data in wide-format as DataFrame

Return type:

pd.DataFrame

class vaep.analyzers.analyzers.LatentAnalysis(latent_space: DataFrame, meta_data: DataFrame, model_name: str, fig_size: Tuple[int, int] = (15, 15), folder: Optional[Path] = None)[source]#

Bases: Analysis

plot_by_category(meta_key: str, save: bool = True)[source]#
plot_by_date(meta_key: str = 'date', save: bool = True)[source]#
vaep.analyzers.analyzers.add_date_colorbar(mappable, ax)[source]#
vaep.analyzers.analyzers.cast_object_to_category(df: DataFrame) DataFrame[source]#

Cast object columns to category dtype.

Parameters:

df (pd.DataFrame) – DataFrame with columns

Returns:

DataFrame with category columns instead of object columns.

Return type:

pd.DataFrame

vaep.analyzers.analyzers.corr_lower_triangle(df)[source]#

Compute the correlation matrix, returning only unique values.

vaep.analyzers.analyzers.get_consecutive_data_indices(df, n_samples)[source]#
vaep.analyzers.analyzers.plot_corr_histogram(corr_lower_triangle, bins=10)[source]#
vaep.analyzers.analyzers.plot_date_map(df, ax, dates: Optional[Series] = None, meta: Optional[Series] = None, title: str = 'by date', fontsize=8, size=2)[source]#
vaep.analyzers.analyzers.plot_scatter(df, ax, meta: Series, feat_name_display: str = 'features', title: Optional[str] = None, alpha=0.5, fontsize=8, size=2)[source]#
vaep.analyzers.analyzers.scatter_plot_w_dates(ax, df, dates=None, marker=None, errors='raise', size=2)[source]#

Plot the first vs. the second column of the DataFrame. Use dates to color the data.

errors : {‘ignore’, ‘raise’, ‘coerce’}, default ‘raise’

Passed on to pandas.to_datetime:
  • If ‘raise’, then invalid parsing will raise an exception.
  • If ‘coerce’, then invalid parsing will be set as NaT.
  • If ‘ignore’, then invalid parsing will return the input.

vaep.analyzers.analyzers.seaborn_scatter(df, ax, meta: Series, title: str = 'by some metadata', alpha=0.5, fontsize=5, size=5)[source]#

vaep.analyzers.compare_predictions module#

vaep.analyzers.compare_predictions.load_predictions(pred_files: List, shared_columns=['observed'])[source]#
vaep.analyzers.compare_predictions.load_single_csv_pred_file(fname: str | Path, value_name: str = 'intensity') pd.Series[source]#
Load a single pred file from a single model.

The last column contains the measurements; the other columns are used as the index.

Parameters:
  • fname (str | Path) – Path to csv file to be loaded

  • value_name (str, optional) – name for measurements to be used, by default ‘intensity’

Returns:

Measurements as a single column with set indices

Return type:

pd.Series

vaep.analyzers.compare_predictions.load_split_prediction_by_modelkey(experiment_folder: Path, split: str, model_keys: list[str], allow_missing=False, shared_columns: list[str] = None)[source]#

Load predictions from a list of models.

Parameters:
  • experiment_folder (Path) – Path to experiment folder

  • split (str) – which split of simulated data to load

  • model_keys (List) – List of model keys to be loaded

  • allow_missing (bool, optional) – Ignore missing pred files of requested model, default False

  • shared_columns (List, optional) – List of columns that are shared between all models, by default None

Returns:

Prediction data frame with shared columns and model predictions

Return type:

pd.DataFrame

vaep.analyzers.diff_analysis module#

class vaep.analyzers.diff_analysis.Cutoffs(feat_completness_over_samples, min_feat_in_sample)#

Bases: tuple

feat_completness_over_samples#

Alias for field number 0

min_feat_in_sample#

Alias for field number 1

vaep.analyzers.diff_analysis.select_feat(df_qc: DataFrame, threshold: float = 0.4, axis: int = 0)[source]#
vaep.analyzers.diff_analysis.select_raw_data(df: pd.DataFrame, data_completeness: float, frac_protein_groups: int) tuple[pd.DataFrame, Cutoffs][source]#