Scikit-learn style transformers of the data#

Load data into pandas dataframe
Fit transformer on training data
Impute only missing values with predictions from model

Autoencoders need wide training data, i.e. a sample with all its features’ intensities, whereas Collaborative Filtering needs long training data, i.e. a sample identifier, a feature identifier and the intensity. Both data formats can be transformed into each other, but models using the long data format do not need to take care of missing values.

import os
from importlib import metadata
IN_COLAB = 'COLAB_GPU' in os.environ
if IN_COLAB:
    try:
        _v = metadata.version('pimms-learn')
        print(f"Running in colab and pimms-learn ({_v}) is installed.")
    except metadata.PackageNotFoundError:
        print("Install PIMMS...")
        # !pip install git+https://github.com/RasmussenLab/pimms.git@dev
        !pip install pimms-learn   

If on colab, please restart the environment and run everything from here on.

import os
IN_COLAB = 'COLAB_GPU' in os.environ

fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'
if IN_COLAB:
    fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


from vaep.plotting.defaults import color_model_mapping
import vaep.plotting.data 
import vaep.sampling

from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer
from vaep.sklearn.ae_transformer import AETransformer

vaep.plotting.make_large_descriptors(8)

Matplotlib is building the font cache; this may take a moment.

Data#

df = pd.read_csv(fn_intensities, index_col=0)
df.head()

	AAAS	AACS	AAMDC	AAMP	AAR2	AARS	AARS2	AASDHPPT	AATF	ABCB10	...	ZNHIT2	ZNRF2	ZPR1	ZRANB2	ZW10	ZWILCH	ZWINT	ZYX	hCG_2014768;TMA7	pk;ZAK
Sample ID
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	341,970,000.000	73,598,000.000	NaN	114,990,000.000	159,370,000.000	4,916,300,000.000	149,190,000.000	245,660,000.000	407,590,000.000	72,440,000.000	...	50,194,000.000	23,201,000.000	332,480,000.000	477,690,000.000	484,070,000.000	NaN	21,823,000.000	721,850,000.000	283,680,000.000	7,714,600.000
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070	211,690,000.000	33,991,000.000	19,762,000.000	80,960,000.000	155,320,000.000	4,233,400,000.000	96,914,000.000	306,530,000.000	257,840,000.000	55,844,000.000	...	NaN	NaN	294,320,000.000	161,550,000.000	317,600,000.000	NaN	NaN	283,840,000.000	NaN	NaN
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070	342,650,000.000	14,015,000.000	NaN	143,640,000.000	174,350,000.000	7,929,200,000.000	191,730,000.000	373,270,000.000	458,030,000.000	115,780,000.000	...	38,113,000.000	NaN	525,090,000.000	167,830,000.000	702,420,000.000	NaN	58,540,000.000	772,560,000.000	NaN	NaN
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070	118,930,000.000	NaN	NaN	80,158,000.000	NaN	4,081,400,000.000	74,818,000.000	208,420,000.000	242,070,000.000	42,650,000.000	...	NaN	12,589,000.000	208,620,000.000	162,760,000.000	282,950,000.000	NaN	27,023,000.000	461,970,000.000	NaN	NaN
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070	177,550,000.000	129,510,000.000	15,272,000.000	132,520,000.000	96,217,000.000	3,854,300,000.000	42,534,000.000	179,130,000.000	186,440,000.000	NaN	...	25,275,000.000	NaN	59,980,000.000	673,000,000.000	188,690,000.000	88,943,000.000	NaN	1,438,800,000.000	NaN	128,670,000.000

5 rows × 4535 columns

We will need the data in long format for Collaborative Filtering. Naming both the row and column index assures that the data can be transformed very easily into long format:

df.index.name = 'Sample ID'  # already set
df.columns.name = 'protein group'  # not set due to csv disk file format
df.head()

protein group	AAAS	AACS	AAMDC	AAMP	AAR2	AARS	AARS2	AASDHPPT	AATF	ABCB10	...	ZNHIT2	ZNRF2	ZPR1	ZRANB2	ZW10	ZWILCH	ZWINT	ZYX	hCG_2014768;TMA7	pk;ZAK
Sample ID
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	341,970,000.000	73,598,000.000	NaN	114,990,000.000	159,370,000.000	4,916,300,000.000	149,190,000.000	245,660,000.000	407,590,000.000	72,440,000.000	...	50,194,000.000	23,201,000.000	332,480,000.000	477,690,000.000	484,070,000.000	NaN	21,823,000.000	721,850,000.000	283,680,000.000	7,714,600.000
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070	211,690,000.000	33,991,000.000	19,762,000.000	80,960,000.000	155,320,000.000	4,233,400,000.000	96,914,000.000	306,530,000.000	257,840,000.000	55,844,000.000	...	NaN	NaN	294,320,000.000	161,550,000.000	317,600,000.000	NaN	NaN	283,840,000.000	NaN	NaN
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070	342,650,000.000	14,015,000.000	NaN	143,640,000.000	174,350,000.000	7,929,200,000.000	191,730,000.000	373,270,000.000	458,030,000.000	115,780,000.000	...	38,113,000.000	NaN	525,090,000.000	167,830,000.000	702,420,000.000	NaN	58,540,000.000	772,560,000.000	NaN	NaN
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070	118,930,000.000	NaN	NaN	80,158,000.000	NaN	4,081,400,000.000	74,818,000.000	208,420,000.000	242,070,000.000	42,650,000.000	...	NaN	12,589,000.000	208,620,000.000	162,760,000.000	282,950,000.000	NaN	27,023,000.000	461,970,000.000	NaN	NaN
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070	177,550,000.000	129,510,000.000	15,272,000.000	132,520,000.000	96,217,000.000	3,854,300,000.000	42,534,000.000	179,130,000.000	186,440,000.000	NaN	...	25,275,000.000	NaN	59,980,000.000	673,000,000.000	188,690,000.000	88,943,000.000	NaN	1,438,800,000.000	NaN	128,670,000.000

5 rows × 4535 columns

Transform the data using the logarithm, here using base 2:

df = np.log2(df + 1)
df.head()

protein group	AAAS	AACS	AAMDC	AAMP	AAR2	AARS	AARS2	AASDHPPT	AATF	ABCB10	...	ZNHIT2	ZNRF2	ZPR1	ZRANB2	ZW10	ZWILCH	ZWINT	ZYX	hCG_2014768;TMA7	pk;ZAK
Sample ID
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	28.349	26.133	NaN	26.777	27.248	32.195	27.153	27.872	28.603	26.110	...	25.581	24.468	28.309	28.831	28.851	NaN	24.379	29.427	28.080	22.879
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070	27.657	25.019	24.236	26.271	27.211	31.979	26.530	28.191	27.942	25.735	...	NaN	NaN	28.133	27.267	28.243	NaN	NaN	28.081	NaN	NaN
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070	28.352	23.740	NaN	27.098	27.377	32.885	27.515	28.476	28.771	26.787	...	25.184	NaN	28.968	27.322	29.388	NaN	25.803	29.525	NaN	NaN
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070	26.826	NaN	NaN	26.256	NaN	31.926	26.157	27.635	27.851	25.346	...	NaN	23.586	27.636	27.278	28.076	NaN	24.688	28.783	NaN	NaN
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070	27.404	26.948	23.864	26.982	26.520	31.844	25.342	27.416	27.474	NaN	...	24.591	NaN	25.838	29.326	27.491	26.406	NaN	30.422	NaN	26.939

5 rows × 4535 columns

Two plots on data availability:

proportion of missing values per feature median (N = protein groups)
CDF of available intensities per protein group

ax = vaep.plotting.data.plot_feat_median_over_prop_missing(
    data=df, type='boxplot')

../_images/58767ddff90283351db96a79e9742e319700b287a690509cca2f72a38f87d70a.png

df.notna().sum().sort_values().plot()

<Axes: xlabel='protein group'>

../_images/9c8b2dfe50aeb8ae8545aa2546995bc8bb9185616806c8937651c93b6639d280.png

Define a minimum feature and sample frequency for a feature or sample to be included:

SELECT_FEAT = True


def select_features(df, feat_prevalence=.2, axis=0):
    """Keep only the columns (or rows) with enough non-missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data in wide format; missing values are detected with ``notna``.
    feat_prevalence : float, default 0.2
        Minimum fraction of non-missing values (relative to the length of
        ``axis``) an entry needs in order to be kept.
    axis : int, default 0
        Axis along which non-missing values are counted. With ``axis=0``
        the counts are per column and columns are filtered; otherwise the
        counts are per row and rows are filtered.

    Returns
    -------
    pandas.DataFrame
        The selection of ``df`` that meets the minimum frequency.

    Notes
    -----
    Prints the number of dropped entries as a side effect.
    """
    # ! vaep.filter.select_features
    n_total = df.shape[axis]
    minimum_freq = n_total * feat_prevalence
    # boolean mask indexed by the labels of the axis that is being filtered
    mask = df.notna().sum(axis=axis) >= minimum_freq
    print(f"Drop {(~mask).sum()} along axis {axis}.")
    if axis == 0:
        return df.loc[:, mask]
    return df.loc[mask]


if SELECT_FEAT:
    # potentially this can take a few iterations to stabilize.
    df = select_features(df, feat_prevalence=.2)
    df = select_features(df=df, feat_prevalence=.3, axis=1)
df.shape

Drop 91 along axis 0.
Drop 0 along axis 1.

(50, 4444)

Transform to long-data format:

df = df.stack().to_frame('intensity')
df.head()

		intensity
Sample ID	protein group
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	AAAS	28.349
	AACS	26.133
	AAMP	26.777
	AAR2	27.248
	AARS	32.195

The resulting DataFrame with one column has a MultiIndex with the sample and feature identifier.

Collaborative Filtering#

# # # CollaborativeFilteringTransformer?

Let’s set up collaborative filtering without a validation or test set, using all the data there is.

cf_model = CollaborativeFilteringTransformer(
    target_column='intensity',
    sample_column='Sample ID',
    item_column='protein group',
    out_folder='runs/scikit_interface')

We use fit and transform to train the model and impute the missing values.

Scikit-learn's interface requires an X and a y. In our context, y is the validation data. We might have to change the interface to allow usage within pipelines (-> y is not needed). This will probably mean setting up a validation set within the model.

cf_model.fit(df,
             cuda=False,
             epochs_max=20,
             )

suggested_lr.valley = 0.00759

epoch	train_loss	valid_loss	time
0	8.529963	None	00:00
1	7.200674	None	00:00
2	3.814727	None	00:00
3	2.007462	None	00:00
4	1.234751	None	00:00
5	0.811100	None	00:00
6	0.610548	None	00:00
7	0.512519	None	00:00
8	0.456441	None	00:00
9	0.416702	None	00:00
10	0.382653	None	00:00
11	0.350776	None	00:00
12	0.326353	None	00:00
13	0.307845	None	00:00
14	0.291788	None	00:00
15	0.278374	None	00:00
16	0.270331	None	00:00
17	0.265799	None	00:00
18	0.262403	None	00:00
19	0.260033	None	00:00

CollaborativeFilteringTransformer(item_column='protein group',
                                  out_folder=Path('runs/scikit_interface'),
                                  sample_column='Sample ID',
                                  target_column='intensity')

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

../_images/2852300caf229898d8629babb851e40beff82a95c971ba1cc7189f9d8994d04b.png

../_images/d67a19f3eb83cf3b9688920a2555603b5229b1dc00e7b5716862bc02368ba218.png

df_imputed = cf_model.transform(df).unstack()
assert df_imputed.isna().sum().sum() == 0
df_imputed.head()

protein group	AAAS	AACS	AAMDC	AAMP	AAR2	AARS	AARS2	AASDHPPT	AATF	ABCB10	...	ZNHIT2	ZNRF2	ZPR1	ZRANB2	ZW10	ZWILCH	ZWINT	ZYX	hCG_2014768;TMA7	pk;ZAK
Sample ID
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	28.349	26.133	24.183	26.777	27.248	32.195	27.153	27.872	28.603	26.110	...	25.581	24.468	28.309	28.831	28.851	23.767	24.379	29.427	28.080	22.879
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070	27.657	25.019	24.236	26.271	27.211	31.979	26.530	28.191	27.942	25.735	...	24.828	23.950	28.133	27.267	28.243	23.228	24.977	28.081	28.355	22.424
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070	28.352	23.740	24.532	27.098	27.377	32.885	27.515	28.476	28.771	26.787	...	25.184	24.808	28.968	27.322	29.388	24.116	25.803	29.525	28.997	23.098
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070	26.826	24.534	24.005	26.256	27.314	31.926	26.157	27.635	27.851	25.346	...	25.152	23.586	27.636	27.278	28.076	23.038	24.688	28.783	28.311	22.552
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070	27.404	26.948	23.864	26.982	26.520	31.844	25.342	27.416	27.474	27.151	...	24.591	25.831	25.838	29.326	27.491	26.406	25.206	30.422	29.979	26.939

5 rows × 4444 columns

Let’s plot the distribution of the imputed values vs the ones used for training:

df_imputed = df_imputed.stack()  # long-format
observed = df_imputed.loc[df.index]
imputed = df_imputed.loc[df_imputed.index.difference(df.index)]
df_imputed = df_imputed.unstack()  # back to wide-format
# some checks
assert len(df) == len(observed)
assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)

fig, axes = plt.subplots(2, figsize=(8, 4))

min_max = vaep.plotting.data.get_min_max_iterable(
    [observed, imputed])
label_template = '{method} (N={n:,d})'
ax, _ = vaep.plotting.data.plot_histogram_intensities(
    observed,
    ax=axes[0],
    min_max=min_max,
    label=label_template.format(method='measured',
                                n=len(observed),
                                ),
    color='grey',
    alpha=1)
_ = ax.legend()
ax, _ = vaep.plotting.data.plot_histogram_intensities(
    imputed,
    ax=axes[1],
    min_max=min_max,
    label=label_template.format(method='CF imputed',
                                n=len(imputed),
                                ),
    color=color_model_mapping['CF'],
    alpha=1)
_ = ax.legend()

../_images/1d6c5e2bf230c20556353da7d8816fd0485d65980d778e3557d10609e6de07ca.png

AutoEncoder architectures#

# Reload data (for demonstration)

df = pd.read_csv(fn_intensities, index_col=0)
df.index.name = 'Sample ID'  # already set
df.columns.name = 'protein group'  # not set due to csv disk file format
df = np.log2(df + 1)  # log transform
df.head()

protein group	AAAS	AACS	AAMDC	AAMP	AAR2	AARS	AARS2	AASDHPPT	AATF	ABCB10	...	ZNHIT2	ZNRF2	ZPR1	ZRANB2	ZW10	ZWILCH	ZWINT	ZYX	hCG_2014768;TMA7	pk;ZAK
Sample ID
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	28.349	26.133	NaN	26.777	27.248	32.195	27.153	27.872	28.603	26.110	...	25.581	24.468	28.309	28.831	28.851	NaN	24.379	29.427	28.080	22.879
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070	27.657	25.019	24.236	26.271	27.211	31.979	26.530	28.191	27.942	25.735	...	NaN	NaN	28.133	27.267	28.243	NaN	NaN	28.081	NaN	NaN
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070	28.352	23.740	NaN	27.098	27.377	32.885	27.515	28.476	28.771	26.787	...	25.184	NaN	28.968	27.322	29.388	NaN	25.803	29.525	NaN	NaN
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070	26.826	NaN	NaN	26.256	NaN	31.926	26.157	27.635	27.851	25.346	...	NaN	23.586	27.636	27.278	28.076	NaN	24.688	28.783	NaN	NaN
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070	27.404	26.948	23.864	26.982	26.520	31.844	25.342	27.416	27.474	NaN	...	24.591	NaN	25.838	29.326	27.491	26.406	NaN	30.422	NaN	26.939

5 rows × 4535 columns

The AutoEncoder model currently needs validation data for training. We will use 10% of the training data for validation.

Expect this limitation to be dropped in the next release. It will still be recommended to use validation data for early stopping.

freq_feat = df.notna().sum()
freq_feat.head()  # training data

protein group
AAAS    50
AACS    41
AAMDC   34
AAMP    49
AAR2    45
dtype: int64

We will use the sampling module to sample the validation data from the training data. The data could be split differently by providing another weights vector.

# Split the long-format intensities into a validation and a training part.
# `frac=0.1` requests 10% of the intensities for validation; the per-feature
# counts of observed values (`freq_feat`) are passed as sampling weights.
# NOTE(review): `sample_index_to_drop=0` presumably selects the index level
# that is dropped before matching the weights -- confirm in vaep.sampling.
val_X, train_X = vaep.sampling.sample_data(df.stack(),
                                           sample_index_to_drop=0,
                                           weights=freq_feat,
                                           frac=0.1,
                                           random_state=42,)
# back to wide format
val_X, train_X = val_X.unstack(), train_X.unstack()
# Rebuild the validation data on the index and columns of the training data
# so that both frames have the same shape; entries without a validation
# value stay missing.
val_X = pd.DataFrame(pd.NA, index=train_X.index,
                     columns=train_X.columns).fillna(val_X)

Training data and validation data have the same shape:

val_X.shape, train_X.shape

((50, 4535), (50, 4535))

… but different number of intensities (non-missing values):

train_X.notna().sum().sum(), val_X.notna().sum().sum(),

(168772, 18749)

Select either DAE or VAE model:

model_selected = 'VAE'  # 'DAE'
model = AETransformer(
    model=model_selected,
    hidden_layers=[512,],
    latent_dim=50,
    out_folder='runs/scikit_interface',
    batch_size=10,
)

model.fit(train_X, val_X,
          epochs_max=50,
          cuda=False)

epoch	train_loss	valid_loss	time
0	5263.688965	575.540161	00:00
1	5250.653320	582.976990	00:00
2	5232.384277	589.242188	00:00
3	5203.558105	590.081299	00:00
4	5161.442871	578.325928	00:00
5	5100.220215	562.913696	00:00
6	5008.938477	542.291443	00:00
7	4884.893066	527.598633	00:00
8	4762.076660	534.657715	00:00
9	4619.890625	470.860931	00:00
10	4479.796387	432.840454	00:00
11	4362.392090	426.794220	00:00
12	4210.000000	456.412994	00:00
13	4120.044434	409.974152	00:00
14	4012.461670	425.563751	00:00
15	3904.450439	399.370148	00:00
16	3836.915039	409.731140	00:00
17	3762.779053	390.391632	00:00
18	3668.801270	390.791595	00:00
19	3571.271484	385.722076	00:00
20	3565.185059	388.589874	00:00
21	3510.432861	388.273590	00:00
22	3501.600342	401.065979	00:00
23	3456.472900	402.170776	00:00
24	3383.094727	389.488190	00:00
25	3327.834229	378.514038	00:00
26	3325.521484	376.065063	00:00
27	3327.646484	388.688782	00:00
28	3314.706299	391.576630	00:00
29	3276.506592	391.946381	00:00
30	3237.869385	384.283508	00:00
31	3188.368652	378.874847	00:00
32	3200.665039	378.588867	00:00
33	3154.601562	377.464325	00:00
34	3113.523926	375.160248	00:00
35	3101.998535	372.518280	00:00
36	3088.725830	372.239777	00:00
37	3103.972900	373.625519	00:00
38	3060.247803	371.208466	00:00
39	3069.627197	369.537537	00:00
40	3007.891113	371.785583	00:00
41	2999.756592	369.459991	00:00
42	3058.050049	371.742096	00:00
43	3041.616943	373.255280	00:00
44	2997.935303	373.980865	00:00
45	2993.135742	373.912109	00:00
46	2983.536621	371.833679	00:00
47	2969.012207	372.888062	00:00
48	2950.870850	372.356720	00:00
49	2918.028076	371.421478	00:00

AETransformer(batch_size=10, hidden_layers=[512], latent_dim=50,
              model=<class 'vaep.models.vae.VAE'>,
              out_folder=Path('runs/scikit_interface'))

../_images/e31ea1be85c04c18a93bfde390b7915101d6621f814e6928ea042291a6f401b3.png

../_images/ff8adc33327f25756b5cbb2f1900ee4f139b7831b2e83a9c19ac5e0eeb9196d3.png

df_imputed = model.transform(train_X)
df_imputed

protein group	AAAS	AACS	AAMDC	AAMP	AAR2	AARS	AARS2	AASDHPPT	AATF	ABCB10	...	ZNHIT2	ZNRF2	ZPR1	ZRANB2	ZW10	ZWILCH	ZWINT	ZYX	hCG_2014768;TMA7	pk;ZAK
Sample ID
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	28.349	26.133	24.687	26.777	27.248	32.195	27.153	27.872	28.603	26.110	...	25.581	24.468	28.309	28.831	28.851	24.142	24.379	29.427	28.080	22.879
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070	27.657	25.019	24.236	26.271	27.211	31.979	26.530	28.191	27.942	25.735	...	25.037	24.124	28.133	27.267	28.243	23.879	25.562	29.072	27.862	24.778
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070	28.352	23.740	24.631	27.098	27.377	32.885	27.515	28.476	28.771	26.787	...	25.184	24.374	28.968	27.322	29.388	23.927	25.803	29.525	28.484	24.597
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070	26.826	25.263	24.446	26.256	26.351	31.926	26.157	27.635	27.851	25.346	...	24.888	23.586	27.636	27.278	28.067	23.561	24.688	28.783	27.626	24.350
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070	27.404	26.948	23.864	26.982	26.520	31.844	25.342	27.416	27.474	26.771	...	24.591	25.577	25.838	29.326	27.491	26.406	24.891	30.422	29.450	26.939
2019_12_29_18_18_Q-Exactive-HF-X-Orbitrap_6070	27.891	26.481	26.348	27.849	26.917	32.274	26.128	27.514	28.081	27.181	...	24.849	25.820	26.750	29.652	27.635	26.037	25.002	30.928	29.656	27.088
2020_01_02_17_38_Q-Exactive-HF-X-Orbitrap_6070	25.527	24.733	23.395	25.810	25.368	30.226	24.805	23.801	25.130	25.508	...	25.393	24.554	25.560	27.518	25.148	24.108	25.444	28.465	28.902	25.637
2020_01_03_11_17_Q-Exactive-HF-X-Orbitrap_6070	26.346	25.169	24.433	25.275	24.846	31.184	25.643	24.893	25.324	25.888	...	25.185	24.573	24.336	27.579	26.528	24.643	25.398	29.726	28.933	26.036
2020_01_03_16_58_Q-Exactive-HF-X-Orbitrap_6070	27.620	25.624	23.520	27.136	25.971	31.415	25.360	26.165	25.750	26.424	...	24.982	24.940	25.198	28.906	27.331	25.069	25.009	30.952	29.513	27.086
2020_01_03_20_10_Q-Exactive-HF-X-Orbitrap_6070	27.300	25.950	25.660	27.133	26.897	31.455	25.437	26.813	26.201	26.407	...	24.685	24.661	25.071	28.636	26.542	24.685	25.325	31.005	29.326	27.081
2020_01_04_04_23_Q-Exactive-HF-X-Orbitrap_6070	27.978	27.612	26.211	27.912	26.846	32.207	26.207	27.276	26.386	26.789	...	25.052	25.824	26.646	29.886	28.079	24.789	25.821	31.040	29.584	27.296
2020_01_04_10_03_Q-Exactive-HF-X-Orbitrap_6070	22.666	24.490	23.081	25.780	24.899	30.259	25.352	23.758	25.000	25.412	...	25.809	24.116	25.909	25.360	23.210	23.409	26.160	27.829	29.235	24.343
2020_01_04_14_59_Q-Exactive-HF-X-Orbitrap_6070	25.504	24.762	23.456	24.765	25.300	29.998	25.209	24.174	24.324	25.649	...	25.516	24.421	25.253	28.000	24.597	24.003	25.747	29.215	29.202	25.338
2020_01_06_20_17_Q-Exactive-HF-X-Orbitrap_6070	28.265	27.735	25.685	27.814	28.167	32.554	25.292	27.979	26.711	27.397	...	25.068	26.179	26.728	28.904	28.565	25.628	25.977	31.364	29.846	28.089
2020_01_08_16_43_Q-Exactive-HF-X-Orbitrap_6070	25.734	25.442	23.851	25.347	25.573	31.127	25.185	24.365	26.179	25.720	...	25.009	24.531	25.172	28.213	27.034	24.768	25.026	29.550	29.219	26.038
2020_01_09_11_07_Q-Exactive-HF-X-Orbitrap_6070	26.167	25.039	24.302	26.877	25.986	31.450	23.883	25.810	26.233	25.938	...	24.810	24.843	25.139	29.214	26.790	24.450	24.593	30.142	29.919	26.579
2020_01_15_13_56_Q-Exactive-HF-X-Orbitrap_6070	26.019	23.935	23.789	24.982	26.254	31.423	25.218	27.088	27.190	24.796	...	24.424	24.100	27.588	27.579	27.219	23.043	24.458	28.274	27.318	23.882
2020_01_20_15_10_Q-Exactive-HF-X-Orbitrap_6070	28.735	25.257	26.648	29.133	28.211	32.706	26.536	28.103	28.168	27.969	...	25.249	26.922	26.504	30.309	29.199	26.144	26.088	31.172	30.340	28.287
2020_02_05_20_55_Q-Exactive-HF-X-Orbitrap_6070	28.251	27.489	25.840	28.594	27.077	32.396	26.182	27.940	27.834	27.513	...	24.657	27.386	25.160	29.116	28.574	25.718	26.055	31.277	30.084	28.386
2020_02_10_15_41_Q-Exactive-HF-X-Orbitrap_6070	29.216	26.687	25.797	27.454	28.929	33.656	27.984	29.301	30.141	27.712	...	26.945	25.851	30.179	29.271	29.721	26.007	26.972	30.912	28.756	26.306
2020_02_11_10_35_Q-Exactive-HF-X-Orbitrap_6070	29.156	27.582	25.516	26.618	28.682	33.080	27.977	29.028	29.880	27.453	...	27.031	25.395	29.081	29.526	29.949	25.797	31.137	30.381	28.867	25.823
2020_02_12_05_06_Q-Exactive-HF-X-Orbitrap_6070	28.986	27.394	26.058	27.348	28.755	33.793	29.140	28.825	30.001	28.209	...	27.372	25.831	29.925	29.958	30.142	25.309	27.381	30.655	29.114	26.138
2020_02_13_00_26_Q-Exactive-HF-X-Orbitrap_6070	29.606	26.766	26.377	26.431	27.898	33.962	28.432	29.481	30.180	27.825	...	26.946	26.276	30.236	29.713	29.636	26.038	27.857	30.738	29.097	26.175
2020_02_13_03_11_Q-Exactive-HF-X-Orbitrap_6070	29.156	26.283	26.031	26.762	28.006	33.848	28.954	29.958	29.293	28.178	...	26.706	26.542	30.055	29.757	29.471	25.575	27.986	30.951	29.057	26.259
2020_02_17_13_55_Q-Exactive-HF-X-Orbitrap_6070	28.435	26.624	25.520	26.281	26.224	32.584	28.020	27.786	28.410	26.512	...	25.928	24.560	28.609	29.086	28.943	24.317	25.066	29.478	28.396	24.931
2020_02_18_01_25_Q-Exactive-HF-X-Orbitrap_6070	28.010	25.331	24.825	25.831	23.969	32.741	26.967	28.236	28.260	27.266	...	24.340	24.216	29.082	28.328	28.738	23.202	25.830	29.733	28.120	22.075
2020_02_18_18_55_Q-Exactive-HF-X-Orbitrap_6070	29.169	24.772	23.877	26.013	27.353	32.433	27.537	28.055	28.457	27.914	...	25.758	24.773	28.231	27.755	28.956	24.298	25.263	29.180	27.856	24.382
2020_02_28_12_27_Q-Exactive-HF-X-Orbitrap_6070	27.702	25.953	24.649	26.582	27.248	32.374	26.985	27.615	27.874	26.436	...	23.680	24.114	28.108	27.367	28.125	23.248	23.587	29.475	28.028	24.595
2020_03_01_23_00_Q-Exactive-HF-X-Orbitrap_6070	27.596	25.257	22.921	24.869	26.878	32.064	26.496	26.617	27.837	25.671	...	23.869	24.104	28.118	27.876	27.828	23.337	25.334	28.864	27.486	24.127
2020_03_06_16_22_Q-Exactive-HF-X-Orbitrap_6070	26.089	23.806	22.751	25.799	24.384	30.550	24.357	24.911	25.216	24.823	...	24.889	24.581	24.773	27.669	25.744	24.451	24.689	28.426	29.060	25.356
2020_03_07_18_15_Q-Exactive-HF-X-Orbitrap_6070	27.002	24.577	25.052	26.740	26.882	31.519	24.914	25.916	24.219	26.380	...	24.296	25.001	25.151	28.962	26.890	25.057	24.324	29.190	29.350	26.855
2020_03_11_11_25_Q-Exactive-HF-X-Orbitrap_6070	27.699	26.344	23.317	25.791	27.523	32.387	27.716	28.179	28.601	27.517	...	26.404	23.770	28.379	28.115	28.806	24.492	25.244	29.521	28.009	23.867
2020_05_04_11_39_Q-Exactive-HF-X-Orbitrap_6070	26.127	25.320	23.936	25.801	25.236	31.588	26.096	27.627	26.755	24.640	...	25.051	23.949	27.260	28.086	27.064	22.932	25.153	28.430	27.139	23.606
2020_05_12_15_13_Q-Exactive-HF-X-Orbitrap_6070	27.138	25.875	24.069	25.722	26.513	31.377	26.142	27.486	27.216	26.009	...	24.442	24.141	27.425	27.544	27.718	22.851	24.988	28.414	27.406	24.011
2020_05_12_18_10_Q-Exactive-HF-X-Orbitrap_6070	26.844	24.719	25.460	25.282	25.656	31.438	26.304	26.419	26.825	24.954	...	24.311	24.117	27.222	26.817	27.165	23.220	22.620	27.841	26.861	23.780
2020_05_12_21_07_Q-Exactive-HF-X-Orbitrap_6070	26.853	25.421	23.887	24.534	25.596	31.049	26.026	26.243	27.130	25.424	...	24.957	24.075	26.966	27.552	26.945	22.819	24.906	28.062	26.830	23.613
2020_05_14_14_46_Q-Exactive-HF-X-Orbitrap_6070	26.203	24.863	23.161	24.822	26.838	31.714	26.299	26.211	26.895	24.676	...	23.434	24.072	27.176	27.620	27.762	23.260	28.917	28.554	27.281	23.539
2020_05_14_17_28_Q-Exactive-HF-X-Orbitrap_6070	27.203	23.885	23.234	25.293	26.415	31.904	26.677	26.635	27.414	25.155	...	25.425	24.426	27.383	27.327	27.766	22.375	29.057	28.312	27.458	24.152
2020_05_14_20_11_Q-Exactive-HF-X-Orbitrap_6070	27.429	25.154	24.332	25.126	26.066	31.983	26.534	27.689	27.233	25.296	...	24.855	24.085	27.429	28.054	27.582	22.707	25.492	28.834	27.948	24.312
2020_05_15_10_30_Q-Exactive-HF-X-Orbitrap_6070	27.494	25.606	25.852	25.617	24.857	32.562	27.693	28.119	28.009	25.884	...	25.679	24.240	27.664	28.274	28.464	22.570	26.519	29.342	27.787	23.671
2020_05_20_12_33_Q-Exactive-HF-X-Orbitrap_6070	27.396	25.293	24.217	25.741	25.893	31.680	25.822	27.158	27.412	25.629	...	25.054	24.491	28.688	25.496	27.340	23.640	25.492	28.276	28.252	24.493
2020_05_20_15_35_Q-Exactive-HF-X-Orbitrap_6070	27.721	24.916	24.125	26.078	26.726	32.361	27.015	27.807	27.217	27.048	...	25.176	23.532	28.525	28.459	27.581	23.529	26.087	28.691	28.056	24.099
2020_05_22_14_57_Q-Exactive-HF-X-Orbitrap_6070	27.526	24.714	24.350	25.857	25.693	32.394	27.619	27.153	28.066	25.988	...	25.031	24.173	27.836	27.771	27.941	23.626	25.669	28.632	26.564	24.705
2020_05_22_17_43_Q-Exactive-HF-X-Orbitrap_6070	28.051	25.608	26.030	25.883	25.536	32.177	27.501	28.506	27.846	25.779	...	26.065	23.442	28.081	28.437	28.081	22.678	25.442	28.992	28.031	24.119
2020_05_26_14_20_Q-Exactive-HF-X-Orbitrap_6070	27.325	26.800	25.519	25.204	27.263	33.171	28.125	28.389	28.409	27.050	...	25.907	25.162	29.071	28.690	28.289	23.853	26.854	29.962	28.684	25.004
2020_05_27_13_57_Q-Exactive-HF-X-Orbitrap_6070	29.119	27.061	25.989	27.999	28.294	34.448	29.088	30.015	29.825	28.311	...	26.922	25.578	30.025	29.927	30.126	24.652	27.951	30.794	29.299	26.444
2020_05_28_04_06_Q-Exactive-HF-X-Orbitrap_6070	30.080	27.373	26.760	27.601	28.023	34.192	28.785	30.316	29.114	29.002	...	26.960	26.407	30.156	30.137	30.456	26.042	28.032	31.340	29.689	26.766
2020_06_01_10_22_Q-Exactive-HF-X-Orbitrap_6070	27.298	25.592	23.002	28.318	25.985	31.398	23.917	25.811	26.357	25.992	...	25.702	24.568	23.989	27.854	27.574	24.337	25.434	29.383	29.336	25.825
2020_06_01_15_41_Q-Exactive-HF-X-Orbitrap_6070	27.121	26.384	25.407	27.254	26.707	31.958	26.180	26.219	28.125	27.165	...	26.590	25.582	26.180	29.010	28.721	25.082	26.552	30.147	29.542	26.405
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070	29.038	25.989	25.120	26.409	28.088	33.220	27.455	28.759	28.676	27.160	...	26.946	24.944	28.451	28.098	28.760	24.503	24.794	29.847	29.212	25.445

50 rows × 4535 columns

Evaluate the model using the validation data:

pred_val = val_X.stack().to_frame('observed')
pred_val[model_selected] = df_imputed.stack()
pred_val

		observed	VAE
Sample ID	protein group
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070	ABHD12	26.389	26.197
	ABI1	26.321	26.613
	ACACA	29.997	30.802
	ACIN1	30.397	29.896
	ACTR1A	29.517	29.938
...	...	...	...
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070	ZNF598	27.412	28.527
	ZNF787	26.053	25.885
	ZNF830	25.724	25.375
	ZPR1	28.973	28.451
	ZW10	29.425	28.760

18749 rows × 2 columns

val_metrics = vaep.models.calculte_metrics(pred_val, 'observed')
# val_metrics = metrics.add_metrics(
#     pred_val, key='test data')
# val_metrics = pd.DataFrame(val_metrics)
# val_metrics
pd.DataFrame(val_metrics)

	VAE
MAE	0.502
MSE	0.519
N	18,749.000
prop	1.000

fig, ax = plt.subplots(figsize=(8, 2))

ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(
    pred=pred_val,
    target_col='observed',
    feat_medians=train_X.median(),
    ax=ax,
    metric_name='MAE',
    palette=color_model_mapping
)

../_images/3b5866951dbabbdc43d201a6cb3165ad8cdb9eaa07faa6b385de6a809161a1f1.png

Replace predicted values with validation data values:

# Put the held-out validation intensities back in place of the model's
# predictions. `DataFrame.replace` substitutes value for value and does not
# overwrite cells by position, so select by (aligned) position instead:
# wherever a validation value exists, take the measured intensity from `df`,
# from which the validation data was sampled.
df_imputed = df_imputed.where(val_X.isna(), df)

df = df.stack()  # long-format
df_imputed = df_imputed.stack()  # long-format
observed = df_imputed.loc[df.index]
imputed = df_imputed.loc[df_imputed.index.difference(df.index)]

fig, axes = plt.subplots(2, figsize=(8, 4))

min_max = vaep.plotting.data.get_min_max_iterable(
    [observed, imputed])
label_template = '{method} (N={n:,d})'
ax, _ = vaep.plotting.data.plot_histogram_intensities(
    observed,
    ax=axes[0],
    min_max=min_max,
    label=label_template.format(method='measured',
                                n=len(observed),
                                ),
    color='grey',
    alpha=1)
_ = ax.legend()
ax, _ = vaep.plotting.data.plot_histogram_intensities(
    imputed,
    ax=axes[1],
    min_max=min_max,
    label=label_template.format(method=f'{model_selected} imputed',
                                n=len(imputed),
                                ),
    color=color_model_mapping[model_selected],
    alpha=1)
_ = ax.legend()

../_images/91d252e3c3f69e1bdc50c2b6278430c5d9133fc2f717c6789440e422ae0bde8e.png

Scikit-learn style transformers of the data

Contents

Scikit-learn style transformers of the data#

Data#

Collaborative Filtering#

AutoEncoder architectures#