Scikit-learn style transformers of the data#
Load data into pandas dataframe
Fit transformer on training data
Impute only missing values with predictions from model
Autoencoders need wide training data, i.e. one row per sample holding the intensities of all its features, whereas Collaborative Filtering needs long training data, i.e. one row per measurement consisting of a sample identifier, a feature identifier and the intensity. Both data formats can be transformed into each other, but models using the long data format do not need to take care of missing values, as missing entries are simply absent.
import os
from importlib import metadata
IN_COLAB = 'COLAB_GPU' in os.environ
if IN_COLAB:
try:
_v = metadata.version('pimms-learn')
print(f"Running in colab and pimms-learn ({_v}) is installed.")
except metadata.PackageNotFoundError:
print("Install PIMMS...")
# !pip install git+https://github.com/RasmussenLab/pimms.git@dev
!pip install pimms-learn
If on colab, please restart the environment and run everything from here on.
import os
# Detect Colab again: everything from here on is meant to be runnable after a
# restart of the environment, so earlier variables cannot be relied upon.
IN_COLAB = 'COLAB_GPU' in os.environ

# Default to the copy of the dataset relative to the working directory; on
# Colab there is no local checkout, so read the same file via its raw GitHub URL.
fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'
if IN_COLAB:
    fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from vaep.plotting.defaults import color_model_mapping
import vaep.plotting.data
import vaep.sampling
from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer
from vaep.sklearn.ae_transformer import AETransformer
# NOTE(review): presumably adjusts the matplotlib text/label sizes used by all
# following plots (argument 8) — confirm against vaep.plotting.
vaep.plotting.make_large_descriptors(8)
Matplotlib is building the font cache; this may take a moment.
Data#
# Load the intensities in wide format; the first CSV column is used as row index.
df = pd.read_csv(fn_intensities, index_col=0)
df.head()
AAAS | AACS | AAMDC | AAMP | AAR2 | AARS | AARS2 | AASDHPPT | AATF | ABCB10 | ... | ZNHIT2 | ZNRF2 | ZPR1 | ZRANB2 | ZW10 | ZWILCH | ZWINT | ZYX | hCG_2014768;TMA7 | pk;ZAK | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sample ID | |||||||||||||||||||||
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | 341,970,000.000 | 73,598,000.000 | NaN | 114,990,000.000 | 159,370,000.000 | 4,916,300,000.000 | 149,190,000.000 | 245,660,000.000 | 407,590,000.000 | 72,440,000.000 | ... | 50,194,000.000 | 23,201,000.000 | 332,480,000.000 | 477,690,000.000 | 484,070,000.000 | NaN | 21,823,000.000 | 721,850,000.000 | 283,680,000.000 | 7,714,600.000 |
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070 | 211,690,000.000 | 33,991,000.000 | 19,762,000.000 | 80,960,000.000 | 155,320,000.000 | 4,233,400,000.000 | 96,914,000.000 | 306,530,000.000 | 257,840,000.000 | 55,844,000.000 | ... | NaN | NaN | 294,320,000.000 | 161,550,000.000 | 317,600,000.000 | NaN | NaN | 283,840,000.000 | NaN | NaN |
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070 | 342,650,000.000 | 14,015,000.000 | NaN | 143,640,000.000 | 174,350,000.000 | 7,929,200,000.000 | 191,730,000.000 | 373,270,000.000 | 458,030,000.000 | 115,780,000.000 | ... | 38,113,000.000 | NaN | 525,090,000.000 | 167,830,000.000 | 702,420,000.000 | NaN | 58,540,000.000 | 772,560,000.000 | NaN | NaN |
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070 | 118,930,000.000 | NaN | NaN | 80,158,000.000 | NaN | 4,081,400,000.000 | 74,818,000.000 | 208,420,000.000 | 242,070,000.000 | 42,650,000.000 | ... | NaN | 12,589,000.000 | 208,620,000.000 | 162,760,000.000 | 282,950,000.000 | NaN | 27,023,000.000 | 461,970,000.000 | NaN | NaN |
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070 | 177,550,000.000 | 129,510,000.000 | 15,272,000.000 | 132,520,000.000 | 96,217,000.000 | 3,854,300,000.000 | 42,534,000.000 | 179,130,000.000 | 186,440,000.000 | NaN | ... | 25,275,000.000 | NaN | 59,980,000.000 | 673,000,000.000 | 188,690,000.000 | 88,943,000.000 | NaN | 1,438,800,000.000 | NaN | 128,670,000.000 |
5 rows × 4535 columns
We will need the data in long format for Collaborative Filtering. Naming both the row and column index assures that the data can be transformed very easily into long format:
# Name both axes: after stacking to long format these names become the names
# of the two index levels.
df.index.name = 'Sample ID' # already set
df.columns.name = 'protein group' # not set due to csv disk file format
df.head()
protein group | AAAS | AACS | AAMDC | AAMP | AAR2 | AARS | AARS2 | AASDHPPT | AATF | ABCB10 | ... | ZNHIT2 | ZNRF2 | ZPR1 | ZRANB2 | ZW10 | ZWILCH | ZWINT | ZYX | hCG_2014768;TMA7 | pk;ZAK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sample ID | |||||||||||||||||||||
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | 341,970,000.000 | 73,598,000.000 | NaN | 114,990,000.000 | 159,370,000.000 | 4,916,300,000.000 | 149,190,000.000 | 245,660,000.000 | 407,590,000.000 | 72,440,000.000 | ... | 50,194,000.000 | 23,201,000.000 | 332,480,000.000 | 477,690,000.000 | 484,070,000.000 | NaN | 21,823,000.000 | 721,850,000.000 | 283,680,000.000 | 7,714,600.000 |
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070 | 211,690,000.000 | 33,991,000.000 | 19,762,000.000 | 80,960,000.000 | 155,320,000.000 | 4,233,400,000.000 | 96,914,000.000 | 306,530,000.000 | 257,840,000.000 | 55,844,000.000 | ... | NaN | NaN | 294,320,000.000 | 161,550,000.000 | 317,600,000.000 | NaN | NaN | 283,840,000.000 | NaN | NaN |
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070 | 342,650,000.000 | 14,015,000.000 | NaN | 143,640,000.000 | 174,350,000.000 | 7,929,200,000.000 | 191,730,000.000 | 373,270,000.000 | 458,030,000.000 | 115,780,000.000 | ... | 38,113,000.000 | NaN | 525,090,000.000 | 167,830,000.000 | 702,420,000.000 | NaN | 58,540,000.000 | 772,560,000.000 | NaN | NaN |
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070 | 118,930,000.000 | NaN | NaN | 80,158,000.000 | NaN | 4,081,400,000.000 | 74,818,000.000 | 208,420,000.000 | 242,070,000.000 | 42,650,000.000 | ... | NaN | 12,589,000.000 | 208,620,000.000 | 162,760,000.000 | 282,950,000.000 | NaN | 27,023,000.000 | 461,970,000.000 | NaN | NaN |
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070 | 177,550,000.000 | 129,510,000.000 | 15,272,000.000 | 132,520,000.000 | 96,217,000.000 | 3,854,300,000.000 | 42,534,000.000 | 179,130,000.000 | 186,440,000.000 | NaN | ... | 25,275,000.000 | NaN | 59,980,000.000 | 673,000,000.000 | 188,690,000.000 | 88,943,000.000 | NaN | 1,438,800,000.000 | NaN | 128,670,000.000 |
5 rows × 4535 columns
Transform the data using the logarithm, here using base 2:
# log2-transform the intensities. The +1 offset maps an intensity of 0 to 0
# instead of -inf; missing values (NaN) stay NaN.
df = np.log2(df + 1)
df.head()
protein group | AAAS | AACS | AAMDC | AAMP | AAR2 | AARS | AARS2 | AASDHPPT | AATF | ABCB10 | ... | ZNHIT2 | ZNRF2 | ZPR1 | ZRANB2 | ZW10 | ZWILCH | ZWINT | ZYX | hCG_2014768;TMA7 | pk;ZAK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sample ID | |||||||||||||||||||||
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | 28.349 | 26.133 | NaN | 26.777 | 27.248 | 32.195 | 27.153 | 27.872 | 28.603 | 26.110 | ... | 25.581 | 24.468 | 28.309 | 28.831 | 28.851 | NaN | 24.379 | 29.427 | 28.080 | 22.879 |
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070 | 27.657 | 25.019 | 24.236 | 26.271 | 27.211 | 31.979 | 26.530 | 28.191 | 27.942 | 25.735 | ... | NaN | NaN | 28.133 | 27.267 | 28.243 | NaN | NaN | 28.081 | NaN | NaN |
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070 | 28.352 | 23.740 | NaN | 27.098 | 27.377 | 32.885 | 27.515 | 28.476 | 28.771 | 26.787 | ... | 25.184 | NaN | 28.968 | 27.322 | 29.388 | NaN | 25.803 | 29.525 | NaN | NaN |
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070 | 26.826 | NaN | NaN | 26.256 | NaN | 31.926 | 26.157 | 27.635 | 27.851 | 25.346 | ... | NaN | 23.586 | 27.636 | 27.278 | 28.076 | NaN | 24.688 | 28.783 | NaN | NaN |
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070 | 27.404 | 26.948 | 23.864 | 26.982 | 26.520 | 31.844 | 25.342 | 27.416 | 27.474 | NaN | ... | 24.591 | NaN | 25.838 | 29.326 | 27.491 | 26.406 | NaN | 30.422 | NaN | 26.939 |
5 rows × 4535 columns
two plots on data availability:
proportion of missing values per feature median (N = protein groups)
CDF of available intensities per protein group
# Plot 1: project helper relating feature medians to the proportion of missing
# values (rendered as boxplots).
ax = vaep.plotting.data.plot_feat_median_over_prop_missing(
    data=df, type='boxplot')
# Plot 2: number of non-missing intensities per protein group, sorted ascending.
df.notna().sum().sort_values().plot()
<Axes: xlabel='protein group'>
Define a minimum prevalence for a feature (share of samples in which it was measured) and for a sample (share of features measured in it) to be included:
# Switch: set to False to skip the prevalence-based filtering of the data.
SELECT_FEAT = True
def select_features(df, feat_prevalence=.2, axis=0):
    """Keep only columns (or rows) with enough non-missing values.

    The required number of non-missing values is
    ``df.shape[axis] * feat_prevalence``. Entries along ``axis`` are counted
    and every column (``axis=0``) or row (any other valid axis, i.e. ``1``)
    reaching that count is kept. The number of dropped columns/rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide data; missing values are detected with ``DataFrame.notna``.
    feat_prevalence : float, default 0.2
        Minimum share of non-missing values required (inclusive threshold).
    axis : int, default 0
        ``0`` counts over rows and filters columns; ``1`` counts over columns
        and filters rows.

    Returns
    -------
    pandas.DataFrame
        The filtered selection of ``df`` (original order preserved).
    """
    # ! vaep.filter.select_features
    n_entries = df.shape[axis]
    minimum_freq = n_entries * feat_prevalence
    freq = df.notna().sum(axis=axis)
    mask = freq >= minimum_freq
    print(f"Drop {(~mask).sum()} along axis {axis}.")
    if axis == 0:
        # counts are per column -> boolean column selection
        return df.loc[:, mask]
    # counts are per row -> boolean row selection
    return df.loc[mask]
if SELECT_FEAT:
    # potentially this can take a few iterations to stabilize.
    # First drop protein groups measured in less than 20% of the samples ...
    df = select_features(df, feat_prevalence=.2)
    # ... then drop samples with less than 30% of the remaining protein groups.
    df = select_features(df=df, feat_prevalence=.3, axis=1)
# Kept at top level so the resulting shape is displayed as the cell output.
df.shape
Drop 91 along axis 0.
Drop 0 along axis 1.
(50, 4444)
Transform to long-data format:
# Wide -> long: one row per (Sample ID, protein group) pair with its intensity.
# Missing intensities must not appear in the long format. The legacy ``stack``
# implementation removes them implicitly, the newer implementation
# (``future_stack``) keeps them, so drop them explicitly (a no-op otherwise).
df = df.stack().dropna().to_frame('intensity')
df.head()
intensity | ||
---|---|---|
Sample ID | protein group | |
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | AAAS | 28.349 |
AACS | 26.133 | |
AAMP | 26.777 | |
AAR2 | 27.248 | |
AARS | 32.195 |
The resulting DataFrame with one column has a `MultiIndex` consisting of the sample and the feature identifier.
Collaborative Filtering#
# # # CollaborativeFilteringTransformer?
Let’s set up collaborative filtering without a validation or test set, using all the data there is.
# Configure the collaborative filtering transformer with the names used in the
# long-format data: the value column and the two index level names.
cf_model = CollaborativeFilteringTransformer(
    target_column='intensity',
    sample_column='Sample ID',
    item_column='protein group',
    out_folder='runs/scikit_interface')
We use `fit` and `transform` to train the model and impute the missing values.
Scikit-learn's interface requires an `X` and a `y`. `y` is the validation data in our context. We might have to change the interface to allow usage within pipelines (-> `y` is not needed). This will probably mean setting up a validation set within the model.
# Fit on all available long-format data; no validation data is passed.
# cuda=False / epochs_max=20: presumably CPU training capped at 20 epochs —
# the training log of this run shows epochs 0-19.
cf_model.fit(df,
             cuda=False,
             epochs_max=20,
             )
suggested_lr.valley = 0.00759
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 8.529963 | None | 00:00 |
1 | 7.200674 | None | 00:00 |
2 | 3.814727 | None | 00:00 |
3 | 2.007462 | None | 00:00 |
4 | 1.234751 | None | 00:00 |
5 | 0.811100 | None | 00:00 |
6 | 0.610548 | None | 00:00 |
7 | 0.512519 | None | 00:00 |
8 | 0.456441 | None | 00:00 |
9 | 0.416702 | None | 00:00 |
10 | 0.382653 | None | 00:00 |
11 | 0.350776 | None | 00:00 |
12 | 0.326353 | None | 00:00 |
13 | 0.307845 | None | 00:00 |
14 | 0.291788 | None | 00:00 |
15 | 0.278374 | None | 00:00 |
16 | 0.270331 | None | 00:00 |
17 | 0.265799 | None | 00:00 |
18 | 0.262403 | None | 00:00 |
19 | 0.260033 | None | 00:00 |
CollaborativeFilteringTransformer(item_column='protein group', out_folder=Path('runs/scikit_interface'), sample_column='Sample ID', target_column='intensity')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CollaborativeFilteringTransformer(item_column='protein group', out_folder=Path('runs/scikit_interface'), sample_column='Sample ID', target_column='intensity')
# Impute with the fitted model and pivot the result back to wide format
# (samples as rows, protein groups as columns).
df_imputed = cf_model.transform(df).unstack()
# Sanity check: no missing value may remain after imputation.
assert df_imputed.isna().sum().sum() == 0
df_imputed.head()
protein group | AAAS | AACS | AAMDC | AAMP | AAR2 | AARS | AARS2 | AASDHPPT | AATF | ABCB10 | ... | ZNHIT2 | ZNRF2 | ZPR1 | ZRANB2 | ZW10 | ZWILCH | ZWINT | ZYX | hCG_2014768;TMA7 | pk;ZAK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sample ID | |||||||||||||||||||||
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | 28.349 | 26.133 | 24.183 | 26.777 | 27.248 | 32.195 | 27.153 | 27.872 | 28.603 | 26.110 | ... | 25.581 | 24.468 | 28.309 | 28.831 | 28.851 | 23.767 | 24.379 | 29.427 | 28.080 | 22.879 |
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070 | 27.657 | 25.019 | 24.236 | 26.271 | 27.211 | 31.979 | 26.530 | 28.191 | 27.942 | 25.735 | ... | 24.828 | 23.950 | 28.133 | 27.267 | 28.243 | 23.228 | 24.977 | 28.081 | 28.355 | 22.424 |
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070 | 28.352 | 23.740 | 24.532 | 27.098 | 27.377 | 32.885 | 27.515 | 28.476 | 28.771 | 26.787 | ... | 25.184 | 24.808 | 28.968 | 27.322 | 29.388 | 24.116 | 25.803 | 29.525 | 28.997 | 23.098 |
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070 | 26.826 | 24.534 | 24.005 | 26.256 | 27.314 | 31.926 | 26.157 | 27.635 | 27.851 | 25.346 | ... | 25.152 | 23.586 | 27.636 | 27.278 | 28.076 | 23.038 | 24.688 | 28.783 | 28.311 | 22.552 |
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070 | 27.404 | 26.948 | 23.864 | 26.982 | 26.520 | 31.844 | 25.342 | 27.416 | 27.474 | 27.151 | ... | 24.591 | 25.831 | 25.838 | 29.326 | 27.491 | 26.406 | 25.206 | 30.422 | 29.979 | 26.939 |
5 rows × 4444 columns
Let’s plot the distribution of the imputed values vs the ones used for training:
# Split the completed data into measured and model-imputed intensities using
# the index of the long-format training data ``df``.
stacked = df_imputed.stack()  # long-format
measured_idx = df.index
observed = stacked.loc[measured_idx]
imputed = stacked.loc[stacked.index.difference(measured_idx)]
df_imputed = stacked.unstack()  # back to wide-format

# some checks: every training entry is found again, and measured plus imputed
# entries fill the complete sample x feature grid
n_rows, n_cols = df_imputed.shape
assert len(df) == len(observed)
assert n_rows * n_cols == len(imputed) + len(observed)
# Two stacked histograms on a shared intensity range: measured intensities on
# top, CF-imputed intensities below.
fig, axes = plt.subplots(2, figsize=(8, 4))
min_max = vaep.plotting.data.get_min_max_iterable(
    [observed, imputed])
label_template = '{method} (N={n:,d})'

# (values, legend label prefix, bar color) per panel, top to bottom
panels = (
    (observed, 'measured', 'grey'),
    (imputed, 'CF imputed', color_model_mapping['CF']),
)
for ax, (values, method, color) in zip(axes, panels):
    ax, _ = vaep.plotting.data.plot_histogram_intensities(
        values,
        ax=ax,
        min_max=min_max,
        label=label_template.format(method=method, n=len(values)),
        color=color,
        alpha=1)
    _ = ax.legend()
AutoEncoder architectures#
# Reload data (for demonstration)
df = pd.read_csv(fn_intensities, index_col=0)
df.index.name = 'Sample ID' # already set
df.columns.name = 'protein group' # not set due to csv disk file format
df = np.log2(df + 1) # log transform
df.head()
protein group | AAAS | AACS | AAMDC | AAMP | AAR2 | AARS | AARS2 | AASDHPPT | AATF | ABCB10 | ... | ZNHIT2 | ZNRF2 | ZPR1 | ZRANB2 | ZW10 | ZWILCH | ZWINT | ZYX | hCG_2014768;TMA7 | pk;ZAK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sample ID | |||||||||||||||||||||
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | 28.349 | 26.133 | NaN | 26.777 | 27.248 | 32.195 | 27.153 | 27.872 | 28.603 | 26.110 | ... | 25.581 | 24.468 | 28.309 | 28.831 | 28.851 | NaN | 24.379 | 29.427 | 28.080 | 22.879 |
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070 | 27.657 | 25.019 | 24.236 | 26.271 | 27.211 | 31.979 | 26.530 | 28.191 | 27.942 | 25.735 | ... | NaN | NaN | 28.133 | 27.267 | 28.243 | NaN | NaN | 28.081 | NaN | NaN |
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070 | 28.352 | 23.740 | NaN | 27.098 | 27.377 | 32.885 | 27.515 | 28.476 | 28.771 | 26.787 | ... | 25.184 | NaN | 28.968 | 27.322 | 29.388 | NaN | 25.803 | 29.525 | NaN | NaN |
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070 | 26.826 | NaN | NaN | 26.256 | NaN | 31.926 | 26.157 | 27.635 | 27.851 | 25.346 | ... | NaN | 23.586 | 27.636 | 27.278 | 28.076 | NaN | 24.688 | 28.783 | NaN | NaN |
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070 | 27.404 | 26.948 | 23.864 | 26.982 | 26.520 | 31.844 | 25.342 | 27.416 | 27.474 | NaN | ... | 24.591 | NaN | 25.838 | 29.326 | 27.491 | 26.406 | NaN | 30.422 | NaN | 26.939 |
5 rows × 4535 columns
The AutoEncoder model currently needs validation data for training. We will use 10% of the training data for validation.
Expect this limitation to be dropped in the next release. It will still be recommended to use validation data for early stopping.
# Number of samples with a non-missing intensity, per protein group (column).
# Used as sampling weights for the validation split.
freq_feat = df.notna().sum()
freq_feat.head() # training data
protein group
AAAS 50
AACS 41
AAMDC 34
AAMP 49
AAR2 45
dtype: int64
We will use the `sampling` module to sample the validation data from the training data.
The data could be split differently by providing another `weights` vector.
# Sample 10% of the measured intensities (long format) as validation data,
# weighted by how often each protein group was measured; the rest is used for
# training. ``.dropna()`` makes sure only measured values can be sampled even
# if ``stack`` keeps NaN entries (newer stack implementation).
val_X, train_X = vaep.sampling.sample_data(df.stack().dropna(),
                                           sample_index_to_drop=0,
                                           weights=freq_feat,
                                           frac=0.1,
                                           random_state=42,)
val_X, train_X = val_X.unstack(), train_X.unstack()
# Give the validation data exactly the rows/columns of the training data.
# Samples or protein groups without a validation value become all-NaN, and
# the frame stays float instead of turning into an object-dtype frame of pd.NA.
val_X = val_X.reindex(index=train_X.index, columns=train_X.columns)
Training data and validation data have the same shape:
# Both frames share index and columns, hence the identical shapes.
val_X.shape, train_X.shape
((50, 4535), (50, 4535))
… but different number of intensities (non-missing values):
# Total number of non-missing intensities in training and validation data.
train_X.notna().sum().sum(), val_X.notna().sum().sum(),
(168772, 18749)
Select either DAE
or VAE
model:
# Model choice; also used below as column name for the predictions and as key
# into ``color_model_mapping``.
model_selected = 'VAE' # 'DAE'
# Architecture as configured here: one hidden layer (512) and a latent
# dimension of 50, trained with a batch size of 10.
model = AETransformer(
    model=model_selected,
    hidden_layers=[512,],
    latent_dim=50,
    out_folder='runs/scikit_interface',
    batch_size=10,
)
# The second positional argument is the validation data (wide format, aligned
# to the training data).
model.fit(train_X, val_X,
          epochs_max=50,
          cuda=False)
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 5263.688965 | 575.540161 | 00:00 |
1 | 5250.653320 | 582.976990 | 00:00 |
2 | 5232.384277 | 589.242188 | 00:00 |
3 | 5203.558105 | 590.081299 | 00:00 |
4 | 5161.442871 | 578.325928 | 00:00 |
5 | 5100.220215 | 562.913696 | 00:00 |
6 | 5008.938477 | 542.291443 | 00:00 |
7 | 4884.893066 | 527.598633 | 00:00 |
8 | 4762.076660 | 534.657715 | 00:00 |
9 | 4619.890625 | 470.860931 | 00:00 |
10 | 4479.796387 | 432.840454 | 00:00 |
11 | 4362.392090 | 426.794220 | 00:00 |
12 | 4210.000000 | 456.412994 | 00:00 |
13 | 4120.044434 | 409.974152 | 00:00 |
14 | 4012.461670 | 425.563751 | 00:00 |
15 | 3904.450439 | 399.370148 | 00:00 |
16 | 3836.915039 | 409.731140 | 00:00 |
17 | 3762.779053 | 390.391632 | 00:00 |
18 | 3668.801270 | 390.791595 | 00:00 |
19 | 3571.271484 | 385.722076 | 00:00 |
20 | 3565.185059 | 388.589874 | 00:00 |
21 | 3510.432861 | 388.273590 | 00:00 |
22 | 3501.600342 | 401.065979 | 00:00 |
23 | 3456.472900 | 402.170776 | 00:00 |
24 | 3383.094727 | 389.488190 | 00:00 |
25 | 3327.834229 | 378.514038 | 00:00 |
26 | 3325.521484 | 376.065063 | 00:00 |
27 | 3327.646484 | 388.688782 | 00:00 |
28 | 3314.706299 | 391.576630 | 00:00 |
29 | 3276.506592 | 391.946381 | 00:00 |
30 | 3237.869385 | 384.283508 | 00:00 |
31 | 3188.368652 | 378.874847 | 00:00 |
32 | 3200.665039 | 378.588867 | 00:00 |
33 | 3154.601562 | 377.464325 | 00:00 |
34 | 3113.523926 | 375.160248 | 00:00 |
35 | 3101.998535 | 372.518280 | 00:00 |
36 | 3088.725830 | 372.239777 | 00:00 |
37 | 3103.972900 | 373.625519 | 00:00 |
38 | 3060.247803 | 371.208466 | 00:00 |
39 | 3069.627197 | 369.537537 | 00:00 |
40 | 3007.891113 | 371.785583 | 00:00 |
41 | 2999.756592 | 369.459991 | 00:00 |
42 | 3058.050049 | 371.742096 | 00:00 |
43 | 3041.616943 | 373.255280 | 00:00 |
44 | 2997.935303 | 373.980865 | 00:00 |
45 | 2993.135742 | 373.912109 | 00:00 |
46 | 2983.536621 | 371.833679 | 00:00 |
47 | 2969.012207 | 372.888062 | 00:00 |
48 | 2950.870850 | 372.356720 | 00:00 |
49 | 2918.028076 | 371.421478 | 00:00 |
AETransformer(batch_size=10, hidden_layers=[512], latent_dim=50, model=<class 'vaep.models.vae.VAE'>, out_folder=Path('runs/scikit_interface'))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
AETransformer(batch_size=10, hidden_layers=[512], latent_dim=50, model=<class 'vaep.models.vae.VAE'>, out_folder=Path('runs/scikit_interface'))
# Impute the training data with the fitted model (wide format). The held-out
# validation entries are missing in ``train_X`` and are therefore predicted too.
df_imputed = model.transform(train_X)
df_imputed
protein group | AAAS | AACS | AAMDC | AAMP | AAR2 | AARS | AARS2 | AASDHPPT | AATF | ABCB10 | ... | ZNHIT2 | ZNRF2 | ZPR1 | ZRANB2 | ZW10 | ZWILCH | ZWINT | ZYX | hCG_2014768;TMA7 | pk;ZAK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sample ID | |||||||||||||||||||||
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | 28.349 | 26.133 | 24.687 | 26.777 | 27.248 | 32.195 | 27.153 | 27.872 | 28.603 | 26.110 | ... | 25.581 | 24.468 | 28.309 | 28.831 | 28.851 | 24.142 | 24.379 | 29.427 | 28.080 | 22.879 |
2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070 | 27.657 | 25.019 | 24.236 | 26.271 | 27.211 | 31.979 | 26.530 | 28.191 | 27.942 | 25.735 | ... | 25.037 | 24.124 | 28.133 | 27.267 | 28.243 | 23.879 | 25.562 | 29.072 | 27.862 | 24.778 |
2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070 | 28.352 | 23.740 | 24.631 | 27.098 | 27.377 | 32.885 | 27.515 | 28.476 | 28.771 | 26.787 | ... | 25.184 | 24.374 | 28.968 | 27.322 | 29.388 | 23.927 | 25.803 | 29.525 | 28.484 | 24.597 |
2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070 | 26.826 | 25.263 | 24.446 | 26.256 | 26.351 | 31.926 | 26.157 | 27.635 | 27.851 | 25.346 | ... | 24.888 | 23.586 | 27.636 | 27.278 | 28.067 | 23.561 | 24.688 | 28.783 | 27.626 | 24.350 |
2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070 | 27.404 | 26.948 | 23.864 | 26.982 | 26.520 | 31.844 | 25.342 | 27.416 | 27.474 | 26.771 | ... | 24.591 | 25.577 | 25.838 | 29.326 | 27.491 | 26.406 | 24.891 | 30.422 | 29.450 | 26.939 |
2019_12_29_18_18_Q-Exactive-HF-X-Orbitrap_6070 | 27.891 | 26.481 | 26.348 | 27.849 | 26.917 | 32.274 | 26.128 | 27.514 | 28.081 | 27.181 | ... | 24.849 | 25.820 | 26.750 | 29.652 | 27.635 | 26.037 | 25.002 | 30.928 | 29.656 | 27.088 |
2020_01_02_17_38_Q-Exactive-HF-X-Orbitrap_6070 | 25.527 | 24.733 | 23.395 | 25.810 | 25.368 | 30.226 | 24.805 | 23.801 | 25.130 | 25.508 | ... | 25.393 | 24.554 | 25.560 | 27.518 | 25.148 | 24.108 | 25.444 | 28.465 | 28.902 | 25.637 |
2020_01_03_11_17_Q-Exactive-HF-X-Orbitrap_6070 | 26.346 | 25.169 | 24.433 | 25.275 | 24.846 | 31.184 | 25.643 | 24.893 | 25.324 | 25.888 | ... | 25.185 | 24.573 | 24.336 | 27.579 | 26.528 | 24.643 | 25.398 | 29.726 | 28.933 | 26.036 |
2020_01_03_16_58_Q-Exactive-HF-X-Orbitrap_6070 | 27.620 | 25.624 | 23.520 | 27.136 | 25.971 | 31.415 | 25.360 | 26.165 | 25.750 | 26.424 | ... | 24.982 | 24.940 | 25.198 | 28.906 | 27.331 | 25.069 | 25.009 | 30.952 | 29.513 | 27.086 |
2020_01_03_20_10_Q-Exactive-HF-X-Orbitrap_6070 | 27.300 | 25.950 | 25.660 | 27.133 | 26.897 | 31.455 | 25.437 | 26.813 | 26.201 | 26.407 | ... | 24.685 | 24.661 | 25.071 | 28.636 | 26.542 | 24.685 | 25.325 | 31.005 | 29.326 | 27.081 |
2020_01_04_04_23_Q-Exactive-HF-X-Orbitrap_6070 | 27.978 | 27.612 | 26.211 | 27.912 | 26.846 | 32.207 | 26.207 | 27.276 | 26.386 | 26.789 | ... | 25.052 | 25.824 | 26.646 | 29.886 | 28.079 | 24.789 | 25.821 | 31.040 | 29.584 | 27.296 |
2020_01_04_10_03_Q-Exactive-HF-X-Orbitrap_6070 | 22.666 | 24.490 | 23.081 | 25.780 | 24.899 | 30.259 | 25.352 | 23.758 | 25.000 | 25.412 | ... | 25.809 | 24.116 | 25.909 | 25.360 | 23.210 | 23.409 | 26.160 | 27.829 | 29.235 | 24.343 |
2020_01_04_14_59_Q-Exactive-HF-X-Orbitrap_6070 | 25.504 | 24.762 | 23.456 | 24.765 | 25.300 | 29.998 | 25.209 | 24.174 | 24.324 | 25.649 | ... | 25.516 | 24.421 | 25.253 | 28.000 | 24.597 | 24.003 | 25.747 | 29.215 | 29.202 | 25.338 |
2020_01_06_20_17_Q-Exactive-HF-X-Orbitrap_6070 | 28.265 | 27.735 | 25.685 | 27.814 | 28.167 | 32.554 | 25.292 | 27.979 | 26.711 | 27.397 | ... | 25.068 | 26.179 | 26.728 | 28.904 | 28.565 | 25.628 | 25.977 | 31.364 | 29.846 | 28.089 |
2020_01_08_16_43_Q-Exactive-HF-X-Orbitrap_6070 | 25.734 | 25.442 | 23.851 | 25.347 | 25.573 | 31.127 | 25.185 | 24.365 | 26.179 | 25.720 | ... | 25.009 | 24.531 | 25.172 | 28.213 | 27.034 | 24.768 | 25.026 | 29.550 | 29.219 | 26.038 |
2020_01_09_11_07_Q-Exactive-HF-X-Orbitrap_6070 | 26.167 | 25.039 | 24.302 | 26.877 | 25.986 | 31.450 | 23.883 | 25.810 | 26.233 | 25.938 | ... | 24.810 | 24.843 | 25.139 | 29.214 | 26.790 | 24.450 | 24.593 | 30.142 | 29.919 | 26.579 |
2020_01_15_13_56_Q-Exactive-HF-X-Orbitrap_6070 | 26.019 | 23.935 | 23.789 | 24.982 | 26.254 | 31.423 | 25.218 | 27.088 | 27.190 | 24.796 | ... | 24.424 | 24.100 | 27.588 | 27.579 | 27.219 | 23.043 | 24.458 | 28.274 | 27.318 | 23.882 |
2020_01_20_15_10_Q-Exactive-HF-X-Orbitrap_6070 | 28.735 | 25.257 | 26.648 | 29.133 | 28.211 | 32.706 | 26.536 | 28.103 | 28.168 | 27.969 | ... | 25.249 | 26.922 | 26.504 | 30.309 | 29.199 | 26.144 | 26.088 | 31.172 | 30.340 | 28.287 |
2020_02_05_20_55_Q-Exactive-HF-X-Orbitrap_6070 | 28.251 | 27.489 | 25.840 | 28.594 | 27.077 | 32.396 | 26.182 | 27.940 | 27.834 | 27.513 | ... | 24.657 | 27.386 | 25.160 | 29.116 | 28.574 | 25.718 | 26.055 | 31.277 | 30.084 | 28.386 |
2020_02_10_15_41_Q-Exactive-HF-X-Orbitrap_6070 | 29.216 | 26.687 | 25.797 | 27.454 | 28.929 | 33.656 | 27.984 | 29.301 | 30.141 | 27.712 | ... | 26.945 | 25.851 | 30.179 | 29.271 | 29.721 | 26.007 | 26.972 | 30.912 | 28.756 | 26.306 |
2020_02_11_10_35_Q-Exactive-HF-X-Orbitrap_6070 | 29.156 | 27.582 | 25.516 | 26.618 | 28.682 | 33.080 | 27.977 | 29.028 | 29.880 | 27.453 | ... | 27.031 | 25.395 | 29.081 | 29.526 | 29.949 | 25.797 | 31.137 | 30.381 | 28.867 | 25.823 |
2020_02_12_05_06_Q-Exactive-HF-X-Orbitrap_6070 | 28.986 | 27.394 | 26.058 | 27.348 | 28.755 | 33.793 | 29.140 | 28.825 | 30.001 | 28.209 | ... | 27.372 | 25.831 | 29.925 | 29.958 | 30.142 | 25.309 | 27.381 | 30.655 | 29.114 | 26.138 |
2020_02_13_00_26_Q-Exactive-HF-X-Orbitrap_6070 | 29.606 | 26.766 | 26.377 | 26.431 | 27.898 | 33.962 | 28.432 | 29.481 | 30.180 | 27.825 | ... | 26.946 | 26.276 | 30.236 | 29.713 | 29.636 | 26.038 | 27.857 | 30.738 | 29.097 | 26.175 |
2020_02_13_03_11_Q-Exactive-HF-X-Orbitrap_6070 | 29.156 | 26.283 | 26.031 | 26.762 | 28.006 | 33.848 | 28.954 | 29.958 | 29.293 | 28.178 | ... | 26.706 | 26.542 | 30.055 | 29.757 | 29.471 | 25.575 | 27.986 | 30.951 | 29.057 | 26.259 |
2020_02_17_13_55_Q-Exactive-HF-X-Orbitrap_6070 | 28.435 | 26.624 | 25.520 | 26.281 | 26.224 | 32.584 | 28.020 | 27.786 | 28.410 | 26.512 | ... | 25.928 | 24.560 | 28.609 | 29.086 | 28.943 | 24.317 | 25.066 | 29.478 | 28.396 | 24.931 |
2020_02_18_01_25_Q-Exactive-HF-X-Orbitrap_6070 | 28.010 | 25.331 | 24.825 | 25.831 | 23.969 | 32.741 | 26.967 | 28.236 | 28.260 | 27.266 | ... | 24.340 | 24.216 | 29.082 | 28.328 | 28.738 | 23.202 | 25.830 | 29.733 | 28.120 | 22.075 |
2020_02_18_18_55_Q-Exactive-HF-X-Orbitrap_6070 | 29.169 | 24.772 | 23.877 | 26.013 | 27.353 | 32.433 | 27.537 | 28.055 | 28.457 | 27.914 | ... | 25.758 | 24.773 | 28.231 | 27.755 | 28.956 | 24.298 | 25.263 | 29.180 | 27.856 | 24.382 |
2020_02_28_12_27_Q-Exactive-HF-X-Orbitrap_6070 | 27.702 | 25.953 | 24.649 | 26.582 | 27.248 | 32.374 | 26.985 | 27.615 | 27.874 | 26.436 | ... | 23.680 | 24.114 | 28.108 | 27.367 | 28.125 | 23.248 | 23.587 | 29.475 | 28.028 | 24.595 |
2020_03_01_23_00_Q-Exactive-HF-X-Orbitrap_6070 | 27.596 | 25.257 | 22.921 | 24.869 | 26.878 | 32.064 | 26.496 | 26.617 | 27.837 | 25.671 | ... | 23.869 | 24.104 | 28.118 | 27.876 | 27.828 | 23.337 | 25.334 | 28.864 | 27.486 | 24.127 |
2020_03_06_16_22_Q-Exactive-HF-X-Orbitrap_6070 | 26.089 | 23.806 | 22.751 | 25.799 | 24.384 | 30.550 | 24.357 | 24.911 | 25.216 | 24.823 | ... | 24.889 | 24.581 | 24.773 | 27.669 | 25.744 | 24.451 | 24.689 | 28.426 | 29.060 | 25.356 |
2020_03_07_18_15_Q-Exactive-HF-X-Orbitrap_6070 | 27.002 | 24.577 | 25.052 | 26.740 | 26.882 | 31.519 | 24.914 | 25.916 | 24.219 | 26.380 | ... | 24.296 | 25.001 | 25.151 | 28.962 | 26.890 | 25.057 | 24.324 | 29.190 | 29.350 | 26.855 |
2020_03_11_11_25_Q-Exactive-HF-X-Orbitrap_6070 | 27.699 | 26.344 | 23.317 | 25.791 | 27.523 | 32.387 | 27.716 | 28.179 | 28.601 | 27.517 | ... | 26.404 | 23.770 | 28.379 | 28.115 | 28.806 | 24.492 | 25.244 | 29.521 | 28.009 | 23.867 |
2020_05_04_11_39_Q-Exactive-HF-X-Orbitrap_6070 | 26.127 | 25.320 | 23.936 | 25.801 | 25.236 | 31.588 | 26.096 | 27.627 | 26.755 | 24.640 | ... | 25.051 | 23.949 | 27.260 | 28.086 | 27.064 | 22.932 | 25.153 | 28.430 | 27.139 | 23.606 |
2020_05_12_15_13_Q-Exactive-HF-X-Orbitrap_6070 | 27.138 | 25.875 | 24.069 | 25.722 | 26.513 | 31.377 | 26.142 | 27.486 | 27.216 | 26.009 | ... | 24.442 | 24.141 | 27.425 | 27.544 | 27.718 | 22.851 | 24.988 | 28.414 | 27.406 | 24.011 |
2020_05_12_18_10_Q-Exactive-HF-X-Orbitrap_6070 | 26.844 | 24.719 | 25.460 | 25.282 | 25.656 | 31.438 | 26.304 | 26.419 | 26.825 | 24.954 | ... | 24.311 | 24.117 | 27.222 | 26.817 | 27.165 | 23.220 | 22.620 | 27.841 | 26.861 | 23.780 |
2020_05_12_21_07_Q-Exactive-HF-X-Orbitrap_6070 | 26.853 | 25.421 | 23.887 | 24.534 | 25.596 | 31.049 | 26.026 | 26.243 | 27.130 | 25.424 | ... | 24.957 | 24.075 | 26.966 | 27.552 | 26.945 | 22.819 | 24.906 | 28.062 | 26.830 | 23.613 |
2020_05_14_14_46_Q-Exactive-HF-X-Orbitrap_6070 | 26.203 | 24.863 | 23.161 | 24.822 | 26.838 | 31.714 | 26.299 | 26.211 | 26.895 | 24.676 | ... | 23.434 | 24.072 | 27.176 | 27.620 | 27.762 | 23.260 | 28.917 | 28.554 | 27.281 | 23.539 |
2020_05_14_17_28_Q-Exactive-HF-X-Orbitrap_6070 | 27.203 | 23.885 | 23.234 | 25.293 | 26.415 | 31.904 | 26.677 | 26.635 | 27.414 | 25.155 | ... | 25.425 | 24.426 | 27.383 | 27.327 | 27.766 | 22.375 | 29.057 | 28.312 | 27.458 | 24.152 |
2020_05_14_20_11_Q-Exactive-HF-X-Orbitrap_6070 | 27.429 | 25.154 | 24.332 | 25.126 | 26.066 | 31.983 | 26.534 | 27.689 | 27.233 | 25.296 | ... | 24.855 | 24.085 | 27.429 | 28.054 | 27.582 | 22.707 | 25.492 | 28.834 | 27.948 | 24.312 |
2020_05_15_10_30_Q-Exactive-HF-X-Orbitrap_6070 | 27.494 | 25.606 | 25.852 | 25.617 | 24.857 | 32.562 | 27.693 | 28.119 | 28.009 | 25.884 | ... | 25.679 | 24.240 | 27.664 | 28.274 | 28.464 | 22.570 | 26.519 | 29.342 | 27.787 | 23.671 |
2020_05_20_12_33_Q-Exactive-HF-X-Orbitrap_6070 | 27.396 | 25.293 | 24.217 | 25.741 | 25.893 | 31.680 | 25.822 | 27.158 | 27.412 | 25.629 | ... | 25.054 | 24.491 | 28.688 | 25.496 | 27.340 | 23.640 | 25.492 | 28.276 | 28.252 | 24.493 |
2020_05_20_15_35_Q-Exactive-HF-X-Orbitrap_6070 | 27.721 | 24.916 | 24.125 | 26.078 | 26.726 | 32.361 | 27.015 | 27.807 | 27.217 | 27.048 | ... | 25.176 | 23.532 | 28.525 | 28.459 | 27.581 | 23.529 | 26.087 | 28.691 | 28.056 | 24.099 |
2020_05_22_14_57_Q-Exactive-HF-X-Orbitrap_6070 | 27.526 | 24.714 | 24.350 | 25.857 | 25.693 | 32.394 | 27.619 | 27.153 | 28.066 | 25.988 | ... | 25.031 | 24.173 | 27.836 | 27.771 | 27.941 | 23.626 | 25.669 | 28.632 | 26.564 | 24.705 |
2020_05_22_17_43_Q-Exactive-HF-X-Orbitrap_6070 | 28.051 | 25.608 | 26.030 | 25.883 | 25.536 | 32.177 | 27.501 | 28.506 | 27.846 | 25.779 | ... | 26.065 | 23.442 | 28.081 | 28.437 | 28.081 | 22.678 | 25.442 | 28.992 | 28.031 | 24.119 |
2020_05_26_14_20_Q-Exactive-HF-X-Orbitrap_6070 | 27.325 | 26.800 | 25.519 | 25.204 | 27.263 | 33.171 | 28.125 | 28.389 | 28.409 | 27.050 | ... | 25.907 | 25.162 | 29.071 | 28.690 | 28.289 | 23.853 | 26.854 | 29.962 | 28.684 | 25.004 |
2020_05_27_13_57_Q-Exactive-HF-X-Orbitrap_6070 | 29.119 | 27.061 | 25.989 | 27.999 | 28.294 | 34.448 | 29.088 | 30.015 | 29.825 | 28.311 | ... | 26.922 | 25.578 | 30.025 | 29.927 | 30.126 | 24.652 | 27.951 | 30.794 | 29.299 | 26.444 |
2020_05_28_04_06_Q-Exactive-HF-X-Orbitrap_6070 | 30.080 | 27.373 | 26.760 | 27.601 | 28.023 | 34.192 | 28.785 | 30.316 | 29.114 | 29.002 | ... | 26.960 | 26.407 | 30.156 | 30.137 | 30.456 | 26.042 | 28.032 | 31.340 | 29.689 | 26.766 |
2020_06_01_10_22_Q-Exactive-HF-X-Orbitrap_6070 | 27.298 | 25.592 | 23.002 | 28.318 | 25.985 | 31.398 | 23.917 | 25.811 | 26.357 | 25.992 | ... | 25.702 | 24.568 | 23.989 | 27.854 | 27.574 | 24.337 | 25.434 | 29.383 | 29.336 | 25.825 |
2020_06_01_15_41_Q-Exactive-HF-X-Orbitrap_6070 | 27.121 | 26.384 | 25.407 | 27.254 | 26.707 | 31.958 | 26.180 | 26.219 | 28.125 | 27.165 | ... | 26.590 | 25.582 | 26.180 | 29.010 | 28.721 | 25.082 | 26.552 | 30.147 | 29.542 | 26.405 |
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070 | 29.038 | 25.989 | 25.120 | 26.409 | 28.088 | 33.220 | 27.455 | 28.759 | 28.676 | 27.160 | ... | 26.946 | 24.944 | 28.451 | 28.098 | 28.760 | 24.503 | 24.794 | 29.847 | 29.212 | 25.445 |
50 rows × 4535 columns
Evaluate the model using the validation data:
# Long-format table of the held-out validation intensities ...
# ``val_X`` is mostly NaN; drop those entries explicitly so the evaluation only
# contains real validation measurements, whichever ``stack`` implementation is
# active (the legacy one drops NaN implicitly, making this a no-op).
pred_val = val_X.stack().dropna().to_frame('observed')
# ... next to the model predictions, aligned on (Sample ID, protein group).
pred_val[model_selected] = df_imputed.stack()
pred_val
observed | VAE | ||
---|---|---|---|
Sample ID | protein group | ||
2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070 | ABHD12 | 26.389 | 26.197 |
ABI1 | 26.321 | 26.613 | |
ACACA | 29.997 | 30.802 | |
ACIN1 | 30.397 | 29.896 | |
ACTR1A | 29.517 | 29.938 | |
... | ... | ... | ... |
2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070 | ZNF598 | 27.412 | 28.527 |
ZNF787 | 26.053 | 25.885 | |
ZNF830 | 25.724 | 25.375 | |
ZPR1 | 28.973 | 28.451 | |
ZW10 | 29.425 | 28.760 |
18749 rows × 2 columns
# Compare the prediction column(s) of ``pred_val`` with the 'observed' column;
# the rendered result of this run lists MAE, MSE, N and prop per model.
# NOTE(review): the function is spelled ``calculte_metrics`` in this call —
# presumably the actual name in vaep.models; verify before "correcting" it.
val_metrics = vaep.models.calculte_metrics(pred_val, 'observed')
# val_metrics = metrics.add_metrics(
# pred_val, key='test data')
# val_metrics = pd.DataFrame(val_metrics)
# val_metrics
pd.DataFrame(val_metrics)
VAE | |
---|---|
MAE | 0.502 |
MSE | 0.519 |
N | 18,749.000 |
prop | 1.000 |
# Plot the validation error (MAE) in relation to the feature median, where the
# medians are computed per protein group (column-wise) on the training data.
fig, ax = plt.subplots(figsize=(8, 2))
ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(
    pred=pred_val,
    target_col='observed',
    feat_medians=train_X.median(),
    ax=ax,
    metric_name='MAE',
    palette=color_model_mapping
)
replace predicted values with validation data values
# Put the measured validation intensities back in place of their predictions.
# ``DataFrame.replace(val_X)`` does not do this: a DataFrame passed as
# ``to_replace`` is interpreted as a nested mapping
# {column: {index label: value}}, i.e. it looks for cells equal to a sample ID,
# which never match. Overwrite explicitly wherever a validation value exists;
# ``astype(float)`` keeps the result numeric even if ``val_X`` is object-dtype.
df_imputed = df_imputed.mask(val_X.notna(), val_X).astype(float)
# Only measured entries may define the "observed" index, so drop NaN explicitly
# (implicit in the legacy ``stack``, not in the newer implementation).
df = df.stack().dropna()  # long-format
df_imputed = df_imputed.stack()  # long-format
observed = df_imputed.loc[df.index]
imputed = df_imputed.loc[df_imputed.index.difference(df.index)]
# Two stacked histograms on a shared intensity range: measured intensities on
# top, intensities imputed by the selected autoencoder below.
fig, axes = plt.subplots(2, figsize=(8, 4))
min_max = vaep.plotting.data.get_min_max_iterable(
    [observed, imputed])
label_template = '{method} (N={n:,d})'

# (values, legend label prefix, bar color) per panel, top to bottom
panels = (
    (observed, 'measured', 'grey'),
    (imputed, f'{model_selected} imputed', color_model_mapping[model_selected]),
)
for ax, (values, method, color) in zip(axes, panels):
    ax, _ = vaep.plotting.data.plot_histogram_intensities(
        values,
        ax=ax,
        min_max=min_max,
        label=label_template.format(method=method, n=len(values)),
        color=color,
        alpha=1)
    _ = ax.legend()