Source code for pimmslearn.io.datasets

from typing import Tuple

import numpy as np
import pandas as pd
import sklearn.pipeline
import torch
from torch.utils.data import Dataset

# Floating-point dtype that torch reports as its default at the moment this
# module is imported; `to_tensor` casts every converted Series to this dtype.
DEFAULT_DTYPE = torch.get_default_dtype()


class PeptideDatasetInMemory(Dataset):
    """Peptide dataset held completely in memory as torch tensors.

    Indexing yields a ``(mask, peptides, y)`` triple for the requested
    position(s).
    """

    # Shared NaN scalar, broadcast by ``torch.where`` when blanking entries.
    nan = torch.tensor(float("NaN"))

    def __init__(self, data: np.array, mask: np.array = None, fill_na=0.0):
        """Build the tensors served to a DataLoader.

        Parameters
        ----------
        data : np.array
            Peptide data, potentially containing missing values (NaN).
        mask : np.array, optional
            Array used (negated) as ``torch.where`` condition, by default None.
            Presumably boolean and shaped like `data` -- not checked here.
            If None, ``np.isnan(data)`` is used, so `y` equals `data` and only
            the NaNs in the input are replaced by `fill_na`.
            If given, `y` keeps the values where `mask` is False (NaN
            elsewhere), while the input keeps the values where `mask` is True
            (`fill_na` elsewhere).
        fill_na : float, optional
            Value written into the input wherever it is NaN, by default 0.0
        """
        values = torch.FloatTensor(data)
        mask_given = mask is not None
        self.mask = torch.from_numpy(mask if mask_given else np.isnan(data))

        unmasked = ~self.mask
        # Target: keep unmasked entries, blank out everything else.
        self.y = torch.where(unmasked, values, self.nan)

        if mask_given:
            # With an explicit mask the input is the complement of the target.
            values = torch.where(unmasked, self.nan, values)

        fill_value = torch.FloatTensor([fill_na])
        self.peptides = torch.where(values.isnan(), fill_value, values)
        self.length_ = len(self.peptides)

    def __len__(self):
        """Return the size of the first dimension of the input tensor."""
        return self.length_

    def __getitem__(self, idx):
        """Return ``(mask, peptides, y)`` sliced at `idx`."""
        return self.mask[idx], self.peptides[idx], self.y[idx]
def to_tensor(s: pd.Series) -> torch.Tensor:
    """Convert a pandas Series into a tensor of dtype ``DEFAULT_DTYPE``.

    Parameters
    ----------
    s : pd.Series
        Series whose ``.values`` array is handed to ``torch.from_numpy``.

    Returns
    -------
    torch.Tensor
        Tensor built from the Series values, cast to ``DEFAULT_DTYPE``.
    """
    values = s.values
    tensor = torch.from_numpy(values)
    return tensor.type(DEFAULT_DTYPE)
class DatasetWithMaskAndNoTarget(Dataset):
    """Dataset over the rows of a DataFrame, yielding ``(mask_isna, data)``.

    The missing-value mask is taken from the DataFrame *before* the optional
    transformer is applied.
    """

    def __init__(self, df: pd.DataFrame, transformer: sklearn.pipeline.Pipeline = None):
        """Store the NaN mask and the (optionally transformed) data.

        Parameters
        ----------
        df : pd.DataFrame
            Data, potentially with missing values.
        transformer : sklearn.pipeline.Pipeline, optional
            Object with a ``transform`` method applied to `df`, by default None.
            Its output is accessed with ``.iloc`` in ``__getitem__``, so it
            has to return a DataFrame-like object.

        Raises
        ------
        ValueError
            If `df` is not a pandas DataFrame.
        AttributeError
            If `transformer` is given but has no ``transform`` method.
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError(f"please pass a pandas DataFrame, not: {type(df) = }")
        # Mask is computed on the raw data, before any transformation.
        self.mask_isna = df.isna()
        self.columns = df.columns
        self.transformer = transformer

        if transformer is not None:
            if not hasattr(transformer, "transform"):
                raise AttributeError(
                    f"{type(transformer)} is not sklearn compatible, has no transform."
                )
            df = transformer.transform(df)

        self.data = df
        self.length_ = len(self.data)

    def __len__(self):
        """Return the number of rows of the stored data."""
        return self.length_

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(mask_isna, data)`` for the row at position `idx`.

        Both rows are selected positionally with ``.iloc`` and converted with
        ``to_tensor``.
        """
        mask_isna = to_tensor(self.mask_isna.iloc[idx])
        data = to_tensor(self.data.iloc[idx])
        return mask_isna, data
class DatasetWithTarget(DatasetWithMaskAndNoTarget):
    """Variant that additionally returns the data row as its own target."""

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(mask, data, target)`` where target is the same tensor as data."""
        item = super().__getitem__(idx)
        mask_isna, values = item
        # The target is the very same tensor object as the input (no copy).
        return mask_isna, values, values
class DatasetWithTargetSpecifyTarget(DatasetWithMaskAndNoTarget):
    """Dataset yielding ``(mask_isna, data, target)`` with separately given targets."""

    def __init__(
        self,
        df: pd.DataFrame,
        targets: pd.DataFrame,
        transformer: sklearn.pipeline.Pipeline = None,
    ):
        """Create a dataset for validation.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame, indexed as targets
        targets : pd.DataFrame
            Targets to use for evaluation, indexed as df. Its NaN pattern
            becomes the mask returned by ``__getitem__``.
        transformer : sklearn.pipeline.Pipeline, optional
            transformation pipeline to use, by default None. It is applied to
            both the data and the target; its output is accessed with
            ``.iloc``, so it has to return a DataFrame-like object.

        Raises
        ------
        ValueError
            If `df` is not a pandas DataFrame.
        AttributeError
            If `transformer` is given but has no ``transform`` method.
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError(f"please pass a pandas DataFrame, not: {type(df) = }")
        # Unlike the parent class, the mask comes from the targets, not from df.
        self.mask_isna = targets.isna()
        self.columns = df.columns
        self.transformer = transformer

        # Target: df with its missing entries filled from `targets`
        # (built from the untransformed df).
        self.target = df.fillna(targets)

        if transformer is not None:
            if not hasattr(transformer, "transform"):
                raise AttributeError(
                    f"{type(transformer)} is not sklearn compatible, has no transform."
                )
            df = transformer.transform(df)
            self.target = transformer.transform(self.target)

        self.data = df
        self.length_ = len(self.data)

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(mask_isna, data, target)`` for the row at position `idx`."""
        mask_isna, data = super().__getitem__(idx)
        target = to_tensor(self.target.iloc[idx])
        return mask_isna, data, target