vaep.io package#

class vaep.io.PathsList(files, folder)#

Bases: tuple

files#

Alias for field number 0

folder#

Alias for field number 1

vaep.io.add_indices(array: array, original_df: DataFrame, index_only: bool = False) DataFrame[source]#

Add indices to array using the provided original DataFrame.

Parameters:
  • array (np.array) – Array of data to add indices to.

  • original_df (pd.DataFrame) – Original DataFrame data was generated from.

  • index_only (bool, optional) – Only add row index, by default False

Returns:

DataFrame with array data and original indices.

Return type:

pd.DataFrame

vaep.io.dump_json(data_dict: dict, filename: Union[str, Path])[source]#

Dump dictionary as JSON.

Parameters:
  • data_dict (dict) – Dictionary with valid JSON entries to dump.

  • filename (Union[str, Path]) – Filepath to save dictionary as JSON.

vaep.io.dump_to_csv(df: DataFrame, folder: Path, outfolder: Path, parent_folder_fct=None) None[source]#
vaep.io.extend_name(fname: Union[str, Path], extend_by: str, ext: Optional[str] = None) Path[source]#

Extend the name of a file.

Parameters:
  • fname (Union[str, Path]) – Filepath to file to rename.

  • extend_by (str) – Extend file stem by string

Returns:

Changed filepath with extension

Return type:

Path

vaep.io.from_pickle(fname)[source]#
vaep.io.get_fname_from_keys(keys, folder='.', file_ext='.pkl', remove_duplicates=True)[source]#
vaep.io.load_json(fname: Union[str, Path]) dict[source]#

Load JSON from disc.

Parameters:

fname (Union[str, Path]) – Filepath to JSON on disk.

Returns:

Loaded JSON file.

Return type:

dict

vaep.io.parse_dict(input_dict: dict, types: ~typing.Tuple[~typing.Tuple] = ((<class 'pathlib.PurePath'>, <function <lambda>>), (<class 'numpy.ndarray'>, <function <lambda>>)))[source]#

Transform a set of items (instances) to their string representation

vaep.io.resolve_path(path: Union[str, Path], to: Union[str, Path] = '.') Path[source]#

Resolve a path partly overlapping with another path.

vaep.io.search_files(path='.', query='.txt')[source]#

Uses pathlib to find files, relative to path, whose filenames contain the query text. Returns the paths relative to the specified path.

Parameters:
  • path (str, optional) – Path to search, by default ‘.’

  • query (str, optional) – query string for filenames, by default ‘.txt’

Returns:

list with files as strings containing the query key.

Return type:

list

vaep.io.search_subfolders(path='.', depth: int = 1, exclude_root: bool = False)[source]#

Search subfolders relative to given path.

vaep.io.to_pickle(obj, fname)[source]#

Submodules#

vaep.io.dataloaders module#

class vaep.io.dataloaders.DataLoadersCreator(df_train: DataFrame, df_valid: DataFrame, scaler, DataSetClass: Dataset, batch_size: int)[source]#

Bases: object

DataLoader creator. For training or evaluation.

get_dls(shuffle_train: bool = True, **kwargs) Tuple[DataLoader, DataLoader][source]#
vaep.io.dataloaders.get_dls(train_X: DataFrame, valid_X: DataFrame, transformer: VaepPipeline, bs: int = 64, num_workers=0) DataLoaders[source]#

Create training and validation dataloaders

Parameters:
  • train_X (pandas.DataFrame) – Training Data, index is ignored for data fetching

  • valid_X (pandas.DataFrame) – Validation data, won’t be shuffled.

  • transformer (VaepPipeline) – Pipeline with separate encode and decode

  • bs (int, optional) – batch size, by default 64

  • num_workers (int, optional) – number of workers to use for data loading, by default 0

Returns:

FastAI DataLoaders with train and valid DataLoader

Return type:

fastai.data.core.DataLoaders

Example

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from vaep.dataloader import get_dls
from vaep.transform import VaepPipeline

dae_default_pipeline = sklearn.pipeline.Pipeline(
    [('normalize', StandardScaler()),
     ('impute', SimpleImputer(add_indicator=False))])

# train_X, val_X = None, None  # pandas.DataFrames
transforms = VaepPipeline(df_train=train_X,
                          encode=dae_default_pipeline,
                          decode=['normalize'])

dls = get_dls(train_X, val_X, transforms, bs=4)

vaep.io.dataloaders.get_test_dl(df: ~pandas.core.frame.DataFrame, transformer: ~vaep.transform.VaepPipeline, dataset: ~torch.utils.data.dataset.Dataset = <class 'vaep.io.datasets.DatasetWithTarget'>, bs: int = 64)[source]#

Create a fastai DataLoader for test data from a DataFrame.

Parameters:
Returns:

DataLoader from fastai for test data.

Return type:

fastai.data.load.DataLoader

vaep.io.datasets module#

class vaep.io.datasets.DatasetWithMaskAndNoTarget(df: DataFrame, transformer: Optional[Pipeline] = None)[source]#

Bases: Dataset

class vaep.io.datasets.DatasetWithTarget(df: DataFrame, transformer: Optional[Pipeline] = None)[source]#

Bases: DatasetWithMaskAndNoTarget

class vaep.io.datasets.DatasetWithTargetSpecifyTarget(df: DataFrame, targets: DataFrame, transformer: Optional[Pipeline] = None)[source]#

Bases: DatasetWithMaskAndNoTarget

class vaep.io.datasets.PeptideDatasetInMemory(data: array, mask: Optional[array] = None, fill_na=0.0)[source]#

Bases: Dataset

Peptide Dataset fully in memory.

nan = tensor(nan)#
vaep.io.datasets.to_tensor(s: Series) Tensor[source]#

vaep.io.datasplits module#

class vaep.io.datasplits.DataSplits(is_wide_format: 'bool', train_X: 'pd.DataFrame' = None, val_y: 'pd.DataFrame' = None, test_y: 'pd.DataFrame' = None)[source]#

Bases: object

dump(folder='data', file_format='csv') dict[source]#

dump in long format.

classmethod from_folder(folder: str, use_wide_format=False, file_format='csv') DataSplits[source]#

Build DataSplits instance from folder.

interpolate(dataset: Union[str, DataFrame])[source]#
is_wide_format: bool#
load(folder: str, use_wide_format=False, file_format='csv') None[source]#

Load data in place from folder

test_y: DataFrame = None#
to_long_format(name_values: str = 'intensity')[source]#
to_wide_format()[source]#
train_X: DataFrame = None#
val_y: DataFrame = None#
vaep.io.datasplits.load_freq(folder: str, file='freq_features.pkl')[source]#
vaep.io.datasplits.load_items(folder: str, items: dict, use_wide_format=False, file_format='csv') dict[source]#
vaep.io.datasplits.long_format(df: DataFrame, colname_values: str = 'intensity') DataFrame[source]#
vaep.io.datasplits.to_long_format(df: pd.DataFrame, *, colname_values: str = 'intensity') pd.DataFrame#
vaep.io.datasplits.wide_format(df: DataFrame, columns: str = 'Sample ID', name_values: str = 'intensity') DataFrame[source]#

vaep.io.format module#

class vaep.io.format.bcolors[source]#

Bases: object

Class of color codes for changing string representations in output.

Found: https://stackoverflow.com/a/287944/9684872

There are more options available:

BOLD = '\x1b[1m'#
ENDC = '\x1b[0m'#
FAIL = '\x1b[91m'#
HEADER = '\x1b[95m'#
OKBLUE = '\x1b[94m'#
OKCYAN = '\x1b[96m'#
OKGREEN = '\x1b[92m'#
UNDERLINE = '\x1b[4m'#
WARNING = '\x1b[93m'#
vaep.io.format.class_full_module(cls)[source]#

Return entire class name (repr notation) as str.

vaep.io.format.classname(obj)[source]#

Return entire object’s class name (repr notation) as str. Source: https://gist.github.com/clbarnes/edd28ea32010eb159b34b075687bb49e

Parameters:

obj (object) – any object

Returns:

Full class name with module name

Return type:

str

vaep.io.load module#

vaep.io.load.from_csv(fname: str, nrows: Optional[int] = None, index_col: Union[int, str, List] = 'Sample ID', verify_fname: bool = False, usecols=None, **kwargs)[source]#
vaep.io.load.from_pickle(fname: str, index_col: Union[int, str, List] = 'Sample ID', verify_fname: bool = False, usecols=None, **kwargs) DataFrame[source]#
vaep.io.load.verify_df(df: DataFrame, fname: str, index_col: str, verify_fname: bool = False, usecols=None)[source]#

vaep.io.types module#

papermill strategy to determine type see: nteract/papermill

vaep.io.types.resolve_type(value)[source]#