pimmslearn.io package#

class pimmslearn.io.PathsList(files, folder)#

Bases: tuple

files#

Alias for field number 0

folder#

Alias for field number 1

pimmslearn.io.add_indices(array: array, original_df: DataFrame, index_only: bool = False) DataFrame[source]#

Add indices to array using the provided original DataFrame.

Parameters:
  • array (np.array) – Array of data to add indices to.

  • original_df (pd.DataFrame) – Original DataFrame data was generated from.

  • index_only (bool, optional) – Only add row index, by default False

Returns:

DataFrame with array data and original indices.

Return type:

pd.DataFrame

pimmslearn.io.dump_json(data_dict: dict, filename: str | Path)[source]#

Dump dictionary as JSON.

Parameters:
  • data_dict (dict) – Dictionary with valid JSON entries to dump.

  • filename (Union[str, Path]) – Filepath to save dictionary as JSON.

pimmslearn.io.dump_to_csv(df: DataFrame, folder: Path, outfolder: Path, parent_folder_fct=None) None[source]#
pimmslearn.io.extend_name(fname: str | Path, extend_by: str, ext: str | None = None) Path[source]#

Extend the name of a file.

Parameters:
  • fname (Union[str, Path]) – Filepath to file to rename.

  • extend_by (str) – Extend file stem by string

Returns:

Changed filepath with extension

Return type:

Path

pimmslearn.io.from_pickle(fname)[source]#
pimmslearn.io.get_fname_from_keys(keys, folder='.', file_ext='.pkl', remove_duplicates=True)[source]#
pimmslearn.io.load_json(fname: str | Path) dict[source]#

Load JSON from disk.

Parameters:

fname (Union[str, Path]) – Filepath to JSON on disk.

Returns:

Loaded JSON file.

Return type:

dict

pimmslearn.io.parse_dict(input_dict: dict, types: ~typing.Tuple[~typing.Tuple] = ((<class 'pathlib.PurePath'>, <function <lambda>>), (<class 'numpy.ndarray'>, <function <lambda>>)))[source]#

Transform a set of items (instances) to their string representation

pimmslearn.io.resolve_path(path: str | Path, to: str | Path = '.') Path[source]#

Resolve a path that partly overlaps with another path, to.

pimmslearn.io.search_files(path='.', query='.txt')[source]#

Uses pathlib to find files, relative to path, that have the query text in their file names. Returns the paths relative to the specified path.

Parameters:
  • path (str, optional) – Path to search, by default ‘.’

  • query (str, optional) – query string for filenames, by default ‘.txt’

Returns:

List with files as strings containing the query key.

Return type:

list

pimmslearn.io.search_subfolders(path='.', depth: int = 1, exclude_root: bool = False)[source]#

Search subfolders relative to given path.

pimmslearn.io.to_pickle(obj, fname)[source]#

Submodules#

pimmslearn.io.dataloaders module#

pimmslearn.io.dataloaders.get_dls(train_X: DataFrame, valid_X: DataFrame, transformer: VaepPipeline, bs: int = 64, num_workers=0) DataLoaders[source]#

Create training and validation dataloaders

Parameters:
  • train_X (pandas.DataFrame) – Training Data, index is ignored for data fetching

  • valid_X (pandas.DataFrame) – Validation data, won’t be shuffled.

  • transformer (VaepPipeline) – Pipeline with separate encode and decode

  • bs (int, optional) – batch size, by default 64

  • num_workers (int, optional) – number of workers to use for data loading, by default 0

Returns:

FastAI DataLoaders with train and valid DataLoader

Return type:

fastai.data.core.DataLoaders

Example

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from pimmslearn.io.dataloaders import get_dls
from pimmslearn.transform import VaepPipeline

dae_default_pipeline = sklearn.pipeline.Pipeline(
    [('normalize', StandardScaler()),
     ('impute', SimpleImputer(add_indicator=False))
     ])

# train_X, val_X = None, None  # pandas.DataFrames
transforms = VaepPipeline(df_train=train_X,
                          encode=dae_default_pipeline,
                          decode=['normalize'])

dls = get_dls(train_X, val_X, transforms, bs=4)

pimmslearn.io.dataloaders.get_test_dl(df: ~pandas.core.frame.DataFrame, transformer: ~pimmslearn.transform.VaepPipeline, dataset: ~torch.utils.data.dataset.Dataset = <class 'pimmslearn.io.datasets.DatasetWithTarget'>, bs: int = 64)[source]#

Create a DataLoader for test data.

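Parameters:
  • df (pandas.DataFrame) – Test data.

  • transformer (VaepPipeline) – Pipeline with separate encode and decode

  • dataset (torch.utils.data.Dataset, optional) – Dataset class to use, by default DatasetWithTarget

  • bs (int, optional) – batch size, by default 64

Returns: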

DataLoader from fastai for test data.

Return type:

fastai.data.load.DataLoader

pimmslearn.io.datasets module#

class pimmslearn.io.datasets.DatasetWithMaskAndNoTarget(df: DataFrame, transformer: Pipeline | None = None)[source]#

Bases: Dataset

class pimmslearn.io.datasets.DatasetWithTarget(df: DataFrame, transformer: Pipeline | None = None)[source]#

Bases: DatasetWithMaskAndNoTarget

class pimmslearn.io.datasets.DatasetWithTargetSpecifyTarget(df: DataFrame, targets: DataFrame, transformer: Pipeline | None = None)[source]#

Bases: DatasetWithMaskAndNoTarget

class pimmslearn.io.datasets.PeptideDatasetInMemory(data: array, mask: array | None = None, fill_na=0.0)[source]#

Bases: Dataset

Peptide Dataset fully in memory.

nan = tensor(nan)#
pimmslearn.io.datasets.to_tensor(s: Series) Tensor[source]#

pimmslearn.io.datasplits module#

class pimmslearn.io.datasplits.DataSplits(is_wide_format: 'bool', train_X: 'pd.DataFrame' = None, val_y: 'pd.DataFrame' = None, test_y: 'pd.DataFrame' = None)[source]#

Bases: object

dump(folder='data', file_format='csv') dict[source]#

Dump in long format.

classmethod from_folder(folder: str, use_wide_format=False, file_format='csv') DataSplits[source]#

Build DataSplits instance from folder.

interpolate(dataset: str | DataFrame)[source]#
is_wide_format: bool#
load(folder: str, use_wide_format=False, file_format='csv') None[source]#

Load data in place from folder

test_y: DataFrame = None#
to_long_format(name_values: str = 'intensity')[source]#
to_wide_format()[source]#
train_X: DataFrame = None#
val_y: DataFrame = None#
pimmslearn.io.datasplits.load_freq(folder: str, file='freq_features.pkl')[source]#
pimmslearn.io.datasplits.load_items(folder: str, items: dict, use_wide_format=False, file_format='csv') dict[source]#
pimmslearn.io.datasplits.long_format(df: DataFrame, colname_values: str = 'intensity') DataFrame[source]#
pimmslearn.io.datasplits.to_long_format(df: pd.DataFrame, *, colname_values: str = 'intensity') pd.DataFrame#
pimmslearn.io.datasplits.wide_format(df: DataFrame, columns: str = 'Sample ID', name_values: str = 'intensity') DataFrame[source]#

pimmslearn.io.format module#

class pimmslearn.io.format.bcolors[source]#

Bases: object

Class with color codes for changing string representations in output.

Found: https://stackoverflow.com/a/287944/9684872

There are more options available:

BOLD = '\x1b[1m'#
ENDC = '\x1b[0m'#
FAIL = '\x1b[91m'#
HEADER = '\x1b[95m'#
OKBLUE = '\x1b[94m'#
OKCYAN = '\x1b[96m'#
OKGREEN = '\x1b[92m'#
UNDERLINE = '\x1b[4m'#
WARNING = '\x1b[93m'#
pimmslearn.io.format.class_full_module(cls)[source]#

Return entire class name (repr notation) as str.

pimmslearn.io.format.classname(obj)[source]#

Return entire object’s class name (repr notation) as str. Source: https://gist.github.com/clbarnes/edd28ea32010eb159b34b075687bb49e

Parameters:

obj (object) – any object

Returns:

Full class name with module name

Return type:

str

pimmslearn.io.load module#

pimmslearn.io.load.from_csv(fname: str, nrows: int | None = None, index_col: int | str | List = 'Sample ID', verify_fname: bool = False, usecols=None, **kwargs)[source]#
pimmslearn.io.load.from_pickle(fname: str, index_col: int | str | List = 'Sample ID', verify_fname: bool = False, usecols=None, **kwargs) DataFrame[source]#
pimmslearn.io.load.verify_df(df: DataFrame, fname: str, index_col: str, verify_fname: bool = False, usecols=None)[source]#

pimmslearn.io.types module#

Papermill strategy to determine type; see: nteract/papermill

pimmslearn.io.types.resolve_type(value)[source]#