Source code for pimmslearn.utils
import pathlib
from typing import Union
import numpy as np
import pandas as pd
from pimmslearn.io.datasplits import long_format
[docs]
def append_to_filepath(
filepath: Union[pathlib.Path, str],
to_append: str,
sep: str = "_",
new_suffix: str = None,
) -> pathlib.Path:
"""Append filepath with specified to_append using a seperator.
Example: `data.csv` to data_processed.csv
"""
filepath = pathlib.Path(filepath)
suffix = filepath.suffix
if new_suffix:
suffix = f".{new_suffix}"
new_fp = filepath.parent / f"{filepath.stem}{sep}{to_append}{suffix}"
return new_fp
[docs]
def create_random_missing_data(
N, M, mean: float = 25.0, std_dev: float = 2.0, prop_missing: float = 0.15
):
data = np.random.normal(loc=mean, scale=std_dev, size=(N, M))
prop_missing = float(prop_missing)
if prop_missing > 0.0 and prop_missing < 1.0:
mask = np.random.choice(
[False, True], size=data.shape, p=[prop_missing, 1 - prop_missing]
)
data = np.where(mask, data, np.nan)
return data
[docs]
def create_random_missing_data_long(N: int, M: int, prop_missing=0.1):
"""Build example long"""
data = create_random_missing_data(N=N, M=M, prop_missing=prop_missing)
df_long = long_format(pd.DataFrame(data))
df_long.index.names = ("Sample ID", "peptide")
df_long.reset_index(inplace=True)
return df_long
[docs]
def create_random_df(
N: int,
M: int,
scaling_factor: float = 30.0,
prop_na: float = 0.0,
start_idx: int = 0,
name_index="Sample ID",
name_columns="peptide",
):
X = np.random.rand(N, M)
if prop_na > 0.0 and prop_na < 1.0:
mask = ~(X < prop_na)
X = np.where(mask, X, np.nan)
X *= scaling_factor
X = pd.DataFrame(
X,
index=[f"sample_{i:0{len(str(N))}}" for i in range(start_idx, start_idx + N)],
columns=(f"feat_{i:0{len(str(M))}}" for i in range(M)),
)
X.index.name = name_index
X.columns.name = name_columns
return X