Source code for pimmslearn.io

import json
import logging
import pickle
from collections import namedtuple
from pathlib import Path, PurePath, PurePosixPath
from typing import Tuple, Union

import numpy as np
import pandas as pd

import pimmslearn.pandas

# Lightweight record with two fields: ``files`` and ``folder``.
# ``search_files`` in this module returns its result as a ``PathsList``.
PathsList = namedtuple("PathsList", ["files", "folder"])


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Runs at module level, i.e. once when the module is first imported.
logger.info(f"Calling from {__name__}")



[docs]
def search_files(path=".", query=".txt"):
    """Recursively collect entries below ``path`` whose name contains ``query``.

    Every entry found by a recursive glob below ``path`` is inspected, and
    those whose final path component contains ``query`` as a substring are
    kept.

    Parameters
    ----------
    path : str, optional
        Folder to search recursively, by default '.'
    query : str, optional
        Substring that has to occur in the entry name, by default '.txt'

    Returns
    -------
    PathsList
        Named tuple with ``files`` (list of matching paths as strings,
        relative to ``path``) and ``folder`` (``path`` as a ``Path``).
    """
    root = Path(path)
    matches = [
        str(entry.relative_to(root))
        for entry in root.rglob("*")
        if query in entry.name
    ]
    return PathsList(files=matches, folder=root)




[docs]
def search_subfolders(path=".", depth: int = 1, exclude_root: bool = False):
    """Collect subfolders of ``path`` level by level, down to ``depth`` levels.

    Folders whose path matches one of the excluded patterns
    (``*ipynb_checkpoints*``) are skipped, and therefore not descended into.

    Parameters
    ----------
    path : str or Path, optional
        Root folder to start from, by default '.'
    depth : int, optional
        Number of folder levels below ``path`` to include, by default 1.
        Has to be a strictly positive integer.
    exclude_root : bool, optional
        If True, ``path`` itself is left out of the result, by default False

    Returns
    -------
    list
        List of ``Path`` objects: the root first (unless excluded), followed
        by the folders of each level in the order they were discovered.

    Raises
    ------
    ValueError
        If ``depth`` is not a strictly positive integer.
    """
    # The check has to be negated as a whole: reject anything that is not
    # (an integer AND greater than zero).
    if not (isinstance(depth, int) and depth > 0):
        raise ValueError(f"Please provide an strictly positive integer, not {depth}")
    EXCLUDED = ["*ipynb_checkpoints*"]

    path = Path(path)
    directories = [path]

    def get_subfolders(path):
        return [
            x
            for x in path.iterdir()
            if x.is_dir() and not any(x.match(excl) for excl in EXCLUDED)
        ]

    # Breadth-first: each pass expands only the folders found in the pass before.
    current_level = [path]
    for _ in range(depth):
        next_level = [
            sub for parent in current_level for sub in get_subfolders(parent)
        ]
        directories.extend(next_level)
        current_level = next_level

    if exclude_root:
        # The root is always the first entry.
        directories.pop(0)
    return directories




[docs]
def resolve_path(path: Union[str, Path], to: Union[str, Path] = ".") -> Path:
    """Strip from ``path`` every component that also occurs in ``to``.

    ``to`` is made absolute first. Each component of ``path`` that is equal
    to any component of that absolute path is dropped; the remaining
    components are joined again in their original order.

    Parameters
    ----------
    path : Union[str, Path]
        Path to shorten.
    to : Union[str, Path], optional
        Reference path whose components are removed from ``path``,
        by default '.'

    Returns
    -------
    Path
        Path built from the components of ``path`` not found in ``to``.
    """
    reference_parts = Path(to).absolute().parts
    kept = []
    for part in Path(path).parts:
        if part in reference_parts:
            continue
        kept.append(part)
    return Path("/".join(kept))




[docs]
def get_fname_from_keys(keys, folder=".", file_ext=".pkl", remove_duplicates=True):
    """Build a file path inside ``folder`` from a sequence of keys.

    The keys are joined by a space, passed through
    ``pimmslearn.pandas.replace_with`` (with ``replace="- "`` and
    ``replace_with="_"``) and ``file_ext`` is appended.
    ``folder`` is created if it does not exist yet.

    Parameters
    ----------
    keys : iterable of str
        Keys that make up the file name.
    folder : str or Path, optional
        Folder the file is placed in, by default '.'
    file_ext : str, optional
        Extension appended to the name, by default '.pkl'
    remove_duplicates : bool, optional
        Drop repeated keys, keeping the first occurrence, by default True

    Returns
    -------
    Path
        Path to the file inside ``folder``.
    """
    if remove_duplicates:
        # dict keys keep insertion order, so this de-duplicates stably
        # (see https://stackoverflow.com/a/53657523/9684872)
        keys = list(dict.fromkeys(keys))
    out_dir = Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): presumably maps each of "-" and " " to "_" -- confirm
    # against pimmslearn.pandas.replace_with
    stem = pimmslearn.pandas.replace_with(
        " ".join(keys), replace="- ", replace_with="_"
    )
    return out_dir / f"{stem}{file_ext}"




[docs]
def dump_to_csv(
    df: pd.DataFrame, folder: Path, outfolder: Path, parent_folder_fct=None
) -> Path:
    """Write ``df`` to ``<outfolder>/<folder.stem>.csv`` and return that path.

    Parameters
    ----------
    df : pd.DataFrame
        Data to write.
    folder : Path
        Path whose stem is used as the name of the csv file.
    outfolder : Path
        Folder to write to. It is created if it does not exist.
    parent_folder_fct : callable, optional
        If given, it is called with ``folder`` and its result is appended to
        ``outfolder`` as a sub-folder, by default None

    Returns
    -------
    Path
        Path of the written csv file.
    """
    # Accept plain strings as well as Path objects.
    folder = Path(folder)
    outfolder = Path(outfolder)
    if parent_folder_fct is not None:
        outfolder = outfolder / parent_folder_fct(folder)
    outfolder.mkdir(exist_ok=True, parents=True)
    fname = outfolder / f"{folder.stem}.csv"
    logger.info(f"Dump to file: {fname}")
    df.to_csv(fname)
    return fname




[docs]
def dump_json(data_dict: dict, filename: Union[str, Path]):
    """Write a dictionary to a JSON file, indented by four spaces.

    Parameters
    ----------
    data_dict : dict
        Dictionary holding only JSON-serializable entries.
    filename : Union[str, Path]
        Target file. It is opened in write mode, so existing content is
        replaced.
    """
    with open(filename, "w") as handle:
        json.dump(data_dict, handle, indent=4)




[docs]
def to_pickle(obj, fname):
    """Serialize ``obj`` with pickle into the file ``fname``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    fname : str or Path
        Target file. It is opened in binary write mode, so existing content
        is replaced.
    """
    with open(fname, "wb") as handle:
        pickle.dump(obj, handle)




[docs]
def from_pickle(fname):
    """Load and return the object pickled in the file ``fname``.

    Unpickling can execute arbitrary code, so only open files that come
    from a trusted source.

    Parameters
    ----------
    fname : str or Path
        Pickle file to read.

    Returns
    -------
    object
        The unpickled object.
    """
    with open(fname, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded




[docs]
def load_json(fname: Union[str, Path]) -> dict:
    """Read a JSON file and return its parsed content.

    Parameters
    ----------
    fname : Union[str, Path]
        Path of the JSON file on disk.

    Returns
    -------
    dict
        Content of the JSON file.
    """
    with Path(fname).open() as handle:
        return json.load(handle)




[docs]
def parse_dict(
    input_dict: dict,
    types: Tuple[Tuple] = (
        (PurePath, lambda p: str(PurePosixPath(p))),
        (np.ndarray, lambda a: a.tolist()),
    ),
):
    """Return a copy of ``input_dict`` with values of given types converted.

    By default, path objects are turned into POSIX-style strings and numpy
    arrays into (nested) lists. All other values are passed through
    unchanged.

    Parameters
    ----------
    input_dict : dict
        Dictionary whose values should be converted. It is not modified.
    types : Tuple[Tuple], optional
        Pairs of ``(type, converter)``. A value that is an instance of
        ``type`` is replaced by ``converter(value)``. The pairs are applied
        in order, so a converted value can be picked up by a later pair.

    Returns
    -------
    dict
        New dictionary with the same keys and converted values.
    """
    d = dict()
    for k, v in input_dict.items():
        for old_type, fct in types:
            if isinstance(v, old_type):
                v = fct(v)
        d[k] = v
    return d




[docs]
def extend_name(
    fname: Union[str, Path], extend_by: str, ext: Union[str, None] = None
) -> Path:
    """Extend the name of a file.

    Parameters
    ----------
    fname : Union[str, Path]
        Filepath to file to rename.
    extend_by : str
        String appended to the file stem.
    ext : str, optional
        File extension of the new path, including the leading dot. By
        default None, which keeps the extension of ``fname``.

    Returns
    -------
    Path
        Changed filepath with extension.

    Raises
    ------
    ValueError
        If ``ext`` is not a valid suffix (e.g. it lacks the leading dot).
    """
    fname = Path(fname)
    if ext is None:
        ext = fname.suffix
    extended = fname.parent / f"{fname.stem}{extend_by}"
    # ``with_suffix`` treats everything after the last dot of the extended
    # name as the old suffix and would cut it off ("data.v1_val" -> "data").
    # It is therefore only called to validate ``ext``; the extension itself
    # is appended to the full name below.
    extended.with_suffix(ext)
    return extended.with_name(f"{extended.name}{ext}")




[docs]
def add_indices(
    array: np.array, original_df: pd.DataFrame, index_only: bool = False
) -> pd.DataFrame:
    """Wrap an array in a DataFrame labelled like the original DataFrame.

    Parameters
    ----------
    array : np.array
        Data to label.
    original_df : pd.DataFrame
        DataFrame providing the row index and, unless ``index_only`` is set,
        the column labels.
    index_only : bool, optional
        If True, only the row index is taken over and the columns get
        default labels, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame holding ``array`` with the labels of ``original_df``.
    """
    column_labels = None if index_only else original_df.columns
    return pd.DataFrame(array, index=original_df.index, columns=column_labels)