import logging
from typing import List
import pandas as pd
import sklearn
import sklearn.pipeline
import torch
from sklearn import preprocessing
logger = logging.getLogger(__name__)
# TODO: the general transform and inverse_transform functions need to move somewhere else
# numpydoc-style "Returns" section which make_pandas_compatible appends to the
# docstrings of the patched ``transform`` and ``inverse_transform`` methods.
msg_return_docstring = """
Returns
-------
Y: array-like
    If X is a pandas DataFrame, Y will be a DataFrame with the initial
    Index and column Index objects.
"""
[docs]
def make_pandas_compatible(cls):
    """Patch transform and inverse_transform.

    Builds a subclass of ``cls`` (carrying the same ``__name__``) whose
    ``transform`` and ``inverse_transform`` attributes are the objects bound
    to these names in this module's namespace. ``msg_return_docstring`` is
    appended to the docstrings taken from ``cls``.

    Parameters
    ----------
    cls : type
        Class which has to provide both a ``transform`` and an
        ``inverse_transform`` attribute.

    Returns
    -------
    type
        Newly created subclass of ``cls`` with the two methods replaced.

    Raises
    ------
    ValueError
        If ``cls`` has no ``transform`` or no ``inverse_transform`` attribute.
    """
    # ? could become factory function, build args dictionary
    _fcts = ("transform", "inverse_transform")
    for _fct in _fcts:
        if not hasattr(cls, _fct):
            raise ValueError(f"no {_fct} method for {cls.__name__}")
    new_class = type(
        cls.__name__,
        (cls,),
        dict(transform=transform, inverse_transform=inverse_transform),
    )
    # NOTE(review): the same `transform` / `inverse_transform` objects are put
    # into every class created here, so the docstring assigned below is
    # overwritten on each call -- the most recently patched class wins.
    for _fct in _fcts:
        # __doc__ is None for undocumented methods and when docstrings are
        # stripped (``python -OO``); fall back to an empty string so the
        # concatenation cannot raise a TypeError.
        base_doc = getattr(cls, _fct).__doc__ or ""
        getattr(new_class, _fct).__doc__ = base_doc + msg_return_docstring
    return new_class
# ? Can this be a MixIn class?
# # this could be a class method
# @make_pandas_compatible
# class MinMaxScaler(preprocessing.MinMaxScaler):
# pass
# # look at fastcore to see if **kwargs could be replaced with original
# # arguments, see https://fastcore.fast.ai/meta.html#Metaprogramming
# # decorate()
# Subclass of sklearn's preprocessing.MinMaxScaler created by
# make_pandas_compatible, i.e. with transform and inverse_transform replaced.
MinMaxScaler = make_pandas_compatible(preprocessing.MinMaxScaler)
[docs]
class VaepPipeline:
"""Custom Pipeline combining a pandas.DataFrame and a sklearn.pipeline.Pipleine."""
def __init__(
    self,
    df_train: pd.DataFrame,
    encode: sklearn.pipeline.Pipeline,
    decode: List[str] = None,
):
    """Fit ``encode`` on ``df_train`` and set up the decoding pipeline.

    Parameters
    ----------
    df_train : pd.DataFrame
        pandas.DataFrame on which the ``encode`` pipeline is fitted. Its
        columns are stored as ``self.columns``, their number as ``self.M``.
    encode : sklearn.pipeline.Pipeline
        Pipeline to fit with ``df_train``. ``fit`` is called on the object
        passed in, i.e. it is not copied beforehand.
    decode : List[str], optional
        Names of steps of ``encode`` forming the decoding pipeline, in the
        order given. The step objects are shared with ``encode``.
        By default None (an empty list is treated the same way), i.e.
        ``encode`` itself is used for decoding.

    Raises
    ------
    KeyError
        If a name in ``decode`` is not a named step of ``encode``.
    """
    self.columns = df_train.columns
    self.M = len(df_train.columns)
    self.encode = encode
    self.encode.fit(df_train)
    if decode:
        named_steps = self.encode.named_steps
        try:
            # Collect the steps locally so self.decode is never left bound
            # to a partially filled list if a name cannot be resolved.
            steps = [(name, named_steps[name]) for name in decode]
        except KeyError as err:
            raise KeyError(
                f"decode refers to step {err} which is not a named step "
                "of the encode pipeline"
            ) from err
        self.decode = sklearn.pipeline.Pipeline(steps)
    else:
        self.decode = self.encode
# Option: single-dispatch based on type of X