Source code for skbel.preprocessing.composite

import numpy as np
from sklearn.base import BaseEstimator, MultiOutputMixin, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted

"""
Collection of classes to combine multiple-features transformation/dimension reduction.
The classes below take a base scikit-learn object, sequentially apply the desired algorithm to each set of features
that is part of the same dataset, and combine the results.
Scikit-Learn does implement its own "column_transformer", but it is not supported by pipelines, and does not have an
"inverse_transform" method. The code here solves these shortcomings.
"""

__all__ = ["CompositePCA", "CompositeTransformer", "Dummy"]


class CompositePCA(TransformerMixin, BaseEstimator):
    """Apply an independent PCA to each dataset of a list and concatenate the scores.

    One ``PCA`` object is created per entry of ``n_components``; the i-th PCA
    object is fitted on, and applied to, the i-th dataset of the input list.
    """

    def __init__(self, n_components: list, scale: bool = False):
        """Initiate the class by specifying a list of number of components to keep for each dataset.

        :param n_components: list of number of components to keep for each dataset.
            A value that is not a list is wrapped into a one-element list.
        :param scale: whether to standardize the PCA scores of each dataset in
            ``transform`` (the scaling is applied after the PCA, not before)
        """
        if not isinstance(n_components, list):
            n_components = [n_components]
        self.n_components = n_components
        self.scale = scale
        # One PCA object per dataset, in the same order as n_components.
        self.pca_objects = [PCA(n_components=n) for n in self.n_components]

    @staticmethod
    def _as_dataset_list(Xc) -> list:
        """Wrap a single dataset into a one-element list; return lists unchanged."""
        return Xc if isinstance(Xc, list) else [Xc]

    def fit(self, Xc: list, yc=None, **fit_params):
        """Fit all PCA objects for the different datasets with their specified n_components.

        :param Xc: list of datasets (a single dataset is also accepted)
        :param yc: Only here to satisfy the scikit-learn API
        :return: self
        """
        Xc = self._as_dataset_list(Xc)
        for i, pca in enumerate(self.pca_objects):
            pca.fit(Xc[i], yc)
        return self

    def transform(self, Xc: list, yc=None, **fit_params) -> np.ndarray:
        """Transform all datasets and concatenate the scores column-wise.

        :param Xc: list of datasets (a single dataset is also accepted)
        :param yc: Only here to satisfy the scikit-learn API
        :return: array of the concatenated PCA scores (concatenation along axis 1)
        """
        Xc = self._as_dataset_list(Xc)
        for pca in self.pca_objects:
            check_is_fitted(pca)
        scores = [pca.transform(Xc[i]) for i, pca in enumerate(self.pca_objects)]
        if self.scale:
            # NOTE(review): the scaler is re-fitted on the scores passed to each
            # call, so the scaling statistics come from the data being
            # transformed and are not stored. Kept as-is to preserve existing
            # results; confirm whether the scaler should be fitted in ``fit``.
            scaler = StandardScaler()
            scores = [scaler.fit_transform(s) for s in scores]
        return np.concatenate(scores, axis=1)

    def fit_transform(self, Xc: list, yc=None, **fit_params):
        """Fit and transform all datasets.

        :param Xc: list of datasets (a single dataset is also accepted)
        :param yc: Only here to satisfy the scikit-learn API
        :return: concatenated output
        """
        Xc = self._as_dataset_list(Xc)
        return self.fit(Xc, yc).transform(Xc, yc)

    def inverse_transform(self, Xr: np.ndarray, yc=None, **fit_params) -> list:
        """Inverse transform concatenated scores back to the original spaces.

        The last axis of ``Xr`` is split into consecutive blocks, one per PCA
        object, and each block is passed to the matching ``inverse_transform``.

        :param Xr: concatenated scores; its last axis must hold the scores of
            all datasets in order (a single 1D sample or a 2D array of samples)
        :param yc: Only here to satisfy the scikit-learn API
        :return: list with one back-transformed array per dataset
        :raises ValueError: if the last axis of ``Xr`` does not match the total
            number of fitted components
        """
        for pca in self.pca_objects:
            check_is_fitted(pca)
        # The previous implementation wrapped Xr in a list and then called
        # ``.reshape`` on that list, which always raised AttributeError.
        Xr = np.atleast_1d(np.asarray(Xr))
        # Use the fitted component counts: the user-supplied n_components may be
        # None, a float or 'mle', none of which gives the width of the scores.
        widths = [int(pca.n_components_) for pca in self.pca_objects]
        total = sum(widths)
        if Xr.shape[-1] != total:
            raise ValueError(
                f"Expected {total} concatenated components, got {Xr.shape[-1]}."
            )
        # NOTE: scaling applied by ``transform`` when ``scale=True`` is not
        # undone here, because the scaling statistics are not stored.
        Xit = []
        start = 0
        for pca, width in zip(self.pca_objects, widths):
            stop = start + width
            Xit.append(pca.inverse_transform(Xr[..., start:stop]))
            start = stop
        return Xit
class CompositeTransformer(TransformerMixin, BaseEstimator):
    """Apply a separate instance of a base transformer to each dataset of a collection.

    A new ``base_function(**params)`` object is created for every dataset at
    ``fit`` time; results are returned as a list with one entry per dataset.
    """

    def __init__(self, base_function, **fit_params):
        """Initiate the class by specifying a base scikit-learn object and its parameters.

        :param base_function: callable (typically a transformer class) that is
            called once per dataset to build the transformer for that dataset
        :param fit_params: keyword arguments passed to ``base_function`` for
            every dataset
        """
        self.base_function = base_function
        self.t_objects = None  # populated by fit()
        self.params = fit_params

    def _require_fit(self):
        """Raise ``NotFittedError`` if ``fit`` has not been called yet."""
        if self.t_objects is None:
            # Local import keeps this block self-contained.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This CompositeTransformer instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using it."
            )

    def fit(self, Xc: list, yc=None, **fit_params):
        """Create and fit one transformer per dataset.

        :param Xc: list of datasets
        :param yc: Only here to satisfy the scikit-learn API
        :return: self
        """
        # One fresh transformer per dataset, all built with the same parameters.
        self.t_objects = [self.base_function(**self.params) for _ in Xc]
        for i, obj in enumerate(self.t_objects):
            obj.fit(Xc[i], yc)
        return self

    def transform(self, Xc: list, yc=None, **fit_params) -> list:
        """Transform each dataset with its own fitted transformer.

        :param Xc: list of datasets
        :param yc: Only here to satisfy the scikit-learn API
        :return: list of transformed datasets, one entry per dataset (the
            outputs are not concatenated)
        """
        # Without this guard an unfitted instance failed with an opaque
        # TypeError while iterating over ``None``.
        self._require_fit()
        for obj in self.t_objects:
            check_is_fitted(obj)
        return [obj.transform(Xc[i]) for i, obj in enumerate(self.t_objects)]

    def fit_transform(self, Xc: list, yc=None, **fit_params):
        """Fit and transform all datasets.

        :param Xc: list of datasets
        :param yc: Only here to satisfy the scikit-learn API
        :return: list of transformed datasets, one entry per dataset
        """
        return self.fit(Xc, yc).transform(Xc, yc)

    def inverse_transform(self, Xr: list, yc=None, **fit_params) -> list:
        """Inverse transform the data back to the original space.

        :param Xr: transformed data, indexable per dataset (``Xr[i]`` is the
            transformed data of the i-th dataset)
        :param yc: Only here to satisfy the scikit-learn API
        :return: list of back-transformed datasets
        """
        self._require_fit()
        # NOTE(review): every entry is reshaped to a single row before being
        # inverted, so this appears to expect one sample per dataset - confirm.
        return [
            obj.inverse_transform(Xr[i].reshape(1, -1))
            for i, obj in enumerate(self.t_objects)
        ]
class Dummy(TransformerMixin, MultiOutputMixin, BaseEstimator):
    """Dummy transformer that does nothing: inputs are handed back unchanged."""

    def __init__(self):
        # Attribute with a trailing underscore set at construction time;
        # presumably there so that fitted-state checks such as
        # ``check_is_fitted`` accept an instance that was never fitted.
        self.fake_fit_ = np.zeros(1)

    @staticmethod
    def _passthrough(X, y):
        """Return whichever of ``X``/``y`` is the only one given, else the pair."""
        if y is None and X is not None:
            return X
        if X is None and y is not None:
            return y
        # Both given, or both None: return them together as a tuple.
        return X, y

    def fit(self, X=None, y=None):
        """Do nothing and return self."""
        return self

    def transform(self, X=None, y=None):  # noqa
        """Return the input unchanged.

        :param X: optional data
        :param y: optional data
        :return: ``X`` if only ``X`` is given, ``y`` if only ``y`` is given,
            otherwise the tuple ``(X, y)``
        """
        return self._passthrough(X, y)

    def inverse_transform(self, X=None, y=None):  # noqa
        """Return the input unchanged (same rule as ``transform``).

        :param X: optional data
        :param y: optional data
        :return: ``X`` if only ``X`` is given, ``y`` if only ``y`` is given,
            otherwise the tuple ``(X, y)``
        """
        return self._passthrough(X, y)

    def fit_transform(self, X=None, y=None, **fit_params):
        """Call ``fit`` then ``transform``; ``fit_params`` are ignored."""
        fitted = self.fit(X, y)
        return fitted.transform(X, y)