Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 14 additions & 33 deletions feature_engine/_base_transformers/base_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,53 +28,34 @@ class BaseNumericalTransformer(
variable transformers, discretisers, math combination.
"""

def fit(self, X: pd.DataFrame) -> pd.DataFrame:
def _fit_setup(self, X: pd.DataFrame):
"""
Checks that input is a dataframe, finds numerical variables, or alternatively
checks that variables entered by the user are of type numerical.

Parameters
----------
X : Pandas DataFrame

y : Pandas Series, np.array. Default = None
Parameter is necessary for compatibility with sklearn Pipeline.

Raises
------
TypeError
If the input is not a Pandas DataFrame or a numpy array
If any of the user provided variables are not numerical
ValueError
If there are no numerical variables in the df or the df is empty
If the variable(s) contain null values

Returns
-------
X : Pandas DataFrame
The same dataframe entered as parameter
Check dataframe, find numerical variables, check for NA and Inf.
Returns the checked dataframe and the correctly identified numerical variables.
"""

# check input dataframe
X = check_X(X)

# find or check for numerical variables
if self.variables is None:
self.variables_ = find_numerical_variables(X)
variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
variables_ = check_numerical_variables(X, self.variables)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

# save input features
self.feature_names_in_ = X.columns.tolist()
return X, variables_

# save train set shape
def _get_feature_names_in(self, X):
"""Get the names and number of features in the train set (the dataframe
used during fit)."""

self.feature_names_in_ = X.columns.to_list()
self.n_features_in_ = X.shape[1]

return X
return self

def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Expand Down
23 changes: 11 additions & 12 deletions feature_engine/_base_transformers/mixins.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List, Union
from typing import Dict, List, Tuple, Union

import pandas as pd
from numpy import ndarray
Expand Down Expand Up @@ -46,7 +46,9 @@ def transform_x_y(self, X: pd.DataFrame, y: pd.Series):


class FitFromDictMixin:
def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame:
def _fit_from_dict(
self, X: pd.DataFrame, user_dict_: Dict
) -> Tuple[pd.DataFrame, List[Union[str, int]]]:
"""
Checks that input is a dataframe, checks that variables in the dictionary
entered by the user are of type numerical.
Expand All @@ -71,25 +73,22 @@ def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame:
-------
X : Pandas DataFrame
The same dataframe entered as parameter

variables_ : List
The variables in the dictionary.
"""
# check input dataframe
X = check_X(X)

# find or check for numerical variables
variables = list(user_dict_.keys())
self.variables_ = check_numerical_variables(X, variables)
variables_ = check_numerical_variables(X, variables)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)

# save input features
self.feature_names_in_ = X.columns.tolist()

# save train set shape
self.n_features_in_ = X.shape[1]
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

return X
return X, variables_


class GetFeatureNamesOutMixin:
Expand Down
12 changes: 8 additions & 4 deletions feature_engine/creation/cyclical_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,15 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
It is not needed in this transformer. You can pass y or None.
"""
if self.max_values is None:
X = super().fit(X)
self.max_values_ = X[self.variables_].max().to_dict()
X, variables_ = self._fit_setup(X)
max_values_ = X[variables_].max().to_dict()
else:
super()._fit_from_dict(X, self.max_values)
self.max_values_ = self.max_values
X, variables_ = super()._fit_from_dict(X, self.max_values)
max_values_ = self.max_values

self.variables_ = variables_
self.max_values_ = max_values_
self._get_feature_names_in(X)

return self

Expand Down
4 changes: 3 additions & 1 deletion feature_engine/discretisation/arbitrary.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
y is not needed in this transformer. You can pass y or None.
"""
# check input dataframe
X = super()._fit_from_dict(X, self.binning_dict)
X, variables_ = super()._fit_from_dict(X, self.binning_dict)

# for consistency with the rest of the discretisers, we add this attribute
self.variables_ = variables_
self.binner_dict_ = self.binning_dict
self._get_feature_names_in(X)

return self

Expand Down
11 changes: 7 additions & 4 deletions feature_engine/discretisation/decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def __init__(
self.param_grid = param_grid
self.random_state = random_state

def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
def fit(self, X: pd.DataFrame, y: pd.Series):
"""
Fit one decision tree per variable to discretize with cross-validation and
grid-search for hyperparameters.
Expand All @@ -241,7 +241,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
check_classification_targets(y)

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

if self.param_grid:
param_grid = self.param_grid
Expand All @@ -251,7 +251,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
binner_dict_ = {}
scores_dict_ = {}

for var in self.variables_:
for var in variables_:

if self.regression:
model = DecisionTreeRegressor(random_state=self.random_state)
Expand All @@ -269,7 +269,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
scores_dict_[var] = tree_model.score(X[var].to_frame(), y)

if self.bin_output != "prediction":
for var in self.variables_:
for var in variables_:
clf = binner_dict_[var].best_estimator_
threshold = clf.tree_.threshold
feature = clf.tree_.feature
Expand All @@ -280,6 +280,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore

self.binner_dict_ = binner_dict_
self.scores_dict_ = scores_dict_
self.variables_ = variables_
self._get_feature_names_in(X)

return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Expand Down
11 changes: 7 additions & 4 deletions feature_engine/discretisation/equal_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

self.binner_dict_ = {}
binner_dict_ = {}

for var in self.variables_:
for var in variables_:
tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates="drop")

# Prepend/Append infinities to accommodate outliers
bins = list(bins)
bins[0] = float("-inf")
bins[len(bins) - 1] = float("inf")
self.binner_dict_[var] = bins
binner_dict_[var] = bins

self.binner_dict_ = binner_dict_
self.variables_ = variables_
self._get_feature_names_in(X)
return self
11 changes: 7 additions & 4 deletions feature_engine/discretisation/equal_width.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

# fit
self.binner_dict_ = {}
binner_dict_ = {}

for var in self.variables_:
for var in variables_:
tmp, bins = pd.cut(
x=X[var],
bins=self.bins,
Expand All @@ -186,6 +186,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
bins = list(bins)
bins[0] = float("-inf")
bins[len(bins) - 1] = float("inf")
self.binner_dict_[var] = bins
binner_dict_[var] = bins

self.binner_dict_ = binner_dict_
self.variables_ = variables_
self._get_feature_names_in(X)
return self
12 changes: 8 additions & 4 deletions feature_engine/discretisation/geometric_width.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,19 +159,23 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

# fit
self.binner_dict_ = {}
binner_dict_ = {}

for var in self.variables_:
for var in variables_:
min_, max_ = X[var].min(), X[var].max()
increment = np.power(max_ - min_, 1.0 / self.bins)
bins = np.r_[
-np.inf, min_ + np.power(increment, np.arange(1, self.bins)), np.inf
]
bins = np.sort(bins)
bins = list(bins)
self.binner_dict_[var] = bins
binner_dict_[var] = bins

self.variables_ = variables_
self.binner_dict_ = binner_dict_
self._get_feature_names_in(X)

return self
12 changes: 7 additions & 5 deletions feature_engine/imputation/arbitrary_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,17 +149,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
# find or check for numerical variables
# create the imputer dictionary
if self.imputer_dict:
self.variables_ = check_numerical_variables(
variables_ = check_numerical_variables(
X, list(self.imputer_dict.keys())
)
self.imputer_dict_ = self.imputer_dict
imputer_dict_ = self.imputer_dict
else:
if self.variables is None:
self.variables_ = find_numerical_variables(X)
variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
self.imputer_dict_ = {var: self.arbitrary_number for var in self.variables_}
variables_ = check_numerical_variables(X, self.variables)
imputer_dict_ = {var: self.arbitrary_number for var in variables_}

self.variables_ = variables_
self.imputer_dict_ = imputer_dict_
self._get_feature_names_in(X)

return self
22 changes: 12 additions & 10 deletions feature_engine/imputation/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,22 +169,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
# select variables to encode
if self.ignore_format is True:
if self.variables is None:
self.variables_ = find_all_variables(X)
variables_ = find_all_variables(X)
else:
self.variables_ = check_all_variables(X, self.variables)
variables_ = check_all_variables(X, self.variables)
else:
if self.variables is None:
self.variables_ = find_categorical_variables(X)
variables_ = find_categorical_variables(X)
else:
self.variables_ = check_categorical_variables(X, self.variables)
variables_ = check_categorical_variables(X, self.variables)

if self.imputation_method == "missing":
self.imputer_dict_ = {var: self.fill_value for var in self.variables_}
imputer_dict_ = {var: self.fill_value for var in variables_}

elif self.imputation_method == "frequent":
# if imputing only 1 variable:
if len(self.variables_) == 1:
var = self.variables_[0]
if len(variables_) == 1:
var = variables_[0]
mode_vals = X[var].mode()

# Some variables may contain more than 1 mode:
Expand All @@ -193,13 +193,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
f"The variable {var} contains multiple frequent categories."
)

self.imputer_dict_ = {var: mode_vals[0]}
imputer_dict_ = {var: mode_vals[0]}

# imputing multiple variables:
else:
# Returns a dataframe with 1 row if there is one mode per
# variable, or more rows if there are more modes:
mode_vals = X[self.variables_].mode()
mode_vals = X[variables_].mode()

# Careful: some variables contain multiple modes
if len(mode_vals) > 1:
Expand All @@ -213,8 +213,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
f"categories."
)

self.imputer_dict_ = mode_vals.iloc[0].to_dict()
imputer_dict_ = mode_vals.iloc[0].to_dict()

self.variables_ = variables_
self.imputer_dict_ = imputer_dict_
self._get_feature_names_in(X)

return self
Expand Down
9 changes: 5 additions & 4 deletions feature_engine/imputation/drop_missing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# find variables for which indicator should be added
if self.variables is None:
self.variables_ = find_all_variables(X)
variables_ = find_all_variables(X)
else:
self.variables_ = check_all_variables(X, self.variables)
variables_ = check_all_variables(X, self.variables)

# If user passes a threshold, then missing_only is ignored:
if self.threshold is None and self.missing_only is True:
self.variables_ = [
var for var in self.variables_ if X[var].isnull().sum() > 0
variables_ = [
var for var in variables_ if X[var].isnull().sum() > 0
]

self.variables_ = variables_
self._get_feature_names_in(X)

return self
Expand Down
Loading