From 11081e5b1aaacc3c7584a38cabee7c0f570a89ff Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Tue, 10 Mar 2026 11:06:51 -0500 Subject: [PATCH 1/7] feat: Add StringListBinarizer to encode multi-label strings and lists --- docs/api_doc/encoding/StringListBinarizer.rst | 5 + docs/api_doc/encoding/index.rst | 1 + .../encoding/StringListBinarizer.rst | 79 +++++ docs/user_guide/encoding/index.rst | 1 + feature_engine/encoding/__init__.py | 2 + .../encoding/string_list_binarizer.py | 271 ++++++++++++++++++ .../test_string_list_binarizer.py | 107 +++++++ 7 files changed, 466 insertions(+) create mode 100644 docs/api_doc/encoding/StringListBinarizer.rst create mode 100644 docs/user_guide/encoding/StringListBinarizer.rst create mode 100644 feature_engine/encoding/string_list_binarizer.py create mode 100644 tests/test_encoding/test_string_list_binarizer.py diff --git a/docs/api_doc/encoding/StringListBinarizer.rst b/docs/api_doc/encoding/StringListBinarizer.rst new file mode 100644 index 000000000..5b7112cfe --- /dev/null +++ b/docs/api_doc/encoding/StringListBinarizer.rst @@ -0,0 +1,5 @@ +StringListBinarizer +=================== + +.. autoclass:: feature_engine.encoding.StringListBinarizer + :members: diff --git a/docs/api_doc/encoding/index.rst b/docs/api_doc/encoding/index.rst index 5256753c4..ab448c4d1 100644 --- a/docs/api_doc/encoding/index.rst +++ b/docs/api_doc/encoding/index.rst @@ -37,6 +37,7 @@ input. DecisionTreeEncoder RareLabelEncoder StringSimilarityEncoder + StringListBinarizer Other categorical encoding libraries ------------------------------------ diff --git a/docs/user_guide/encoding/StringListBinarizer.rst b/docs/user_guide/encoding/StringListBinarizer.rst new file mode 100644 index 000000000..54dd538a9 --- /dev/null +++ b/docs/user_guide/encoding/StringListBinarizer.rst @@ -0,0 +1,79 @@ +.. _string_list_binarizer: + +.. 
currentmodule:: feature_engine.encoding + +StringListBinarizer +=================== + +:class:`StringListBinarizer()` replaces categorical variables containing lists of strings +or comma-delimited strings with a set of binary variables (dummy variables) representing +each one of the unique tags or categories present across all observations. + +This transformer is particularly useful for handling multi-label categorical columns +where each row might have multiple values, such as ``"action, comedy"`` or +``"romance, thriller, action"``. The transformer splits these strings by a specified separator, +collects all unique tags, and then applies one-hot encoding on them. It can also natively +handle columns structured as Python lists, like ``["action", "comedy"]``. + +Python example +-------------- + +Let's look at an example. We generate a toy dataset with multi-label genre information +stored as comma-delimited strings: + +.. code:: python + + import pandas as pd + from feature_engine.encoding import StringListBinarizer + + X = pd.DataFrame(dict( + user_id = [1, 2, 3], + genres = ["action, comedy", "comedy", "action, thriller"] + )) + + print(X) + +.. code:: python + + user_id genres + 0 1 action, comedy + 1 2 comedy + 2 3 action, thriller + +Now, we set up the :class:`StringListBinarizer()`. Since our strings are separated by a +comma and a space, we specify ``separator=", "``. + +.. code:: python + + slb = StringListBinarizer( + variables=["genres"], + separator=", " + ) + + slb.fit(X) + +During `fit`, the encoder splits the strings, identifies the unique categories across +the entire dataset, and saves them in its `encoder_dict_` attribute. + +.. code:: python + + print(slb.encoder_dict_) + # {'genres': ['action', 'comedy', 'thriller']} + +We can now use `transform` to get the dummy variables. The original column is dropped by default. + +.. code:: python + + X_encoded = slb.transform(X) + print(X_encoded) + +.. 
code:: python + + user_id genres_action genres_comedy genres_thriller + 0 1 1 1 0 + 1 2 0 1 0 + 2 3 1 0 1 + +As we see, each row now has a 1 in the columns corresponding to the genres it originally contained, +and 0 otherwise. Unseen categories encountered during transform will simply be ignored (i.e. all +dummy columns will be 0 for those extra components). diff --git a/docs/user_guide/encoding/index.rst b/docs/user_guide/encoding/index.rst index 2b82c0a11..eccdc7500 100644 --- a/docs/user_guide/encoding/index.rst +++ b/docs/user_guide/encoding/index.rst @@ -431,6 +431,7 @@ Encoders :maxdepth: 1 OneHotEncoder + StringListBinarizer OrdinalEncoder CountFrequencyEncoder MeanEncoder diff --git a/feature_engine/encoding/__init__.py b/feature_engine/encoding/__init__.py index 3c689b7ad..cfdec4a68 100644 --- a/feature_engine/encoding/__init__.py +++ b/feature_engine/encoding/__init__.py @@ -9,6 +9,7 @@ from .ordinal import OrdinalEncoder from .rare_label import RareLabelEncoder from .similarity_encoder import StringSimilarityEncoder +from .string_list_binarizer import StringListBinarizer from .woe import WoEEncoder __all__ = [ @@ -18,6 +19,7 @@ "OneHotEncoder", "OrdinalEncoder", "RareLabelEncoder", + "StringListBinarizer", "StringSimilarityEncoder", "WoEEncoder", ] diff --git a/feature_engine/encoding/string_list_binarizer.py b/feature_engine/encoding/string_list_binarizer.py new file mode 100644 index 000000000..560bcd3b4 --- /dev/null +++ b/feature_engine/encoding/string_list_binarizer.py @@ -0,0 +1,271 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) +from 
feature_engine.dataframe_checks import ( + _check_optional_contains_na, + _check_X_matches_training_df, + check_X, +) +from feature_engine.tags import _return_tags +from feature_engine.variable_handling import ( + check_all_variables, + find_all_variables, +) + + +class StringListBinarizer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + StringListBinarizer() takes categorical variables that contain a list of strings + or a delimited string, and creates binary variables representing each of the + unique categories across all observations. + + This is especially useful for columns containing multiple tags per row, such as + `["action", "comedy"]` or `"action, comedy"`. + + The transformer takes a list of variables to encode, or automatically selects + all object/categorical columns if none are provided. + + The encodings are created by splitting the strings on a specified `separator` + (or parsing the lists directly), identifying the unique tags in the dataset, + and then adding a new boolean column `varname_tag` for each unique tag. + + Original columns are dropped after transformation by default. + + More details in the :ref:`User Guide <string_list_binarizer>`. + + Parameters + ---------- + variables : list, default=None + The list of categorical variables to encode. If None, the encoder will find and + select all categorical variables. + + separator : str, default="," + The separator used to split the strings in the variable. If the variable contains + Python lists instead of strings, this parameter is ignored. + + ignore_format : bool, default=False + Whether to format check the variables in `fit`. If `True`, the encoder will + ignore the variable types and proceed with encoding, provided the variables are + entered by the user. If `variables` is None, the target variables are all those + in the dataset regardless of type. If `False`, the encoder will select and + encode only categorical variables (type 'object' or 'categorical'). 
+ + Attributes + ---------- + variables_: + The list of variables to be transformed. + + encoder_dict_: + A dictionary mapping the variables to the sorted list of their unique tags. + + feature_names_in_: + List with the names of features seen during `fit`. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + Learn the unique tags per variable. + + fit_transform: + Fit to data, then transform it. + + transform: + Replace the original variable with the binary encoded variables. + + Examples + -------- + >>> import pandas as pd + >>> from feature_engine.encoding import StringListBinarizer + >>> X = pd.DataFrame(dict(tags=["action, comedy", "comedy", "action, thriller"])) + >>> slb = StringListBinarizer(variables=["tags"], separator=", ") + >>> slb.fit(X) + >>> slb.transform(X) + tags_action tags_comedy tags_thriller + 0 1 1 0 + 1 0 1 0 + 2 1 0 1 + """ + + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + separator: str = ",", + ignore_format: bool = False, + ) -> None: + + if not isinstance(separator, str): + raise ValueError( + f"separator takes only strings. Got {type(separator).__name__} instead." + ) + + if not isinstance(ignore_format, bool): + raise ValueError( + "ignore_format takes only booleans True and False. " + f"Got {ignore_format} instead." + ) + + self.variables = _check_variables_input_value(variables) + self.separator = separator + self.ignore_format = ignore_format + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + Learn the unique tags present in each categorical variable. + + Parameters + ---------- + + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas series, default=None + Target. It is not needed in this encoder. You can pass y or + None. 
+ """ + X = check_X(X) + + # select variables to encode + if self.ignore_format is True: + if self.variables is None: + self.variables_ = find_all_variables(X) + else: + self.variables_ = check_all_variables(X, self.variables) + else: + if self.variables is None: + # Need to use built-in categorical finder logic + self.variables_ = [ + var + for var in X.columns + if pd.api.types.is_object_dtype(X[var]) + or isinstance(X[var].dtype, pd.CategoricalDtype) + ] + if len(self.variables_) == 0: + raise ValueError( + "No categorical variables found in the dataframe. Please check " + "the variables format or set `ignore_format=True`." + ) + else: + self.variables_ = _check_variables_input_value(self.variables) + + # Check that specified variables exist and are object/categorical + non_cat = [ + var for var in self.variables_ + if var in X.columns and not ( + pd.api.types.is_object_dtype(X[var]) or + isinstance(X[var].dtype, pd.CategoricalDtype) + ) + ] + if non_cat: + raise TypeError( + f"Some of the variables are not categorical. Please cast them " + f"as object or categorical before calling fit, or set " + f"`ignore_format=True`. Variables: {non_cat}" + ) + + _check_optional_contains_na(X, self.variables_) + + self.encoder_dict_ = {} + + for var in self.variables_: + unique_tags = set() + for row in X[var]: + if isinstance(row, str): + tags = [t.strip() for t in row.split(self.separator)] + elif isinstance(row, list): + tags = [str(t).strip() for t in row] + else: + tags = [str(row).strip()] + unique_tags.update(tags) + + # Remove empty strings from tags (often caused by trailing separators) + unique_tags.discard("") + + self.encoder_dict_[var] = sorted(list(unique_tags)) + + self.feature_names_in_ = X.columns.tolist() + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Replace the categorical variables by the binary encoded variables. 
+ + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: pandas dataframe. + The transformed dataframe. The shape of the dataframe will differ from + the original, as it replaces the original list/string columns with multiple + dummy columns. + """ + check_is_fitted(self) + + X = check_X(X) + _check_X_matches_training_df(X, self.n_features_in_) + _check_optional_contains_na(X, self.variables_) + + X_transformed = X[self.feature_names_in_].copy() + + for feature in self.variables_: + categories = self.encoder_dict_[feature] + + # Use faster numpy processing for dummies + dummy_data = {f"{feature}_{category}": np.zeros(len(X), dtype=int) for category in categories} + + for i, row in enumerate(X[feature]): + if isinstance(row, str): + tags = [t.strip() for t in row.split(self.separator)] + elif isinstance(row, list): + tags = [str(t).strip() for t in row] + else: + tags = [str(row).strip()] + + for t in tags: + if t in categories: + dummy_data[f"{feature}_{t}"][i] = 1 + + dummy_df = pd.DataFrame(dummy_data, index=X.index) + X_transformed = pd.concat([X_transformed, dummy_df], axis=1) + + # drop original variables + X_transformed.drop(labels=self.variables_, axis=1, inplace=True) + + return X_transformed + + def get_feature_names_out(self, input_features=None) -> List[str]: + """Get output feature names for transformation.""" + check_is_fitted(self) + + feature_names = list(self.feature_names_in_) + feature_names = [f for f in feature_names if f not in self.variables_] + + for feature in self.variables_: + for category in self.encoder_dict_[feature]: + feature_names.append(f"{feature}_{category}") + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["variables"] = "categorical" + tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA" + return tags_dict diff --git a/tests/test_encoding/test_string_list_binarizer.py 
b/tests/test_encoding/test_string_list_binarizer.py new file mode 100644 index 000000000..03567c892 --- /dev/null +++ b/tests/test_encoding/test_string_list_binarizer.py @@ -0,0 +1,107 @@ +import pandas as pd +import pytest +from sklearn.exceptions import NotFittedError + +from feature_engine.encoding import StringListBinarizer + +def test_string_list_binarizer_delimited_strings(): + df = pd.DataFrame({ + "tags": ["action, comedy", "comedy", "action, thriller"], + "other": [1, 2, 3] + }) + + expected_df = pd.DataFrame({ + "other": [1, 2, 3], + "tags_action": [1, 0, 1], + "tags_comedy": [1, 1, 0], + "tags_thriller": [0, 0, 1] + }) + + encoder = StringListBinarizer(variables=["tags"], separator=",") + X = encoder.fit_transform(df) + + assert encoder.variables_ == ["tags"] + assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]} + pd.testing.assert_frame_equal(X, expected_df) + +def test_string_list_binarizer_python_lists(): + df = pd.DataFrame({ + "tags": [["action", "comedy"], ["comedy"], ["action", "thriller"]], + "other": [1, 2, 3] + }) + + expected_df = pd.DataFrame({ + "other": [1, 2, 3], + "tags_action": [1, 0, 1], + "tags_comedy": [1, 1, 0], + "tags_thriller": [0, 0, 1] + }) + + encoder = StringListBinarizer(variables=["tags"]) + X = encoder.fit_transform(df) + + assert encoder.variables_ == ["tags"] + assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]} + pd.testing.assert_frame_equal(X, expected_df) + +def test_find_categorical_variables(): + df = pd.DataFrame({ + "tags": ["A,B", "C"], + "num": [1, 2] + }) + + encoder = StringListBinarizer(variables=None, separator=",") + encoder.fit(df) + + assert encoder.variables_ == ["tags"] + +def test_ignore_format(): + df = pd.DataFrame({ + "tags": ["A,B", "C"], + "num": ["1", "2"] # Treated as object but maybe we want to encode it + }) + + encoder = StringListBinarizer(variables=["num"], ignore_format=True) + encoder.fit(df) + + assert encoder.variables_ == ["num"] + assert 
encoder.encoder_dict_ == {"num": ["1", "2"]} + +def test_error_if_not_categorical(): + df = pd.DataFrame({ + "num": [1, 2] + }) + encoder = StringListBinarizer(variables=["num"]) + with pytest.raises(TypeError): + encoder.fit(df) + +def test_missing_values_error(): + df = pd.DataFrame({ + "tags": ["A,B", float('nan')] + }) + encoder = StringListBinarizer(variables=["tags"]) + with pytest.raises(ValueError): + encoder.fit(df) + +def test_not_fitted_error(): + df = pd.DataFrame({"tags": ["A,B"]}) + encoder = StringListBinarizer() + with pytest.raises(NotFittedError): + encoder.transform(df) + +def test_unseen_categories(): + df_train = pd.DataFrame({"tags": ["A,B", "C"]}) + df_test = pd.DataFrame({"tags": ["A,D", "B,C,E"]}) + + encoder = StringListBinarizer(variables=["tags"], separator=",") + encoder.fit(df_train) + X = encoder.transform(df_test) + + # Expect D and E to be ignored (columns for A, B, C only) + expected_df = pd.DataFrame({ + "tags_A": [1, 0], + "tags_B": [0, 1], + "tags_C": [0, 1] + }) + + pd.testing.assert_frame_equal(X, expected_df) From 5fc5bf5df9dd13fe79a341aa61935bcbc80ca4e3 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Wed, 11 Mar 2026 16:32:48 -0500 Subject: [PATCH 2/7] fix: address flake8 for StringListBinarizer --- .../encoding/string_list_binarizer.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/feature_engine/encoding/string_list_binarizer.py b/feature_engine/encoding/string_list_binarizer.py index 560bcd3b4..6479223e2 100644 --- a/feature_engine/encoding/string_list_binarizer.py +++ b/feature_engine/encoding/string_list_binarizer.py @@ -29,7 +29,7 @@ class StringListBinarizer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMix StringListBinarizer() takes categorical variables that contain a list of strings or a delimited string, and creates binary variables representing each of the unique categories across all observations. 
- + This is especially useful for columns containing multiple tags per row, such as `["action", "comedy"]` or `"action, comedy"`. @@ -137,7 +137,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): None. """ X = check_X(X) - + # select variables to encode if self.ignore_format is True: if self.variables is None: @@ -160,20 +160,23 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): ) else: self.variables_ = _check_variables_input_value(self.variables) - + # Check that specified variables exist and are object/categorical non_cat = [ - var for var in self.variables_ - if var in X.columns and not ( - pd.api.types.is_object_dtype(X[var]) or - isinstance(X[var].dtype, pd.CategoricalDtype) + var + for var in self.variables_ + if var in X.columns + and not ( + pd.api.types.is_object_dtype(X[var]) + or isinstance(X[var].dtype, pd.CategoricalDtype) ) ] if non_cat: raise TypeError( - f"Some of the variables are not categorical. Please cast them " - f"as object or categorical before calling fit, or set " - f"`ignore_format=True`. Variables: {non_cat}" + "Some of the variables are not categorical. Please cast them " + "as object or categorical before calling fit, or set " + "`ignore_format=True`. 
Variables: " + f"{non_cat}" ) _check_optional_contains_na(X, self.variables_) @@ -190,10 +193,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): else: tags = [str(row).strip()] unique_tags.update(tags) - + # Remove empty strings from tags (often caused by trailing separators) unique_tags.discard("") - + self.encoder_dict_[var] = sorted(list(unique_tags)) self.feature_names_in_ = X.columns.tolist() @@ -227,10 +230,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: for feature in self.variables_: categories = self.encoder_dict_[feature] - + # Use faster numpy processing for dummies - dummy_data = {f"{feature}_{category}": np.zeros(len(X), dtype=int) for category in categories} - + dummy_data = { + f"{feature}_{category}": np.zeros(len(X), dtype=int) + for category in categories + } + for i, row in enumerate(X[feature]): if isinstance(row, str): tags = [t.strip() for t in row.split(self.separator)] @@ -238,7 +244,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: tags = [str(t).strip() for t in row] else: tags = [str(row).strip()] - + for t in tags: if t in categories: dummy_data[f"{feature}_{t}"][i] = 1 @@ -254,10 +260,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: def get_feature_names_out(self, input_features=None) -> List[str]: """Get output feature names for transformation.""" check_is_fitted(self) - + feature_names = list(self.feature_names_in_) feature_names = [f for f in feature_names if f not in self.variables_] - + for feature in self.variables_: for category in self.encoder_dict_[feature]: feature_names.append(f"{feature}_{category}") From 8e67332b1a4975d64a61aad6f455ce538c9e9885 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Wed, 11 Mar 2026 16:38:54 -0500 Subject: [PATCH 3/7] fix: support pandas string dtype in StringListBinarizer --- .../encoding/string_list_binarizer.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/feature_engine/encoding/string_list_binarizer.py 
b/feature_engine/encoding/string_list_binarizer.py index 6479223e2..79a3b7527 100644 --- a/feature_engine/encoding/string_list_binarizer.py +++ b/feature_engine/encoding/string_list_binarizer.py @@ -51,8 +51,9 @@ class StringListBinarizer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMix select all categorical variables. separator : str, default="," - The separator used to split the strings in the variable. If the variable contains - Python lists instead of strings, this parameter is ignored. + The separator used to split the strings in the variable. + If the variable contains Python lists instead of strings, + this parameter is ignored. ignore_format : bool, default=False Whether to format check the variables in `fit`. If `True`, the encoder will @@ -146,13 +147,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.variables_ = check_all_variables(X, self.variables) else: if self.variables is None: - # Need to use built-in categorical finder logic - self.variables_ = [ - var - for var in X.columns - if pd.api.types.is_object_dtype(X[var]) - or isinstance(X[var].dtype, pd.CategoricalDtype) - ] + # Select typical categorical/string-like variables + self.variables_ = X.select_dtypes( + include=["object", "category", "string"] + ).columns.to_list() if len(self.variables_) == 0: raise ValueError( "No categorical variables found in the dataframe. 
Please check " @@ -169,6 +167,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): and not ( pd.api.types.is_object_dtype(X[var]) or isinstance(X[var].dtype, pd.CategoricalDtype) + or pd.api.types.is_string_dtype(X[var]) ) ] if non_cat: From 3edebe0bba00c45789a0dbd42dcbb19152f68b8b Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Wed, 11 Mar 2026 16:44:53 -0500 Subject: [PATCH 4/7] chore: fix flake8 for StringListBinarizer tests --- .../test_string_list_binarizer.py | 120 ++++++++++-------- 1 file changed, 68 insertions(+), 52 deletions(-) diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py index 03567c892..e71e94897 100644 --- a/tests/test_encoding/test_string_list_binarizer.py +++ b/tests/test_encoding/test_string_list_binarizer.py @@ -4,104 +4,120 @@ from feature_engine.encoding import StringListBinarizer + def test_string_list_binarizer_delimited_strings(): - df = pd.DataFrame({ - "tags": ["action, comedy", "comedy", "action, thriller"], - "other": [1, 2, 3] - }) - - expected_df = pd.DataFrame({ - "other": [1, 2, 3], - "tags_action": [1, 0, 1], - "tags_comedy": [1, 1, 0], - "tags_thriller": [0, 0, 1] - }) - + df = pd.DataFrame( + { + "tags": ["action, comedy", "comedy", "action, thriller"], + "other": [1, 2, 3], + } + ) + + expected_df = pd.DataFrame( + { + "other": [1, 2, 3], + "tags_action": [1, 0, 1], + "tags_comedy": [1, 1, 0], + "tags_thriller": [0, 0, 1], + } + ) + encoder = StringListBinarizer(variables=["tags"], separator=",") X = encoder.fit_transform(df) - + assert encoder.variables_ == ["tags"] assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]} pd.testing.assert_frame_equal(X, expected_df) + + def test_string_list_binarizer_python_lists(): - df = pd.DataFrame({ - "tags": [["action", "comedy"], ["comedy"], ["action", "thriller"]], - "other": [1, 2, 3] - }) - - expected_df = pd.DataFrame({ - "other": [1, 2, 3], - "tags_action": [1, 0, 1], - 
"tags_comedy": [1, 1, 0], - "tags_thriller": [0, 0, 1] - }) - + df = pd.DataFrame( + { + "tags": [["action", "comedy"], ["comedy"], ["action", "thriller"]], + "other": [1, 2, 3], + } + ) + + expected_df = pd.DataFrame( + { + "other": [1, 2, 3], + "tags_action": [1, 0, 1], + "tags_comedy": [1, 1, 0], + "tags_thriller": [0, 0, 1], + } + ) + encoder = StringListBinarizer(variables=["tags"]) X = encoder.fit_transform(df) - + assert encoder.variables_ == ["tags"] assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]} pd.testing.assert_frame_equal(X, expected_df) + + def test_find_categorical_variables(): - df = pd.DataFrame({ - "tags": ["A,B", "C"], - "num": [1, 2] - }) - + df = pd.DataFrame({"tags": ["A,B", "C"], "num": [1, 2]}) + encoder = StringListBinarizer(variables=None, separator=",") encoder.fit(df) - + assert encoder.variables_ == ["tags"] + + def test_ignore_format(): - df = pd.DataFrame({ - "tags": ["A,B", "C"], - "num": ["1", "2"] # Treated as object but maybe we want to encode it - }) - + df = pd.DataFrame( + { + "tags": ["A,B", "C"], + "num": ["1", "2"], # Treated as object but maybe we want to encode it + } + ) + encoder = StringListBinarizer(variables=["num"], ignore_format=True) encoder.fit(df) - + assert encoder.variables_ == ["num"] assert encoder.encoder_dict_ == {"num": ["1", "2"]} + + def test_error_if_not_categorical(): - df = pd.DataFrame({ - "num": [1, 2] - }) + df = pd.DataFrame({"num": [1, 2]}) encoder = StringListBinarizer(variables=["num"]) with pytest.raises(TypeError): encoder.fit(df) + + def test_missing_values_error(): - df = pd.DataFrame({ - "tags": ["A,B", float('nan')] - }) + df = pd.DataFrame({"tags": ["A,B", float("nan")]}) encoder = StringListBinarizer(variables=["tags"]) with pytest.raises(ValueError): encoder.fit(df) + + def test_not_fitted_error(): df = pd.DataFrame({"tags": ["A,B"]}) encoder = StringListBinarizer() with pytest.raises(NotFittedError): encoder.transform(df) + + def test_unseen_categories(): 
df_train = pd.DataFrame({"tags": ["A,B", "C"]}) df_test = pd.DataFrame({"tags": ["A,D", "B,C,E"]}) - + encoder = StringListBinarizer(variables=["tags"], separator=",") encoder.fit(df_train) X = encoder.transform(df_test) - + # Expect D and E to be ignored (columns for A, B, C only) - expected_df = pd.DataFrame({ - "tags_A": [1, 0], - "tags_B": [0, 1], - "tags_C": [0, 1] - }) - + expected_df = pd.DataFrame( + {"tags_A": [1, 0], "tags_B": [0, 1], "tags_C": [0, 1]} + ) + pd.testing.assert_frame_equal(X, expected_df) From 9e308487e379d169b8d6fefc8be25b9351c0208e Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Wed, 11 Mar 2026 16:46:12 -0500 Subject: [PATCH 5/7] chore: normalize blank lines in StringListBinarizer tests --- tests/test_encoding/test_string_list_binarizer.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py index e71e94897..0e1ae0314 100644 --- a/tests/test_encoding/test_string_list_binarizer.py +++ b/tests/test_encoding/test_string_list_binarizer.py @@ -4,7 +4,6 @@ from feature_engine.encoding import StringListBinarizer - def test_string_list_binarizer_delimited_strings(): df = pd.DataFrame( { @@ -30,7 +29,6 @@ def test_string_list_binarizer_delimited_strings(): pd.testing.assert_frame_equal(X, expected_df) - def test_string_list_binarizer_python_lists(): df = pd.DataFrame( { @@ -56,7 +54,6 @@ def test_string_list_binarizer_python_lists(): pd.testing.assert_frame_equal(X, expected_df) - def test_find_categorical_variables(): df = pd.DataFrame({"tags": ["A,B", "C"], "num": [1, 2]}) @@ -66,7 +63,6 @@ def test_find_categorical_variables(): assert encoder.variables_ == ["tags"] - def test_ignore_format(): df = pd.DataFrame( { @@ -82,7 +78,6 @@ def test_ignore_format(): assert encoder.encoder_dict_ == {"num": ["1", "2"]} - def test_error_if_not_categorical(): df = pd.DataFrame({"num": [1, 2]}) encoder = StringListBinarizer(variables=["num"]) @@ -90,7 
+85,6 @@ def test_error_if_not_categorical(): encoder.fit(df) - def test_missing_values_error(): df = pd.DataFrame({"tags": ["A,B", float("nan")]}) encoder = StringListBinarizer(variables=["tags"]) @@ -98,7 +92,6 @@ def test_missing_values_error(): encoder.fit(df) - def test_not_fitted_error(): df = pd.DataFrame({"tags": ["A,B"]}) encoder = StringListBinarizer() @@ -106,7 +99,6 @@ def test_not_fitted_error(): encoder.transform(df) - def test_unseen_categories(): df_train = pd.DataFrame({"tags": ["A,B", "C"]}) df_test = pd.DataFrame({"tags": ["A,D", "B,C,E"]}) From 2842cca1adba128b0f2106e20914910715d8587f Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Wed, 11 Mar 2026 16:47:30 -0500 Subject: [PATCH 6/7] chore: add missing blank line before first StringListBinarizer test --- tests/test_encoding/test_string_list_binarizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py index 0e1ae0314..a2692de0b 100644 --- a/tests/test_encoding/test_string_list_binarizer.py +++ b/tests/test_encoding/test_string_list_binarizer.py @@ -4,6 +4,7 @@ from feature_engine.encoding import StringListBinarizer + def test_string_list_binarizer_delimited_strings(): df = pd.DataFrame( { From eae5f5e6af8c058323cf11c4f9225ed6b580cce7 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Wed, 11 Mar 2026 16:59:44 -0500 Subject: [PATCH 7/7] test: add coverage for StringListBinarizer (init validation, ignore_format paths, non-str/list rows, get_feature_names_out, _more_tags) --- .../test_string_list_binarizer.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py index a2692de0b..14a0114c9 100644 --- a/tests/test_encoding/test_string_list_binarizer.py +++ b/tests/test_encoding/test_string_list_binarizer.py @@ -114,3 +114,73 @@ def test_unseen_categories(): ) 
pd.testing.assert_frame_equal(X, expected_df) + + +def test_init_separator_not_str(): + with pytest.raises(ValueError, match="separator takes only strings"): + StringListBinarizer(variables=["tags"], separator=123) + + +def test_init_ignore_format_not_bool(): + with pytest.raises(ValueError, match="ignore_format takes only booleans"): + StringListBinarizer(variables=["tags"], ignore_format="yes") + + +def test_ignore_format_true_variables_none(): + """Fit with ignore_format=True and variables=None uses find_all_variables.""" + df = pd.DataFrame( + {"tags": ["a,b", "c"], "num": [1, 2], "other": ["x", "y"]} + ) + encoder = StringListBinarizer(separator=",", ignore_format=True) + encoder.fit(df) + assert set(encoder.variables_) == {"tags", "num", "other"} + X = encoder.transform(df) + assert list(X.columns) == encoder.get_feature_names_out() + + +def test_no_categorical_variables_raises(): + """Raise when variables=None and no object/category/string columns.""" + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + encoder = StringListBinarizer(variables=None) + with pytest.raises(ValueError, match="No categorical variables found"): + encoder.fit(df) + + +def test_fit_row_not_str_or_list(): + """Fit with a row that is neither str nor list (e.g. 
number) uses else branch.""" + df = pd.DataFrame({"tags": ["A,B", 42]}) + encoder = StringListBinarizer(variables=["tags"], separator=",") + encoder.fit(df) + assert "A" in encoder.encoder_dict_["tags"] + assert "B" in encoder.encoder_dict_["tags"] + assert "42" in encoder.encoder_dict_["tags"] + + +def test_transform_row_not_str_or_list(): + """Transform with non-str non-list row uses else branch.""" + df_train = pd.DataFrame({"tags": ["A", "B"]}) + encoder = StringListBinarizer(variables=["tags"]) + encoder.fit(df_train) + df_test = pd.DataFrame({"tags": [123]}) + X = encoder.transform(df_test) + assert "tags_A" in X.columns + assert "tags_B" in X.columns + + +def test_get_feature_names_out(): + """get_feature_names_out returns binarized feature names in order.""" + df = pd.DataFrame( + {"x": [1, 2], "tags": ["a,b", "c"], "y": [3, 4]} + ) + encoder = StringListBinarizer(variables=["tags"], separator=",") + encoder.fit(df) + names = encoder.get_feature_names_out() + assert names == ["x", "y", "tags_a", "tags_b", "tags_c"] + + +def test_more_tags(): + """_more_tags returns expected sklearn config.""" + encoder = StringListBinarizer(variables=["tags"]) + tags = encoder._more_tags() + assert tags["variables"] == "categorical" + assert "check_estimators_nan_inf" in tags["_xfail_checks"]