From 11081e5b1aaacc3c7584a38cabee7c0f570a89ff Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Tue, 10 Mar 2026 11:06:51 -0500
Subject: [PATCH 1/7] feat: Add StringListBinarizer to encode multi-label
 strings and lists

---
 docs/api_doc/encoding/StringListBinarizer.rst |   5 +
 docs/api_doc/encoding/index.rst               |   1 +
 .../encoding/StringListBinarizer.rst          |  79 +++++
 docs/user_guide/encoding/index.rst            |   1 +
 feature_engine/encoding/__init__.py           |   2 +
 .../encoding/string_list_binarizer.py         | 271 ++++++++++++++++++
 .../test_string_list_binarizer.py             | 107 +++++++
 7 files changed, 466 insertions(+)
 create mode 100644 docs/api_doc/encoding/StringListBinarizer.rst
 create mode 100644 docs/user_guide/encoding/StringListBinarizer.rst
 create mode 100644 feature_engine/encoding/string_list_binarizer.py
 create mode 100644 tests/test_encoding/test_string_list_binarizer.py

diff --git a/docs/api_doc/encoding/StringListBinarizer.rst b/docs/api_doc/encoding/StringListBinarizer.rst
new file mode 100644
index 000000000..5b7112cfe
--- /dev/null
+++ b/docs/api_doc/encoding/StringListBinarizer.rst
@@ -0,0 +1,5 @@
+StringListBinarizer
+===================
+
+.. autoclass:: feature_engine.encoding.StringListBinarizer
+    :members:
diff --git a/docs/api_doc/encoding/index.rst b/docs/api_doc/encoding/index.rst
index 5256753c4..ab448c4d1 100644
--- a/docs/api_doc/encoding/index.rst
+++ b/docs/api_doc/encoding/index.rst
@@ -37,6 +37,7 @@ input.
    DecisionTreeEncoder
    RareLabelEncoder
    StringSimilarityEncoder
+   StringListBinarizer
 
 Other categorical encoding libraries
 ------------------------------------
diff --git a/docs/user_guide/encoding/StringListBinarizer.rst b/docs/user_guide/encoding/StringListBinarizer.rst
new file mode 100644
index 000000000..54dd538a9
--- /dev/null
+++ b/docs/user_guide/encoding/StringListBinarizer.rst
@@ -0,0 +1,79 @@
+.. _string_list_binarizer:
+
+.. currentmodule:: feature_engine.encoding
+
+StringListBinarizer
+===================
+
+:class:`StringListBinarizer()` replaces categorical variables containing lists of strings
+or comma-delimited strings with a set of binary variables (dummy variables) representing
+each one of the unique tags or categories present across all observations.
+
+This transformer is particularly useful for handling multi-label categorical columns
+where each row might have multiple values, such as ``"action, comedy"`` or
+``"romance, thriller, action"``. The transformer splits these strings by a specified separator,
+collects all unique tags, and then applies one-hot encoding on them. It can also natively
+handle columns structured as Python lists, like ``["action", "comedy"]``.
+
+Python example
+--------------
+
+Let's look at an example. We generate a toy dataset with multi-label genre information
+stored as comma-delimited strings:
+
+.. code:: python
+
+    import pandas as pd
+    from feature_engine.encoding import StringListBinarizer
+
+    X = pd.DataFrame(dict(
+        user_id = [1, 2, 3],
+        genres = ["action, comedy", "comedy", "action, thriller"]
+    ))
+
+    print(X)
+
+.. code:: python
+
+       user_id            genres
+    0        1    action, comedy
+    1        2            comedy
+    2        3  action, thriller
+
+Now, we set up the :class:`StringListBinarizer()`. Since our strings are separated by a
+comma and a space, we specify ``separator=", "``.
+
+.. code:: python
+
+    slb = StringListBinarizer(
+        variables=["genres"],
+        separator=", "
+    )
+
+    slb.fit(X)
+
+During ``fit``, the encoder splits the strings, identifies the unique categories across
+the entire dataset, and saves them in its ``encoder_dict_`` attribute.
+
+.. code:: python
+
+    print(slb.encoder_dict_)
+    # {'genres': ['action', 'comedy', 'thriller']}
+
+We can now use `transform` to get the dummy variables. The original column is dropped by default.
+
+.. code:: python
+
+    X_encoded = slb.transform(X)
+    print(X_encoded)
+
+.. code:: python
+
+       user_id  genres_action  genres_comedy  genres_thriller
+    0        1              1              1               0
+    1        2              0              1               0
+    2        3              1              0               1
+
+As we see, each row now has a 1 in the columns corresponding to the genres it originally contained,
+and 0 otherwise. Categories that were not seen during ``fit`` are ignored by ``transform``: no new
+columns are created for them, and they do not set any of the existing dummy columns to 1.
diff --git a/docs/user_guide/encoding/index.rst b/docs/user_guide/encoding/index.rst
index 2b82c0a11..eccdc7500 100644
--- a/docs/user_guide/encoding/index.rst
+++ b/docs/user_guide/encoding/index.rst
@@ -431,6 +431,7 @@ Encoders
    :maxdepth: 1
 
    OneHotEncoder
+   StringListBinarizer
    OrdinalEncoder
    CountFrequencyEncoder
    MeanEncoder
diff --git a/feature_engine/encoding/__init__.py b/feature_engine/encoding/__init__.py
index 3c689b7ad..cfdec4a68 100644
--- a/feature_engine/encoding/__init__.py
+++ b/feature_engine/encoding/__init__.py
@@ -9,6 +9,7 @@
 from .ordinal import OrdinalEncoder
 from .rare_label import RareLabelEncoder
 from .similarity_encoder import StringSimilarityEncoder
+from .string_list_binarizer import StringListBinarizer
 from .woe import WoEEncoder
 
 __all__ = [
@@ -18,6 +19,7 @@
     "OneHotEncoder",
     "OrdinalEncoder",
     "RareLabelEncoder",
+    "StringListBinarizer",
     "StringSimilarityEncoder",
     "WoEEncoder",
 ]
diff --git a/feature_engine/encoding/string_list_binarizer.py b/feature_engine/encoding/string_list_binarizer.py
new file mode 100644
index 000000000..560bcd3b4
--- /dev/null
+++ b/feature_engine/encoding/string_list_binarizer.py
@@ -0,0 +1,271 @@
+# Authors: Ankit Hemant Lade (contributor)
+# License: BSD 3 clause
+
+from typing import List, Optional, Union
+
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+
+from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
+from feature_engine._check_init_parameters.check_variables import (
+    _check_variables_input_value,
+)
+from feature_engine.dataframe_checks import (
+    _check_optional_contains_na,
+    _check_X_matches_training_df,
+    check_X,
+)
+from feature_engine.tags import _return_tags
+from feature_engine.variable_handling import (
+    check_all_variables,
+    find_all_variables,
+)
+
+
+class StringListBinarizer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
+    """
+    StringListBinarizer() takes categorical variables that contain a list of strings
+    or a delimited string, and creates binary variables representing each of the
+    unique categories across all observations.
+    
+    This is especially useful for columns containing multiple tags per row, such as
+    `["action", "comedy"]` or `"action, comedy"`.
+
+    The transformer takes a list of variables to encode, or automatically selects
+    all object/categorical columns if none are provided.
+
+    The encodings are created by splitting the strings on a specified `separator`
+    (or parsing the lists directly), identifying the unique tags in the dataset,
+    and then adding a new binary (0/1) column `varname_tag` for each unique tag.
+
+    Original columns are dropped after transformation by default.
+
+    More details in the :ref:`User Guide <string_list_binarizer>`.
+
+    Parameters
+    ----------
+    variables : list, default=None
+        The list of categorical variables to encode. If None, the encoder will find and
+        select all categorical variables.
+
+    separator : str, default=","
+        The separator used to split the strings in the variable. If the variable contains
+        Python lists instead of strings, this parameter is ignored.
+
+    ignore_format : bool, default=False
+        Whether to format check the variables in `fit`. If `True`, the encoder will
+        ignore the variable types and proceed with encoding, provided the variables are
+        entered by the user. If `variables` is None, the target variables are all those
+        in the dataset regardless of type. If `False`, the encoder will select and
+        encode only categorical variables (type 'object' or 'categorical').
+
+    Attributes
+    ----------
+    variables_:
+        The list of variables to be transformed.
+
+    encoder_dict_:
+        A dictionary mapping the variables to the sorted list of their unique tags.
+
+    feature_names_in_:
+        List with the names of features seen during `fit`.
+
+    n_features_in_:
+        The number of features in the train set used in fit.
+
+    Methods
+    -------
+    fit:
+        Learn the unique tags per variable.
+
+    fit_transform:
+        Fit to data, then transform it.
+
+    transform:
+        Replace the original variable with the binary encoded variables.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from feature_engine.encoding import StringListBinarizer
+    >>> X = pd.DataFrame(dict(tags=["action, comedy", "comedy", "action, thriller"]))
+    >>> slb = StringListBinarizer(variables=["tags"], separator=", ")
+    >>> slb.fit(X)
+    >>> slb.transform(X)
+       tags_action  tags_comedy  tags_thriller
+    0            1            1              0
+    1            0            1              0
+    2            1            0              1
+    """
+
+    def __init__(
+        self,
+        variables: Union[None, int, str, List[Union[str, int]]] = None,
+        separator: str = ",",
+        ignore_format: bool = False,
+    ) -> None:
+
+        if not isinstance(separator, str):
+            raise ValueError(
+                f"separator takes only strings. Got {type(separator).__name__} instead."
+            )
+
+        if not isinstance(ignore_format, bool):
+            raise ValueError(
+                "ignore_format takes only booleans True and False. "
+                f"Got {ignore_format} instead."
+            )
+
+        self.variables = _check_variables_input_value(variables)
+        self.separator = separator
+        self.ignore_format = ignore_format
+
+    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
+        """
+        Learn the unique tags present in each categorical variable.
+
+        Parameters
+        ----------
+
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The training input samples.
+
+        y: pandas series, default=None
+            Target. It is not needed in this encoder. You can pass y or
+            None.
+        """
+        X = check_X(X)
+        
+        # select variables to encode
+        if self.ignore_format is True:
+            if self.variables is None:
+                self.variables_ = find_all_variables(X)
+            else:
+                self.variables_ = check_all_variables(X, self.variables)
+        else:
+            if self.variables is None:
+                # Need to use built-in categorical finder logic
+                self.variables_ = [
+                    var
+                    for var in X.columns
+                    if pd.api.types.is_object_dtype(X[var])
+                    or isinstance(X[var].dtype, pd.CategoricalDtype)
+                ]
+                if len(self.variables_) == 0:
+                    raise ValueError(
+                        "No categorical variables found in the dataframe. Please check "
+                        "the variables format or set `ignore_format=True`."
+                    )
+            else:
+                self.variables_ = _check_variables_input_value(self.variables)
+                
+                # Check that specified variables exist and are object/categorical
+                non_cat = [
+                    var for var in self.variables_ 
+                    if var in X.columns and not (
+                        pd.api.types.is_object_dtype(X[var]) or 
+                        isinstance(X[var].dtype, pd.CategoricalDtype)
+                    )
+                ]
+                if non_cat:
+                    raise TypeError(
+                        f"Some of the variables are not categorical. Please cast them "
+                        f"as object or categorical before calling fit, or set "
+                        f"`ignore_format=True`. Variables: {non_cat}"
+                    )
+
+        _check_optional_contains_na(X, self.variables_)
+
+        self.encoder_dict_ = {}
+
+        for var in self.variables_:
+            unique_tags = set()
+            for row in X[var]:
+                if isinstance(row, str):
+                    tags = [t.strip() for t in row.split(self.separator)]
+                elif isinstance(row, list):
+                    tags = [str(t).strip() for t in row]
+                else:
+                    tags = [str(row).strip()]
+                unique_tags.update(tags)
+            
+            # Remove empty strings from tags (often caused by trailing separators)
+            unique_tags.discard("")
+            
+            self.encoder_dict_[var] = sorted(list(unique_tags))
+
+        self.feature_names_in_ = X.columns.tolist()
+        self.n_features_in_ = X.shape[1]
+
+        return self
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        """
+        Replace the categorical variables by the binary encoded variables.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The data to transform.
+
+        Returns
+        -------
+        X_new: pandas dataframe.
+            The transformed dataframe. The shape of the dataframe will differ from
+            the original, as it replaces the original list/string columns with multiple
+            dummy columns.
+        """
+        check_is_fitted(self)
+
+        X = check_X(X)
+        _check_X_matches_training_df(X, self.n_features_in_)
+        _check_optional_contains_na(X, self.variables_)
+
+        X_transformed = X[self.feature_names_in_].copy()
+
+        for feature in self.variables_:
+            categories = self.encoder_dict_[feature]
+            
+            # Use faster numpy processing for dummies
+            dummy_data = {f"{feature}_{category}": np.zeros(len(X), dtype=int) for category in categories}
+            
+            for i, row in enumerate(X[feature]):
+                if isinstance(row, str):
+                    tags = [t.strip() for t in row.split(self.separator)]
+                elif isinstance(row, list):
+                    tags = [str(t).strip() for t in row]
+                else:
+                    tags = [str(row).strip()]
+                    
+                for t in tags:
+                    if t in categories:
+                        dummy_data[f"{feature}_{t}"][i] = 1
+
+            dummy_df = pd.DataFrame(dummy_data, index=X.index)
+            X_transformed = pd.concat([X_transformed, dummy_df], axis=1)
+
+        # drop original variables
+        X_transformed.drop(labels=self.variables_, axis=1, inplace=True)
+
+        return X_transformed
+
+    def get_feature_names_out(self, input_features=None) -> List[str]:
+        """Get output feature names for transformation."""
+        check_is_fitted(self)
+        
+        feature_names = list(self.feature_names_in_)
+        feature_names = [f for f in feature_names if f not in self.variables_]
+        
+        for feature in self.variables_:
+            for category in self.encoder_dict_[feature]:
+                feature_names.append(f"{feature}_{category}")
+
+        return feature_names
+
+    def _more_tags(self):
+        tags_dict = _return_tags()
+        tags_dict["variables"] = "categorical"
+        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
+        return tags_dict
diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py
new file mode 100644
index 000000000..03567c892
--- /dev/null
+++ b/tests/test_encoding/test_string_list_binarizer.py
@@ -0,0 +1,107 @@
+import pandas as pd
+import pytest
+from sklearn.exceptions import NotFittedError
+
+from feature_engine.encoding import StringListBinarizer
+
+def test_string_list_binarizer_delimited_strings():
+    df = pd.DataFrame({
+        "tags": ["action, comedy", "comedy", "action, thriller"],
+        "other": [1, 2, 3]
+    })
+    
+    expected_df = pd.DataFrame({
+        "other": [1, 2, 3],
+        "tags_action": [1, 0, 1],
+        "tags_comedy": [1, 1, 0],
+        "tags_thriller": [0, 0, 1]
+    })
+    
+    encoder = StringListBinarizer(variables=["tags"], separator=",")
+    X = encoder.fit_transform(df)
+    
+    assert encoder.variables_ == ["tags"]
+    assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]}
+    pd.testing.assert_frame_equal(X, expected_df)
+
+def test_string_list_binarizer_python_lists():
+    df = pd.DataFrame({
+        "tags": [["action", "comedy"], ["comedy"], ["action", "thriller"]],
+        "other": [1, 2, 3]
+    })
+    
+    expected_df = pd.DataFrame({
+        "other": [1, 2, 3],
+        "tags_action": [1, 0, 1],
+        "tags_comedy": [1, 1, 0],
+        "tags_thriller": [0, 0, 1]
+    })
+    
+    encoder = StringListBinarizer(variables=["tags"])
+    X = encoder.fit_transform(df)
+    
+    assert encoder.variables_ == ["tags"]
+    assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]}
+    pd.testing.assert_frame_equal(X, expected_df)
+
+def test_find_categorical_variables():
+    df = pd.DataFrame({
+        "tags": ["A,B", "C"],
+        "num": [1, 2]
+    })
+    
+    encoder = StringListBinarizer(variables=None, separator=",")
+    encoder.fit(df)
+    
+    assert encoder.variables_ == ["tags"]
+
+def test_ignore_format():
+    df = pd.DataFrame({
+        "tags": ["A,B", "C"],
+        "num": ["1", "2"] # Treated as object but maybe we want to encode it
+    })
+    
+    encoder = StringListBinarizer(variables=["num"], ignore_format=True)
+    encoder.fit(df)
+    
+    assert encoder.variables_ == ["num"]
+    assert encoder.encoder_dict_ == {"num": ["1", "2"]}
+
+def test_error_if_not_categorical():
+    df = pd.DataFrame({
+        "num": [1, 2]
+    })
+    encoder = StringListBinarizer(variables=["num"])
+    with pytest.raises(TypeError):
+        encoder.fit(df)
+
+def test_missing_values_error():
+    df = pd.DataFrame({
+        "tags": ["A,B", float('nan')]
+    })
+    encoder = StringListBinarizer(variables=["tags"])
+    with pytest.raises(ValueError):
+        encoder.fit(df)
+
+def test_not_fitted_error():
+    df = pd.DataFrame({"tags": ["A,B"]})
+    encoder = StringListBinarizer()
+    with pytest.raises(NotFittedError):
+        encoder.transform(df)
+
+def test_unseen_categories():
+    df_train = pd.DataFrame({"tags": ["A,B", "C"]})
+    df_test = pd.DataFrame({"tags": ["A,D", "B,C,E"]})
+    
+    encoder = StringListBinarizer(variables=["tags"], separator=",")
+    encoder.fit(df_train)
+    X = encoder.transform(df_test)
+    
+    # Expect D and E to be ignored (columns for A, B, C only)
+    expected_df = pd.DataFrame({
+        "tags_A": [1, 0],
+        "tags_B": [0, 1],
+        "tags_C": [0, 1]
+    })
+    
+    pd.testing.assert_frame_equal(X, expected_df)

From 5fc5bf5df9dd13fe79a341aa61935bcbc80ca4e3 Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Wed, 11 Mar 2026 16:32:48 -0500
Subject: [PATCH 2/7] fix: address flake8 for StringListBinarizer

---
 .../encoding/string_list_binarizer.py         | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/feature_engine/encoding/string_list_binarizer.py b/feature_engine/encoding/string_list_binarizer.py
index 560bcd3b4..6479223e2 100644
--- a/feature_engine/encoding/string_list_binarizer.py
+++ b/feature_engine/encoding/string_list_binarizer.py
@@ -29,7 +29,7 @@ class StringListBinarizer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMix
     StringListBinarizer() takes categorical variables that contain a list of strings
     or a delimited string, and creates binary variables representing each of the
     unique categories across all observations.
-    
+
     This is especially useful for columns containing multiple tags per row, such as
     `["action", "comedy"]` or `"action, comedy"`.
 
@@ -137,7 +137,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             None.
         """
         X = check_X(X)
-        
+
         # select variables to encode
         if self.ignore_format is True:
             if self.variables is None:
@@ -160,20 +160,23 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                     )
             else:
                 self.variables_ = _check_variables_input_value(self.variables)
-                
+
                 # Check that specified variables exist and are object/categorical
                 non_cat = [
-                    var for var in self.variables_ 
-                    if var in X.columns and not (
-                        pd.api.types.is_object_dtype(X[var]) or 
-                        isinstance(X[var].dtype, pd.CategoricalDtype)
+                    var
+                    for var in self.variables_
+                    if var in X.columns
+                    and not (
+                        pd.api.types.is_object_dtype(X[var])
+                        or isinstance(X[var].dtype, pd.CategoricalDtype)
                     )
                 ]
                 if non_cat:
                     raise TypeError(
-                        f"Some of the variables are not categorical. Please cast them "
-                        f"as object or categorical before calling fit, or set "
-                        f"`ignore_format=True`. Variables: {non_cat}"
+                        "Some of the variables are not categorical. Please cast them "
+                        "as object or categorical before calling fit, or set "
+                        "`ignore_format=True`. Variables: "
+                        f"{non_cat}"
                     )
 
         _check_optional_contains_na(X, self.variables_)
@@ -190,10 +193,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                 else:
                     tags = [str(row).strip()]
                 unique_tags.update(tags)
-            
+
             # Remove empty strings from tags (often caused by trailing separators)
             unique_tags.discard("")
-            
+
             self.encoder_dict_[var] = sorted(list(unique_tags))
 
         self.feature_names_in_ = X.columns.tolist()
@@ -227,10 +230,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         for feature in self.variables_:
             categories = self.encoder_dict_[feature]
-            
+
             # Use faster numpy processing for dummies
-            dummy_data = {f"{feature}_{category}": np.zeros(len(X), dtype=int) for category in categories}
-            
+            dummy_data = {
+                f"{feature}_{category}": np.zeros(len(X), dtype=int)
+                for category in categories
+            }
+
             for i, row in enumerate(X[feature]):
                 if isinstance(row, str):
                     tags = [t.strip() for t in row.split(self.separator)]
@@ -238,7 +244,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
                     tags = [str(t).strip() for t in row]
                 else:
                     tags = [str(row).strip()]
-                    
+
                 for t in tags:
                     if t in categories:
                         dummy_data[f"{feature}_{t}"][i] = 1
@@ -254,10 +260,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
     def get_feature_names_out(self, input_features=None) -> List[str]:
         """Get output feature names for transformation."""
         check_is_fitted(self)
-        
+
         feature_names = list(self.feature_names_in_)
         feature_names = [f for f in feature_names if f not in self.variables_]
-        
+
         for feature in self.variables_:
             for category in self.encoder_dict_[feature]:
                 feature_names.append(f"{feature}_{category}")

From 8e67332b1a4975d64a61aad6f455ce538c9e9885 Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Wed, 11 Mar 2026 16:38:54 -0500
Subject: [PATCH 3/7] fix: support pandas string dtype in StringListBinarizer

---
 .../encoding/string_list_binarizer.py           | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/feature_engine/encoding/string_list_binarizer.py b/feature_engine/encoding/string_list_binarizer.py
index 6479223e2..79a3b7527 100644
--- a/feature_engine/encoding/string_list_binarizer.py
+++ b/feature_engine/encoding/string_list_binarizer.py
@@ -51,8 +51,9 @@ class StringListBinarizer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMix
         select all categorical variables.
 
     separator : str, default=","
-        The separator used to split the strings in the variable. If the variable contains
-        Python lists instead of strings, this parameter is ignored.
+        The separator used to split the strings in the variable.
+        If the variable contains Python lists instead of strings,
+        this parameter is ignored.
 
     ignore_format : bool, default=False
         Whether to format check the variables in `fit`. If `True`, the encoder will
@@ -146,13 +147,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                 self.variables_ = check_all_variables(X, self.variables)
         else:
             if self.variables is None:
-                # Need to use built-in categorical finder logic
-                self.variables_ = [
-                    var
-                    for var in X.columns
-                    if pd.api.types.is_object_dtype(X[var])
-                    or isinstance(X[var].dtype, pd.CategoricalDtype)
-                ]
+                # Select typical categorical/string-like variables
+                self.variables_ = X.select_dtypes(
+                    include=["object", "category", "string"]
+                ).columns.to_list()
                 if len(self.variables_) == 0:
                     raise ValueError(
                         "No categorical variables found in the dataframe. Please check "
@@ -169,6 +167,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                     and not (
                         pd.api.types.is_object_dtype(X[var])
                         or isinstance(X[var].dtype, pd.CategoricalDtype)
+                        or pd.api.types.is_string_dtype(X[var])
                     )
                 ]
                 if non_cat:

From 3edebe0bba00c45789a0dbd42dcbb19152f68b8b Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Wed, 11 Mar 2026 16:44:53 -0500
Subject: [PATCH 4/7] chore: fix flake8 for StringListBinarizer tests

---
 .../test_string_list_binarizer.py             | 120 ++++++++++--------
 1 file changed, 68 insertions(+), 52 deletions(-)

diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py
index 03567c892..e71e94897 100644
--- a/tests/test_encoding/test_string_list_binarizer.py
+++ b/tests/test_encoding/test_string_list_binarizer.py
@@ -4,104 +4,120 @@
 
 from feature_engine.encoding import StringListBinarizer
 
+
 def test_string_list_binarizer_delimited_strings():
-    df = pd.DataFrame({
-        "tags": ["action, comedy", "comedy", "action, thriller"],
-        "other": [1, 2, 3]
-    })
-    
-    expected_df = pd.DataFrame({
-        "other": [1, 2, 3],
-        "tags_action": [1, 0, 1],
-        "tags_comedy": [1, 1, 0],
-        "tags_thriller": [0, 0, 1]
-    })
-    
+    df = pd.DataFrame(
+        {
+            "tags": ["action, comedy", "comedy", "action, thriller"],
+            "other": [1, 2, 3],
+        }
+    )
+
+    expected_df = pd.DataFrame(
+        {
+            "other": [1, 2, 3],
+            "tags_action": [1, 0, 1],
+            "tags_comedy": [1, 1, 0],
+            "tags_thriller": [0, 0, 1],
+        }
+    )
+
     encoder = StringListBinarizer(variables=["tags"], separator=",")
     X = encoder.fit_transform(df)
-    
+
     assert encoder.variables_ == ["tags"]
     assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]}
     pd.testing.assert_frame_equal(X, expected_df)
 
+
+
 def test_string_list_binarizer_python_lists():
-    df = pd.DataFrame({
-        "tags": [["action", "comedy"], ["comedy"], ["action", "thriller"]],
-        "other": [1, 2, 3]
-    })
-    
-    expected_df = pd.DataFrame({
-        "other": [1, 2, 3],
-        "tags_action": [1, 0, 1],
-        "tags_comedy": [1, 1, 0],
-        "tags_thriller": [0, 0, 1]
-    })
-    
+    df = pd.DataFrame(
+        {
+            "tags": [["action", "comedy"], ["comedy"], ["action", "thriller"]],
+            "other": [1, 2, 3],
+        }
+    )
+
+    expected_df = pd.DataFrame(
+        {
+            "other": [1, 2, 3],
+            "tags_action": [1, 0, 1],
+            "tags_comedy": [1, 1, 0],
+            "tags_thriller": [0, 0, 1],
+        }
+    )
+
     encoder = StringListBinarizer(variables=["tags"])
     X = encoder.fit_transform(df)
-    
+
     assert encoder.variables_ == ["tags"]
     assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]}
     pd.testing.assert_frame_equal(X, expected_df)
 
+
+
 def test_find_categorical_variables():
-    df = pd.DataFrame({
-        "tags": ["A,B", "C"],
-        "num": [1, 2]
-    })
-    
+    df = pd.DataFrame({"tags": ["A,B", "C"], "num": [1, 2]})
+
     encoder = StringListBinarizer(variables=None, separator=",")
     encoder.fit(df)
-    
+
     assert encoder.variables_ == ["tags"]
 
+
+
 def test_ignore_format():
-    df = pd.DataFrame({
-        "tags": ["A,B", "C"],
-        "num": ["1", "2"] # Treated as object but maybe we want to encode it
-    })
-    
+    df = pd.DataFrame(
+        {
+            "tags": ["A,B", "C"],
+            "num": ["1", "2"],  # Treated as object but maybe we want to encode it
+        }
+    )
+
     encoder = StringListBinarizer(variables=["num"], ignore_format=True)
     encoder.fit(df)
-    
+
     assert encoder.variables_ == ["num"]
     assert encoder.encoder_dict_ == {"num": ["1", "2"]}
 
+
+
 def test_error_if_not_categorical():
-    df = pd.DataFrame({
-        "num": [1, 2]
-    })
+    df = pd.DataFrame({"num": [1, 2]})
     encoder = StringListBinarizer(variables=["num"])
     with pytest.raises(TypeError):
         encoder.fit(df)
 
+
+
 def test_missing_values_error():
-    df = pd.DataFrame({
-        "tags": ["A,B", float('nan')]
-    })
+    df = pd.DataFrame({"tags": ["A,B", float("nan")]})
     encoder = StringListBinarizer(variables=["tags"])
     with pytest.raises(ValueError):
         encoder.fit(df)
 
+
+
 def test_not_fitted_error():
     df = pd.DataFrame({"tags": ["A,B"]})
     encoder = StringListBinarizer()
     with pytest.raises(NotFittedError):
         encoder.transform(df)
 
+
+
 def test_unseen_categories():
     df_train = pd.DataFrame({"tags": ["A,B", "C"]})
     df_test = pd.DataFrame({"tags": ["A,D", "B,C,E"]})
-    
+
     encoder = StringListBinarizer(variables=["tags"], separator=",")
     encoder.fit(df_train)
     X = encoder.transform(df_test)
-    
+
     # Expect D and E to be ignored (columns for A, B, C only)
-    expected_df = pd.DataFrame({
-        "tags_A": [1, 0],
-        "tags_B": [0, 1],
-        "tags_C": [0, 1]
-    })
-    
+    expected_df = pd.DataFrame(
+        {"tags_A": [1, 0], "tags_B": [0, 1], "tags_C": [0, 1]}
+    )
+
     pd.testing.assert_frame_equal(X, expected_df)

From 9e308487e379d169b8d6fefc8be25b9351c0208e Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Wed, 11 Mar 2026 16:46:12 -0500
Subject: [PATCH 5/7] chore: normalize blank lines in StringListBinarizer tests

---
 tests/test_encoding/test_string_list_binarizer.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py
index e71e94897..0e1ae0314 100644
--- a/tests/test_encoding/test_string_list_binarizer.py
+++ b/tests/test_encoding/test_string_list_binarizer.py
@@ -4,7 +4,6 @@
 
 from feature_engine.encoding import StringListBinarizer
 
-
 def test_string_list_binarizer_delimited_strings():
     df = pd.DataFrame(
         {
@@ -30,7 +29,6 @@ def test_string_list_binarizer_delimited_strings():
     pd.testing.assert_frame_equal(X, expected_df)
 
 
-
 def test_string_list_binarizer_python_lists():
     df = pd.DataFrame(
         {
@@ -56,7 +54,6 @@ def test_string_list_binarizer_python_lists():
     pd.testing.assert_frame_equal(X, expected_df)
 
 
-
 def test_find_categorical_variables():
     df = pd.DataFrame({"tags": ["A,B", "C"], "num": [1, 2]})
 
@@ -66,7 +63,6 @@ def test_find_categorical_variables():
     assert encoder.variables_ == ["tags"]
 
 
-
 def test_ignore_format():
     df = pd.DataFrame(
         {
@@ -82,7 +78,6 @@ def test_ignore_format():
     assert encoder.encoder_dict_ == {"num": ["1", "2"]}
 
 
-
 def test_error_if_not_categorical():
     df = pd.DataFrame({"num": [1, 2]})
     encoder = StringListBinarizer(variables=["num"])
@@ -90,7 +85,6 @@ def test_error_if_not_categorical():
         encoder.fit(df)
 
 
-
 def test_missing_values_error():
     df = pd.DataFrame({"tags": ["A,B", float("nan")]})
     encoder = StringListBinarizer(variables=["tags"])
@@ -98,7 +92,6 @@ def test_missing_values_error():
         encoder.fit(df)
 
 
-
 def test_not_fitted_error():
     df = pd.DataFrame({"tags": ["A,B"]})
     encoder = StringListBinarizer()
@@ -106,7 +99,6 @@ def test_not_fitted_error():
         encoder.transform(df)
 
 
-
 def test_unseen_categories():
     df_train = pd.DataFrame({"tags": ["A,B", "C"]})
     df_test = pd.DataFrame({"tags": ["A,D", "B,C,E"]})

From 2842cca1adba128b0f2106e20914910715d8587f Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Wed, 11 Mar 2026 16:47:30 -0500
Subject: [PATCH 6/7] chore: add missing blank line before first
 StringListBinarizer test

---
 tests/test_encoding/test_string_list_binarizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py
index 0e1ae0314..a2692de0b 100644
--- a/tests/test_encoding/test_string_list_binarizer.py
+++ b/tests/test_encoding/test_string_list_binarizer.py
@@ -4,6 +4,7 @@
 
 from feature_engine.encoding import StringListBinarizer
 
+
 def test_string_list_binarizer_delimited_strings():
     df = pd.DataFrame(
         {

From eae5f5e6af8c058323cf11c4f9225ed6b580cce7 Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Wed, 11 Mar 2026 16:59:44 -0500
Subject: [PATCH 7/7] test: add coverage for StringListBinarizer (init
 validation, ignore_format paths, non-str/list rows, get_feature_names_out,
 _more_tags)

---
 .../test_string_list_binarizer.py             | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/tests/test_encoding/test_string_list_binarizer.py b/tests/test_encoding/test_string_list_binarizer.py
index a2692de0b..14a0114c9 100644
--- a/tests/test_encoding/test_string_list_binarizer.py
+++ b/tests/test_encoding/test_string_list_binarizer.py
@@ -114,3 +114,73 @@ def test_unseen_categories():
     )
 
     pd.testing.assert_frame_equal(X, expected_df)
+
+
+def test_init_separator_not_str():  # the constructor itself must raise
+    with pytest.raises(ValueError, match="separator takes only strings"):
+        StringListBinarizer(variables=["tags"], separator=123)  # int, not str
+
+
+def test_init_ignore_format_not_bool():  # the constructor itself must raise
+    with pytest.raises(ValueError, match="ignore_format takes only booleans"):
+        StringListBinarizer(variables=["tags"], ignore_format="yes")  # str, not bool
+
+
+def test_ignore_format_true_variables_none():
+    """With ignore_format=True and variables=None, fit selects every column."""
+    data = {"tags": ["a,b", "c"], "num": [1, 2], "other": ["x", "y"]}
+    frame = pd.DataFrame(data)
+    binarizer = StringListBinarizer(separator=",", ignore_format=True)
+    binarizer.fit(frame)
+    # Compared as sets: membership of variables_ is checked, not its order.
+    assert set(binarizer.variables_) == {"tags", "num", "other"}
+    transformed = binarizer.transform(frame)
+    assert list(transformed.columns) == binarizer.get_feature_names_out()
+
+
+def test_no_categorical_variables_raises():
+    """fit raises ValueError when variables=None and every column is numeric."""
+    binarizer = StringListBinarizer(variables=None)
+    numeric_only = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    with pytest.raises(ValueError, match="No categorical variables found"):
+        binarizer.fit(numeric_only)
+
+
+def test_fit_row_not_str_or_list():
+    """A scalar cell (here the int 42) is learned as its string form during fit."""
+    mixed = pd.DataFrame({"tags": ["A,B", 42]})
+    binarizer = StringListBinarizer(variables=["tags"], separator=",")
+    binarizer.fit(mixed)
+    learned = binarizer.encoder_dict_["tags"]
+    for label in ("A", "B", "42"):
+        assert label in learned
+
+
+def test_transform_row_not_str_or_list():
+    """transform accepts a scalar (int) cell and still emits every learned column."""
+    binarizer = StringListBinarizer(variables=["tags"])
+    binarizer.fit(pd.DataFrame({"tags": ["A", "B"]}))
+    unseen_scalar = pd.DataFrame({"tags": [123]})
+    transformed = binarizer.transform(unseen_scalar)
+    # Only column presence is checked; cell values are not asserted here.
+    for column in ("tags_A", "tags_B"):
+        assert column in transformed.columns
+
+
+def test_get_feature_names_out():
+    """Untouched columns come first, followed by one name per learned tag."""
+    frame = pd.DataFrame({"x": [1, 2], "tags": ["a,b", "c"], "y": [3, 4]})
+    binarizer = StringListBinarizer(variables=["tags"], separator=",")
+    binarizer.fit(frame)
+    # "tags" sits between x and y in the input but its dummies are expected last.
+    passthrough = ["x", "y"]
+    dummies = ["tags_a", "tags_b", "tags_c"]
+    assert binarizer.get_feature_names_out() == passthrough + dummies
+
+
+def test_more_tags():
+    """_more_tags declares categorical variables and the NaN/inf check as xfail."""
+    reported = StringListBinarizer(variables=["tags"])._more_tags()
+    assert reported["variables"] == "categorical"
+    # Membership only; any value stored alongside the check name is not inspected.
+    assert "check_estimators_nan_inf" in reported["_xfail_checks"]