Answer a question

I am working on an example of using ColumnTransformer and LabelEncoder for pre-processing the well-known Titanic data set X:

    Age Embarked    Fare    Sex
0   22.0    S      7.2500   male
1   38.0    C      71.2833  female
2   26.0    S      7.9250   female
3   35.0    S      53.1000  female
4   35.0    S      8.0500   male

Calling the transformer like this:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
ColumnTransformer(
    transformers=[
        ("label-encode categorical", LabelEncoder(), ["Sex", "Embarked"])
    ]
).fit(X).transform(X)

results in:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-54-fd5a05b7e47e> in <module>
      4         ("label-encode categorical", LabelEncoder(), ["Sex", "Embarked"])
      5     ]
----> 6 ).fit(X).transform(X)

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit(self, X, y)
    418         # we use fit_transform to make sure to set sparse_output_ (for which we
    419         # need the transformed data) to have consistent output type in predict
--> 420         self.fit_transform(X, y=y)
    421         return self
    422 

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    447         self._validate_remainder(X)
    448 
--> 449         result = self._fit_transform(X, y, _fit_transform_one)
    450 
    451         if not result:

~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
    391                               _get_column(X, column), y, weight)
    392                 for _, trans, column, weight in self._iter(
--> 393                     fitted=fitted, replace_strings=True))
    394         except ValueError as e:
    395             if "Expected 2D array, got 1D array instead" in str(e):

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919 

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
    612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
    613     if hasattr(transformer, 'fit_transform'):
--> 614         res = transformer.fit_transform(X, y, **fit_params)
    615     else:
    616         res = transformer.fit(X, y, **fit_params).transform(X)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

What is the problem with **fit_params here? To me this looks like a bug in sklearn or at least an incompatibility.

Answers

There are two major reasons why this will not work for your purpose.

  1. LabelEncoder() is desinged to be used for the target variable (y). That is the reason for getting the positional argument error, when columnTransformer() tries to feed X, y=None, fit_params={}.

From Documentation:

Encode labels with value between 0 and n_classes-1.

fit(y)
Fit label encoder

Parameters:
y : array-like of shape (n_samples,)
Target values.

  1. Even if you do a workaround to remove the empty dictionary, then also LabelEncoder() cannot take 2D array (basically multiple features at a time) because it takes only 1D y values.

Short answer - we should not be using LabelEncoder() for input features.

Now, what is the solution to encode the input features?

Use OrdinalEncoder() if your features are ordinal features or OneHotEncoder() in case of nominal features.

Example:

>>> from sklearn.compose import ColumnTransformer
>>> from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
>>> X = np.array([[1000., 100., 'apple', 'green'],
...               [1100., 100., 'orange', 'blue']])
>>> ct = ColumnTransformer(
...     [("ordinal", OrdinalEncoder(), [0, 1]),
         ("nominal", OneHotEncoder(), [2, 3])])
>>> ct.fit_transform(X)   
array([[0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 1., 0.]]) 
Logo

Python社区为您提供最前沿的新闻资讯和知识内容

更多推荐