Instead of `get_dummies` from pandas, use `OneHotEncoder` from `sklearn.preprocessing`, together with `ColumnTransformer` from `sklearn.compose`. Make a DataFrame with both the ‘text’ and ‘category|dummies’ columns as features.
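In older scikit-learn releases (before 0.20), `OneHotEncoder` expected integer-typed features. If you are on such a version and your features are not of int type, first encode (map) them to ints, then apply `OneHotEncoder`. Newer releases accept string categories directly.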
# ...
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Free-text branch: TF-IDF over the 'text' column.
# The column is selected with a scalar name, NOT a one-element list:
# ColumnTransformer hands a list selector to the transformer as a 2-D frame,
# but TfidfVectorizer expects a 1-D iterable of documents. With ['text'] it
# would iterate over the column labels instead of the rows.
text_features = 'text'
text_transformer = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words="english")),
])

# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' makes categories unseen during fit
# encode as all zeros at transform time instead of raising.
categorical_features = ['category']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column(s) and concatenate the outputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

# End-to-end estimator: preprocessing followed by ridge regression with
# built-in cross-validated alpha selection.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("ridge", RidgeCV()),
])
# ...
CLICK HERE to find out more related problems solutions.