Customizing your label (target) and independent-variable transformers inside a scikit-learn Pipeline
Custom label transformer in scikit-learn
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# --- Load the dataset ---
url = ('https://raw.githubusercontent.com/CarlosVecina/nft_market_arbitrage_regression/master/df_axie_full.csv')
data = pd.read_csv(url, sep=',')

# --- Check NAs ---
# Observed in this snapshot of the data: timestamp has 1 NA, axie_type 214,
# and the gene/stat/part columns (genes, hp, speed, ..., tail_def) share
# 1176 NAs each — i.e. the same rows are incomplete.
null_count_ser = pd.isnull(data).sum()
is_null_ser = null_count_ser > 0
print(null_count_ser[is_null_ser])

# Drop every row containing at least one missing value.
data.dropna(inplace=True)

# --- Train/Test split ---
TEST_PCT = 0.2  # hold out 20% of the rows for evaluation

X = data.loc[:, ['axie_breed', 'hp', 'speed', 'skill', 'morale']]
y = data.price_eth_parsed.tolist()

# NOTE(review): no random_state is set, so the split is not reproducible
# across runs — consider passing random_state=... for deterministic results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_PCT)
class LogTransformerX(BaseEstimator, TransformerMixin):
    """Feature transformer that replaces one column with ``2 * sqrt(column)``.

    NOTE(review): despite the "Log" in the class name, the transform applied
    is a square-root (variance-stabilizing) transform, not a logarithm.

    Parameters
    ----------
    feature_name : str
        Name of the DataFrame column to transform.
    additional_param : str, default ""
        Demo-only extra hyperparameter; it is only printed during fit().
    """

    # add another additional parameter, just for fun, while we are at it
    def __init__(self, feature_name, additional_param=""):
        print('\n>>>>>>>init() called.\n')
        self.feature_name = feature_name
        self.additional_param = additional_param

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        print('\n>>>>>>>fit() called.\n')
        print(f'\nadditional param ~~~~~ {self.additional_param}\n')
        return self

    def transform(self, X, y=None):
        print('\n>>>>>>>transform() called.\n')
        # Work on a copy so the caller's DataFrame is left untouched.
        X_ = X.copy()
        X_[self.feature_name] = 2 * np.sqrt(X_[self.feature_name])
        return X_
class LogTransformerY(BaseEstimator, TransformerMixin):
    """Target transformer: log on the way in, exp on the way back.

    Intended for TransformedTargetRegressor: the regressor is fit on
    ``log(y)`` and predictions are mapped back with ``exp``.
    """

    # no need to implement __init__ in this particular case
    def fit(self, target, y=None):
        # Stateless: nothing to learn. `y=None` added for scikit-learn API
        # compatibility; existing single-argument calls still work.
        return self

    def transform(self, target):
        print('\n%%%%%%%%%%%%%%%custom_target_transform() called.\n')
        # np.log returns a new array, so no defensive copy is needed.
        return np.log(target)

    # need to implement this too
    def inverse_transform(self, target):
        print('\n%%%%%%%%%%%%%%%custom_inverse_target_transform() called.\n')
        # Fix: the original copied `target` into `target_` and then ignored
        # the copy (`np.exp(target)`); np.exp does not mutate its input.
        return np.exp(target)
## Create the model
print("create pipeline 3.1")

# Input pipeline: custom feature transform followed by a linear model.
pipe = Pipeline(steps=[
    ('experimental_trans', LogTransformerX('skill')),
    ('linear_model', LinearRegression())
])

# Wrap the pipeline in a TransformedTargetRegressor so the target is
# log-transformed before fitting and exp-transformed after predicting.
# By default, the provided functions are checked at each fit to be the
# inverse of each other; check_inverse=False bypasses this checking.
model = TransformedTargetRegressor(regressor=pipe,
                                   transformer=LogTransformerY(),
                                   check_inverse=False)  # avoid repeated calls

## Train the model
print("fit pipeline 3.1 [fit Model]")
# NOTE(review): only rows 1..5 of the training set are used — presumably to
# keep the demo's console output short; fit on the full X_train/y_train for
# real use.
model.fit(X_train.iloc[1:6, ], y_train[1:6])

## Predict and evaluate
print("predict via pipeline 3.1 [Model]")
preds3_1 = model.predict(X_test)
print(f"\n{preds3_1}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds3_1))}\n")