Customize your label and independent-variable transformers inside a Scikit-Learn Pipeline
Label transformer in sklearn
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
url = 'https://raw.githubusercontent.com/CarlosVecina/nft_market_arbitrage_regression/master/df_axie_full.csv'
data = pd.read_csv(url, sep=',')
## Check NAs
null_count_ser = pd.isnull(data).sum()
is_null_ser = null_count_ser > 0
null_count_ser[is_null_ser]
## timestamp 1
## axie_type 214
## genes 1176
## hp 1176
## speed 1176
## skill 1176
## morale 1176
## eyes 1176
## ears 1176
## back 1176
## back_attack 1176
## back_def 1176
## mouth 1176
## mouth_attack 1176
## mouth_def 1176
## head 1176
## head_attack 1176
## head_def 1176
## tail 1176
## tail_attack 1176
## tail_def 1176
## dtype: int64
data.dropna(inplace=True)
## Train/Test
TEST_PCT = 0.2
X = data.loc[:, ['axie_breed', 'hp', 'speed', 'skill', 'morale']]
y = data.price_eth_parsed.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_PCT)
class LogTransformerX(BaseEstimator, TransformerMixin):
    # add an extra parameter, just for fun, while we are at it
    def __init__(self, feature_name, additional_param=""):
        print('\n>>>>>>>init() called.\n')
        self.feature_name = feature_name
        self.additional_param = additional_param

    def fit(self, X, y=None):
        print('\n>>>>>>>fit() called.\n')
        print(f'\nadditional param ~~~~~ {self.additional_param}\n')
        return self

    def transform(self, X, y=None):
        print('\n>>>>>>>transform() called.\n')
        X_ = X.copy()  # work on a copy to avoid mutating the original dataset
        # despite the class name, this applies a 2*sqrt (variance-stabilizing) transform
        X_[self.feature_name] = 2 * np.sqrt(X_[self.feature_name])
        return X_
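Before wiring it into a pipeline, it is worth checking the transformer on its own. A minimal standalone sketch (my own addition; the demo names are hypothetical):

demo_trans = LogTransformerX('skill', additional_param='standalone demo')
X_demo = demo_trans.fit_transform(X_train.head())  # fit_transform() comes for free from TransformerMixin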
class LogTransformerY(BaseEstimator, TransformerMixin):
    # no need to implement __init__ in this particular case

    def fit(self, target):
        return self

    def transform(self, target):
        print('\n%%%%%%%%%%%%%%%custom_target_transform() called.\n')
        target_ = np.log(target)
        return target_

    # need to implement this too: TransformedTargetRegressor uses it
    # to map predictions back to the original target scale
    def inverse_transform(self, target):
        print('\n%%%%%%%%%%%%%%%custom_inverse_target_transform() called.\n')
        target_ = np.exp(target)
        return target_
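A quick round-trip check (again my own sketch) makes the inverse relationship explicit; scikit-learn's check_inverse performs a similar test on a subsample at fit time:

ty = LogTransformerY()
y_sample = np.array(y_train[:10])
assert np.allclose(ty.inverse_transform(ty.transform(y_sample)), y_sample)  # exp(log(y)) == y for positive prices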
## Create the model
print("create pipeline 3.1")
# no change in the input pipeline
pipe = Pipeline(steps=[
    ('experimental_trans', LogTransformerX('skill')),
    ('linear_model', LinearRegression())
])
## create pipeline 3.1
##
## >>>>>>>init() called.

# create a TransformedTargetRegressor.
# By default, the provided transformer is checked at each fit to confirm that
# transform and inverse_transform are inverses of each other. It is possible
# to bypass this check by setting check_inverse to False.
model = TransformedTargetRegressor(regressor=pipe,
                                   transformer=LogTransformerY(),
                                   check_inverse=False)  # avoid repeated inverse checks
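Since this particular target transform is just an elementwise function pair, the same model can be expressed without a custom class: TransformedTargetRegressor also accepts plain functions via func/inverse_func. A lighter-weight equivalent (model_func is my name for it):

model_func = TransformedTargetRegressor(regressor=pipe,
                                        func=np.log,
                                        inverse_func=np.exp,
                                        check_inverse=False)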
## Train the model
print("fit pipeline 3.1 [fit Model]")
model.fit(X_train.iloc[1:6, :], y_train[1:6])
## fit pipeline 3.1 [fit Model]
##
## %%%%%%%%%%%%%%%custom_target_transform() called.
##
##
## >>>>>>>init() called.
##
##
## >>>>>>>fit() called.
##
##
## additional param ~~~~~
##
##
## >>>>>>>transform() called.
##
## TransformedTargetRegressor(check_inverse=False,
## regressor=Pipeline(steps=[('experimental_trans',
## LogTransformerX(feature_name='skill')),
## ('linear_model',
## LinearRegression())]),
## transformer=LogTransformerY())
print("predict via pipeline 3.1 [Model]")
## predict via pipeline 3.1 [Model]
= model.predict(X_test) preds3_1
##
## >>>>>>>transform() called.
##
##
## %%%%%%%%%%%%%%%custom_inverse_target_transform() called.
print(f"\n{preds3_1}") # should be [196. 289.]
##
## [0.11535258 0.13718012 0.08965178 ... 0.09259419 0.07218038 0.086 ]
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds3_1))}\n")
## RMSE: 0.04759534278499385
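As a follow-up (not in the original post), fitting the same feature pipeline without the target transformer on the same rows gives a baseline RMSE to compare against:

baseline = Pipeline(steps=[
    ('experimental_trans', LogTransformerX('skill')),
    ('linear_model', LinearRegression())
])
baseline.fit(X_train.iloc[1:6, :], y_train[1:6])
preds_baseline = baseline.predict(X_test)
print(f"Baseline RMSE: {np.sqrt(mean_squared_error(y_test, preds_baseline))}")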