Customize your label (target) and feature transformers inside a scikit-learn Pipeline

Label (target) transformers in scikit-learn

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Data source: Axie (NFT marketplace) listings used for a price-regression demo.
url = ('https://raw.githubusercontent.com/CarlosVecina/nft_market_arbitrage_regression/master/df_axie_full.csv')
data = pd.read_csv(url, sep=',')

## Check NAs: count nulls per column and keep only the columns that have any.
null_count_ser = pd.isnull(data).sum()
is_null_ser = null_count_ser > 0
null_count_ser[is_null_ser]
## timestamp          1
## axie_type        214
## genes           1176
## hp              1176
## speed           1176
## skill           1176
## morale          1176
## eyes            1176
## ears            1176
## back            1176
## back_attack     1176
## back_def        1176
## mouth           1176
## mouth_attack    1176
## mouth_def       1176
## head            1176
## head_attack     1176
## head_def        1176
## tail            1176
## tail_attack     1176
## tail_def        1176
## dtype: int64
# Simplest strategy for the demo: drop every row with at least one missing value.
data.dropna(inplace=True)
## Train/Test split: hold out 20% of the rows for evaluation.
TEST_PCT = 0.2
X = data.loc[:,['axie_breed','hp','speed','skill','morale']]
y = data.price_eth_parsed.tolist()
# NOTE(review): no random_state is passed, so the split (and the RMSE printed
# later) is not reproducible between runs — confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_PCT)
class LogTransformerX(BaseEstimator, TransformerMixin):
    """Feature transformer that rescales one column to 2 * sqrt(column).

    Despite the class name, the applied transform is a square-root
    (variance-stabilizing) transform, not a logarithm. The instance is
    stateless: ``fit`` learns nothing and only logs that it was called.
    """

    # `additional_param` is an extra, unused hyper-parameter kept purely to
    # show how __init__ arguments flow through a scikit-learn Pipeline.
    # Argument names must match the attribute names for get_params() to work.
    def __init__(self, feature_name, additional_param = ""):
        print('\n>>>>>>>init() called.\n')
        self.feature_name = feature_name
        self.additional_param = additional_param

    def fit(self, X, y = None):
        # Nothing to learn — just trace the call for the tutorial output.
        print('\n>>>>>>>fit() called.\n')
        print(f'\nadditional param ~~~~~ {self.additional_param}\n')
        return self

    def transform(self, X, y = None):
        print('\n>>>>>>>transform() called.\n')
        col = self.feature_name
        transformed = X.copy()  # never mutate the caller's DataFrame
        transformed[col] = np.sqrt(transformed[col]) * 2
        return transformed
class LogTransformerY(BaseEstimator, TransformerMixin):
    """Target transformer: maps y -> log(y) and predictions back via exp(y).

    Stateless — no __init__ is needed. Used as the ``transformer`` of a
    TransformedTargetRegressor, which requires fit/transform/inverse_transform.
    """

    def fit(self, target, y=None):
        # Accept the conventional optional `y` argument (sklearn API style,
        # consistent with LogTransformerX.fit); nothing to learn here.
        return self

    def transform(self, target):
        print('\n%%%%%%%%%%%%%%%custom_target_transform() called.\n')
        # np.log allocates a new array, so no defensive copy is needed.
        return np.log(target)

    # Required so TransformedTargetRegressor can map the regressor's
    # log-space predictions back to the original target scale.
    def inverse_transform(self, target):
        print('\n%%%%%%%%%%%%%%%custom_inverse_target_transform() called.\n')
        # Fix: the original copied `target` into `target_` and then
        # immediately overwrote the copy with np.exp(target) — the copy was
        # dead code. np.exp returns a new array, leaving `target` intact.
        return np.exp(target)

## Create the model
print("create pipeline 3.1")
# no change in input pipeline: the feature transformer runs ahead of the model
## create pipeline 3.1
pipe = Pipeline(steps=[
                       ('experimental_trans', LogTransformerX('skill')),
                       ('linear_model', LinearRegression())
])

# create a TargetTransformer
# By default, the provided functions are checked at each fit to be the inverse of each other. However, it is
# possible to bypass this checking by setting check_inverse to False.
## 
## >>>>>>>init() called.
model = TransformedTargetRegressor(regressor=pipe,
                                   transformer=LogTransformerY(),
                                   check_inverse=False) # avoid repeated calls

## Train the model

print("fit pipeline 3.1 [fit Model]")
## fit pipeline 3.1 [fit Model]
# NOTE(review): only rows 1..5 of the training set are fitted here —
# presumably to keep the traced console output short; confirm before reuse.
model.fit(X_train.iloc[1:6,], y_train[1:6])
## 
## %%%%%%%%%%%%%%%custom_target_transform() called.
## 
## 
## >>>>>>>init() called.
## 
## 
## >>>>>>>fit() called.
## 
## 
## additional param ~~~~~ 
## 
## 
## >>>>>>>transform() called.
## 
## TransformedTargetRegressor(check_inverse=False,
##                            regressor=Pipeline(steps=[('experimental_trans',
##                                                       LogTransformerX(feature_name='skill')),
##                                                      ('linear_model',
##                                                       LinearRegression())]),
##                            transformer=LogTransformerY())
print("predict via pipeline 3.1 [Model]")
## predict via pipeline 3.1 [Model]
preds3_1 = model.predict(X_test)
## 
## >>>>>>>transform() called.
## 
## 
## %%%%%%%%%%%%%%%custom_inverse_target_transform() called.
print(f"\n{preds3_1}")  # predictions mapped back from log-space to the ETH price scale
## 
## [0.11535258 0.13718012 0.08965178 ... 0.09259419 0.07218038 0.086     ]
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds3_1))}\n")
## RMSE: 0.04759534278499385

link

Carlos Vecina
Carlos Vecina
Senior Data Scientist at Jobandtalent

Senior Data Scientist at Jobandtalent | AI & Data Science for Business