Comparison of XGBRegressor with different transformations

from ai4water.datasets import busan_beach
from ai4water.utils.utils import get_version_info
from ai4water.experiments import TransformationExperiments
from ai4water.hyperopt import Categorical, Integer, Real
from ai4water.utils.utils import dateandtime_now


# Print the version of the interpreter, OS and every library reported by
# ai4water, so the recorded results can be tied to a specific environment.
version_info = get_version_info()
for package, version in version_info.items():
    print(f"{package} version: {version}")
python version: 3.7.9 (default, Oct 19 2020, 15:13:17)
[GCC 7.5.0]
os version: posix
ai4water version: 1.06
lightgbm version: 3.3.5
tcn version: 3.5.0
catboost version: 1.1.1
xgboost version: 1.6.2
easy_mpl version: 0.21.2
SeqMetrics version: 1.3.4
tensorflow version: 2.7.0
keras.api._v2.keras version: 2.7.0
numpy version: 1.21.1
pandas version: 1.3.4
matplotlib version: 3.5.3
h5py version: 3.8.0
joblib version: 1.2.0
# Load the dataset. Every column except the last one is used as a model
# input; the last column is the prediction target.
data = busan_beach()
input_features = data.columns[:-1].tolist()
output_features = data.columns[-1:].tolist()
class MyTransformationExperiments(TransformationExperiments):
    """Experiments that fit an XGBRegressor with a different target
    transformation for every case."""

    def update_paras(self, **kwargs):
        """Build the model configuration for a single case.

        Parameters
        ----------
        **kwargs
            Must contain ``y_transformation``. Every remaining keyword is
            handed over unchanged as the ``XGBRegressor`` configuration.

        Returns
        -------
        dict
            Dictionary with the keys ``model`` and ``y_transformation``.

        Raises
        ------
        KeyError
            If ``y_transformation`` is missing from ``kwargs``.
        """
        target_transformation = kwargs.pop('y_transformation')

        # The string "none" is a placeholder for "do not transform the
        # target"; it is translated into a real ``None`` here.
        return {
            'model': {"XGBRegressor": kwargs},
            'y_transformation': (
                None if target_transformation == "none" else target_transformation
            ),
        }
# One experiment case per target (y) transformation. The key is the case
# name and the value holds the keyword(s) forwarded to ``update_paras``.
# A transformation is given either by its name or, when it needs extra
# options, as a dictionary with a ``method`` entry.
cases = {
    # baseline: "none" is converted to ``None`` inside ``update_paras``
    "model_None": {"y_transformation": "none"},

    # scaling / centering transformations
    "model_minmax": {"y_transformation": "minmax"},
    "model_zscore": {"y_transformation": "zscore"},
    "model_center": {"y_transformation": "center"},
    "model_scale": {"y_transformation": "scale"},
    "model_robust": {"y_transformation": "robust"},
    "model_quantile": {"y_transformation": "quantile"},

    # power / logarithmic transformations
    # NOTE(review): treat_negatives / replace_zeros are presumably required
    # because these methods are undefined for non-positive values - confirm
    # against the ai4water transformation documentation.
    "model_box_cox": {
        "y_transformation": {
            "method": "box-cox",
            "treat_negatives": True,
            "replace_zeros": True,
        },
    },
    "model_yeo-johnson": {"y_transformation": "yeo-johnson"},
    "model_sqrt": {"y_transformation": "sqrt"},
    "model_log": {
        "y_transformation": {
            "method": "log",
            "treat_negatives": True,
            "replace_zeros": True,
        },
    },
    "model_log10": {
        "y_transformation": {
            "method": "log10",
            "treat_negatives": True,
            "replace_zeros": True,
        },
    },

    # remaining scalers
    "model_pareto": {"y_transformation": "pareto"},
    "model_vast": {"y_transformation": "vast"},
    "model_mmad": {"y_transformation": "mmad"},
}
# Forwarded as ``num_samples`` to every numeric dimension of the search space.
num_samples = 10

# Hyperparameter search space for XGBRegressor.
#
# The parameter names must be XGBoost names. CatBoost names (iterations,
# l2_leaf_reg, model_size_reg, rsm, border_count, feature_border_type) are
# not recognised by XGBoost, which reports them as "might not be used" and
# ignores them. Where a direct XGBoost counterpart exists it is used here:
#   iterations  -> n_estimators
#   l2_leaf_reg -> reg_lambda
#   rsm         -> colsample_bynode
# model_size_reg, border_count and feature_border_type have no direct
# counterpart for XGBoost's default tree method and are therefore omitted.
search_space = [
    # number of boosting rounds, i.e. maximum number of trees that can be built
    Integer(low=10, high=30, name='n_estimators', num_samples=num_samples),
    # step size shrinkage used for reducing the gradient step
    Real(low=0.09, high=0.3, prior='log-uniform', name='learning_rate', num_samples=num_samples),
    # coefficient of the L2 regularization term on the weights
    Real(low=0.5, high=5.0, name='reg_lambda', num_samples=num_samples),
    # fraction of features sampled for each split (node)
    Real(low=0.1, high=0.5, name='colsample_bynode', num_samples=num_samples),
]

# Initial values: one per dimension of ``search_space``, in the same order,
# and each inside the bounds of its dimension.
x0 = [10, 0.11, 1.0, 0.2]
# Set up one experiment per entry in ``cases``. The timestamp in the
# experiment name keeps repeated runs of this script apart.
experiment = MyTransformationExperiments(
    # what to run
    cases=cases,
    exp_name=f"xgb_y_exp_{dateandtime_now()}",
    # data description
    input_features=input_features,
    output_features=output_features,
    split_random=True,
    # hyperparameters and their initial values
    param_space=search_space,
    x0=x0,
    # bookkeeping
    verbosity=0,
    save=False,
)

# NOTE(review): with run_type='dry_run' the recorded output shows every case
# being run exactly once; presumably the values in ``x0`` are used and no
# hyperparameter optimization takes place - confirm against the ai4water docs.
experiment.fit(data=data, run_type='dry_run')
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
running  None model
[02:19:51] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  minmax model
[02:19:54] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  zscore model
[02:19:56] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  center model
[02:19:59] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  scale model
[02:20:01] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  robust model
[02:20:04] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  quantile model
[02:20:06] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  box_cox model
[02:20:09] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  yeo-johnson model
[02:20:12] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  sqrt model
[02:20:14] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  log model
[02:20:17] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  log10 model
[02:20:19] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  pareto model
[02:20:22] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  vast model
[02:20:25] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  mmad model
[02:20:27] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
# Compare the cases by root mean squared error; the printed table has one row
# per transformation with a ``train`` and a ``test`` column.
experiment.compare_errors('rmse', data=data)
Train, test
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
train test
zscore 76989.844136 1.829495e+07
vast 67745.092116 1.813990e+07
mmad 63637.688822 1.811634e+07
scale 52865.432007 1.804655e+07
robust 47502.997190 1.803859e+07
center 55266.694605 1.802654e+07
pareto 59914.405664 1.802373e+07
log 270181.799448 1.794930e+07
sqrt 58261.276136 1.792989e+07
minmax 57963.528260 1.784150e+07
log10 270496.782920 1.766912e+07
box_cox 79374.647787 1.765535e+07
yeo-johnson 74527.688244 1.765232e+07
None 62660.309653 1.750955e+07
quantile 391137.069891 9.166060e+06


# Compare the cases by the 'r2' metric.
# NOTE(review): 'r2' is presumably the squared correlation coefficient, which
# would explain why it differs from 'r2_score' further down - confirm against
# the SeqMetrics documentation.
experiment.compare_errors('r2', data=data)
Train, test
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
train test
quantile 0.997146 0.538682
box_cox 0.999939 0.292986
yeo-johnson 0.999916 0.291664
log10 0.999094 0.255802
log 0.998880 0.218938
None 0.999877 0.151187
sqrt 0.999942 0.076603
minmax 0.999893 0.068524
pareto 0.999892 0.054656
center 0.999911 0.054537
robust 0.999934 0.052759
scale 0.999918 0.052052
mmad 0.999875 0.047492
vast 0.999856 0.042825
zscore 0.999805 0.030284


# Compare the cases by the 'r2_score' metric. Unlike 'r2', the recorded test
# values include negative numbers.
# NOTE(review): presumably the coefficient of determination - confirm against
# the SeqMetrics documentation.
experiment.compare_errors('r2_score', data=data)
Train, test
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
train test
quantile 0.993170 0.064507
None 0.999825 0.064368
yeo-johnson 0.999752 0.049048
box_cox 0.999719 0.048722
log10 0.996734 0.047237
minmax 0.999850 0.028556
sqrt 0.999848 0.018906
log 0.996741 0.016781
pareto 0.999840 0.008610
center 0.999864 0.008301
robust 0.999899 0.006975
scale 0.999875 0.006098
mmad 0.999819 -0.001604
vast 0.999795 -0.004210
zscore 0.999735 -0.021450


# Compare the cases by the 'nrmse' metric.
# NOTE(review): presumably a normalized RMSE; the normalization used is
# defined by SeqMetrics - confirm there.
experiment.compare_errors('nrmse', data=data)
Train, test
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
train test
quantile 0.010852 0.254300
zscore 0.002136 0.161920
vast 0.001879 0.160548
mmad 0.001766 0.160340
scale 0.001467 0.159722
robust 0.001318 0.159651
center 0.001533 0.159545
pareto 0.001662 0.159520
log 0.007496 0.158861
sqrt 0.001616 0.158689
minmax 0.001608 0.157907
log10 0.007505 0.156381
box_cox 0.002202 0.156260
yeo-johnson 0.002068 0.156233
None 0.001738 0.154969


# Summarise all cases in a Taylor plot; the recorded run produced a single
# figure with two axes (titled Train and Test).
experiment.taylor_plot(data=data)
, Train, Test
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

<Figure size 900x700 with 2 Axes>
# Draw the empirical distribution function (EDF) plot for comparing the cases.
experiment.compare_edf_plots(data=data)
Empirical Distribution Function Plot
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
# Draw the regression plots of all cases for visual comparison.
experiment.compare_regression_plots(data=data)
ml transformation
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
# Draw the residual plots of all cases; the recorded run produced one figure
# with 16 axes.
experiment.compare_residual_plots(data=data)
ml transformation
********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

<Figure size 640x480 with 16 Axes>

Total running time of the script: ( 0 minutes 52.032 seconds)

Gallery generated by Sphinx-Gallery