Comparison of XGBRegressor with different transformations

from ai4water.datasets import busan_beach
from ai4water.utils.utils import get_version_info
from ai4water.experiments import TransformationExperiments
from ai4water.hyperopt import Categorical, Integer, Real
from ai4water.utils.utils import dateandtime_now


for k,v in get_version_info().items():
    print(f"{k} version: {v}")

python version: 3.7.9 (default, Oct 19 2020, 15:13:17)
[GCC 7.5.0]
os version: posix
ai4water version: 1.06
lightgbm version: 3.3.5
tcn version: 3.5.0
catboost version: 1.1.1
xgboost version: 1.6.2
easy_mpl version: 0.21.2
SeqMetrics version: 1.3.4
tensorflow version: 2.7.0
keras.api._v2.keras version: 2.7.0
numpy version: 1.21.1
pandas version: 1.3.4
matplotlib version: 3.5.3
h5py version: 3.8.0
joblib version: 1.2.0

data = busan_beach()
input_features = data.columns.tolist()[0:-1]
output_features = data.columns.tolist()[-1:]

class MyTransformationExperiments(TransformationExperiments):

    def update_paras(self, **kwargs):

        y_transformation = kwargs.pop('y_transformation')
        if y_transformation == "none":
            y_transformation = None

        return {
            'model': {"XGBRegressor": kwargs},
            'y_transformation': y_transformation
        }

cases = {
    'model_None': {'y_transformation': 'none'},
    'model_minmax': {'y_transformation': 'minmax'},
    'model_zscore': {'y_transformation': 'zscore'},
    'model_center': {'y_transformation': 'center'},
    'model_scale': {'y_transformation': 'scale'},
    'model_robust': {'y_transformation': 'robust'},
    'model_quantile': {'y_transformation': 'quantile'},
    'model_box_cox': {'y_transformation': {'method': 'box-cox', 'treat_negatives': True, 'replace_zeros': True}},
    'model_yeo-johnson': {'y_transformation': 'yeo-johnson'},
    'model_sqrt': {'y_transformation': 'sqrt'},
    'model_log': {'y_transformation': {'method':'log', 'treat_negatives': True, 'replace_zeros': True}},
    'model_log10': {'y_transformation': {'method':'log10', 'treat_negatives': True, 'replace_zeros': True}},
    "model_pareto": {"y_transformation": "pareto"},
    "model_vast": {"y_transformation": "vast"},
    "model_mmad": {"y_transformation": "mmad"}
         }

num_samples=10
search_space = [
# maximum number of trees that can be built
Integer(low=10, high=30, name='iterations', num_samples=num_samples),
# Used for reducing the gradient step.
Real(low=0.09, high=0.3, prior='log-uniform', name='learning_rate', num_samples=num_samples),
# Coefficient at the L2 regularization term of the cost function.
Real(low=0.5, high=5.0, name='l2_leaf_reg', num_samples=num_samples),
# arger the value, the smaller the model size.
Real(low=0.1, high=10, name='model_size_reg', num_samples=num_samples),
# percentage of features to use at each split selection, when features are selected over again at random.
Real(low=0.1, high=0.5, name='rsm', num_samples=num_samples),
# number of splits for numerical features
Integer(low=32, high=50, name='border_count', num_samples=num_samples),
# The quantization mode for numerical features.  The quantization mode for numerical features.
Categorical(categories=['Median', 'Uniform', 'UniformAndQuantiles',
                        'MaxLogSum', 'MinEntropy', 'GreedyLogSum'], name='feature_border_type')
]

x0 = [10, 0.11, 1.0, 1.0, 0.2, 45, "Uniform"]

experiment = MyTransformationExperiments(
    cases=cases,
    input_features=input_features,
    output_features = output_features,
    param_space=search_space,
    x0=x0,
    verbosity=0,
    split_random=True,
    exp_name = f"xgb_y_exp_{dateandtime_now()}",
    save=False
)

experiment.fit(data = data, run_type='dry_run')

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
running  None model
[02:19:51] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  minmax model
[02:19:54] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  zscore model
[02:19:56] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  center model
[02:19:59] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  scale model
[02:20:01] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  robust model
[02:20:04] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  quantile model
[02:20:06] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  box_cox model
[02:20:09] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  yeo-johnson model
[02:20:12] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  sqrt model
[02:20:14] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  log model
[02:20:17] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  log10 model
[02:20:19] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


running  pareto model
[02:20:22] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  vast model
[02:20:25] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
running  mmad model
[02:20:27] WARNING: ../src/learner.cc:627:
Parameters: { "border_count", "feature_border_type", "iterations", "l2_leaf_reg", "model_size_reg", "rsm" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)
/home/docs/checkouts/readthedocs.org/user_builds/ai4water-experiments/envs/latest/lib/python3.7/site-packages/scipy/stats/stats.py:283: RuntimeWarning: invalid value encountered in log
  log_a = np.log(a)

experiment.compare_errors('rmse', data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

	train	test
zscore	76989.844136	1.829495e+07
vast	67745.092116	1.813990e+07
mmad	63637.688822	1.811634e+07
scale	52865.432007	1.804655e+07
robust	47502.997190	1.803859e+07
center	55266.694605	1.802654e+07
pareto	59914.405664	1.802373e+07
log	270181.799448	1.794930e+07
sqrt	58261.276136	1.792989e+07
minmax	57963.528260	1.784150e+07
log10	270496.782920	1.766912e+07
box_cox	79374.647787	1.765535e+07
yeo-johnson	74527.688244	1.765232e+07
None	62660.309653	1.750955e+07
quantile	391137.069891	9.166060e+06

experiment.compare_errors('r2', data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

	train	test
quantile	0.997146	0.538682
box_cox	0.999939	0.292986
yeo-johnson	0.999916	0.291664
log10	0.999094	0.255802
log	0.998880	0.218938
None	0.999877	0.151187
sqrt	0.999942	0.076603
minmax	0.999893	0.068524
pareto	0.999892	0.054656
center	0.999911	0.054537
robust	0.999934	0.052759
scale	0.999918	0.052052
mmad	0.999875	0.047492
vast	0.999856	0.042825
zscore	0.999805	0.030284

experiment.compare_errors('r2_score', data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

	train	test
quantile	0.993170	0.064507
None	0.999825	0.064368
yeo-johnson	0.999752	0.049048
box_cox	0.999719	0.048722
log10	0.996734	0.047237
minmax	0.999850	0.028556
sqrt	0.999848	0.018906
log	0.996741	0.016781
pareto	0.999840	0.008610
center	0.999864	0.008301
robust	0.999899	0.006975
scale	0.999875	0.006098
mmad	0.999819	-0.001604
vast	0.999795	-0.004210
zscore	0.999735	-0.021450

experiment.compare_errors('nrmse', data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

	train	test
quantile	0.010852	0.254300
zscore	0.002136	0.161920
vast	0.001879	0.160548
mmad	0.001766	0.160340
scale	0.001467	0.159722
robust	0.001318	0.159651
center	0.001533	0.159545
pareto	0.001662	0.159520
log	0.007496	0.158861
sqrt	0.001616	0.158689
minmax	0.001608	0.157907
log10	0.007505	0.156381
box_cox	0.002202	0.156260
yeo-johnson	0.002068	0.156233
None	0.001738	0.154969

experiment.taylor_plot(data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

<Figure size 900x700 with 2 Axes>

experiment.compare_edf_plots(data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

experiment.compare_regression_plots(data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

experiment.compare_residual_plots(data=data)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)

<Figure size 640x480 with 16 Axes>

Total running time of the script: ( 0 minutes 52.032 seconds)

Gallery generated by Sphinx-Gallery