import time
import os
from pathlib import Path
from pprint import pprint
import json
import math
from itertools import combinations

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter, FixedLocator

import plotly.express as px
import plotly.io as pio

import seaborn as sns
from IPython.display import display
import PIL

# from tqdm.notebook import tqdm
from tqdm import tqdm

from category_encoders import TargetEncoder

import statsmodels.api as sm

from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    FunctionTransformer,
    QuantileTransformer,
    PowerTransformer,
    scale,
)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_squared_log_error, make_scorer
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_selector,
    make_column_transformer,
)

from sklearn.linear_model import (
    Ridge,
    RidgeCV,
    LassoCV,
    ElasticNet,
    ElasticNetCV,
)

from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

import missingno as msno

import umap.plot

from pca import pca as pca_obj

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import scipy

import optuna

from pipeline_pandas_utils import (
    make_mi_scores,
    plot_mi_scores,
    plot_variance,
    SimpleImputerKeepCategories,
)

pd.options.mode.copy_on_write = True

# Set Matplotlib defaults.
# Matplotlib 3.6 deprecated its bundled "seaborn-<style>" style names in favour
# of "seaborn-v0_8-<style>". Prefer the new name when this installation has
# it, fall back to the legacy name on older versions, and to the default
# style if neither is available.
_whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") if name in plt.style.available),
    "default",
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", titleweight="bold")

pio.templates.default = "plotly"

import multiprocess

cpu_count = multiprocess.cpu_count()
# The code is written to run folds in parallel, so with lots of CPUs use one
# fold per CPU (more folds = hopefully better estimates); never go below 8.
num_folds = max(cpu_count, 8)

/home/florin/.local/lib/python3.11/site-packages/umap/distances.py:1063: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @numba.jit()
/home/florin/.local/lib/python3.11/site-packages/umap/distances.py:1071: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @numba.jit()
/home/florin/.local/lib/python3.11/site-packages/umap/distances.py:1086: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @numba.jit()
/home/florin/.local/lib/python3.11/site-packages/umap/umap_.py:660: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @numba.jit()
2023-08-16 11:12:58.081082: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-16 11:12:58.503221: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
/home/florin/.local/lib/python3.11/site-packages/umap/plot.py:203: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  @numba.jit()
/tmp/ipykernel_4896/130485644.py:94: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
  plt.style.use("seaborn-whitegrid")

# Load both raw splits, indexed by the "Id" column
data_dir = Path("../input/house-prices-advanced-regression-techniques/")
df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")

# The test split gets an all-NaN SalePrice column
df_test = pd.read_csv(data_dir / "test.csv", index_col="Id")
df_test['SalePrice'] = np.nan

# Stack train on top of test
df = pd.concat([df_train, df_test])
df

# The 40 columns with the most missing values, largest count first
na_values = df.isna().sum().sort_values(ascending=False).head(40).to_frame()
na_values

def clean(df_orig):
    """
    Return a cleaned deep copy of df_orig; the input is left untouched.

    - respells a few category labels in Exterior2nd, BldgType, Neighborhood
      and MSZoning
    - replaces GarageYrBlt with YearRemodAdd wherever GarageYrBlt is not
      <= 2010 (this also covers missing values, which fail the comparison)
    - renames the columns whose names begin with a digit
    """
    cleaned = df_orig.copy(deep=True)

    # column -> {old label: new label}
    label_fixes = {
        "Exterior2nd": {"Brk Cmn": "BrkComm", "CmentBd": "CemntBd", "Wd Shng": "WdShing"},
        "BldgType": {"2fmCon": "2FmCon", "Duplex": "Duplx", 'Twnhs': 'TwnhsI'},
        "Neighborhood": {"NAmes": "Names"},
        'MSZoning': {'C (all)': 'C'},
    }
    for column, mapping in label_fixes.items():
        cleaned[column] = cleaned[column].replace(mapping)

    # Some values of GarageYrBlt are corrupt: keep only years up to 2010,
    # take the remodel year everywhere else
    plausible_year = cleaned["GarageYrBlt"] <= 2010
    cleaned["GarageYrBlt"] = cleaned["GarageYrBlt"].where(plausible_year, cleaned['YearRemodAdd'])

    # Names beginning with numbers are awkward to work with
    digit_led_names = {
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "ThreeSeasonPorch",
    }
    return cleaned.rename(columns=digit_led_names)

def cat_to_num(X):
    """
    Return a copy of X in which every 'category' column is replaced by its
    integer category codes; all other columns are left as they are.
    """
    converted = X.copy()
    categorical_columns = [name for name in converted.columns.to_list() if converted[name].dtype.name == 'category']
    for name in categorical_columns:
        converted[name] = converted[name].cat.codes
    return converted


def encode(df_orig, features_numeric, features_ordinal, features_nominative):
    """
    Return a deep copy of df_orig with every listed feature cast to its dtype.

    features_nominative: column names cast to unordered 'category'
    features_numeric: column names cast to float
    features_ordinal: dict of column name -> ordered list of levels

    The three groups are applied in that order, so a column listed in more
    than one group ends up with the dtype of the last group that names it.
    """
    encoded = df_orig.copy(deep=True)

    # Nominal categories
    for column in features_nominative:
        as_category = encoded[column].astype("category")
        if "NA" not in as_category.cat.categories:
            # Add a "NA" category for missing values and make it the first
            # element of the category list, as it already is for the
            # ordered levels
            existing = as_category.cat.categories.to_list()
            as_category = as_category.cat.add_categories("NA")
            as_category = as_category.cat.reorder_categories(["NA"] + existing)
        encoded[column] = as_category

    # Numeric features
    encoded = encoded.astype({column: float for column in features_numeric})

    # Ordinal categories
    for column, levels in features_ordinal.items():
        ordered_dtype = CategoricalDtype(levels, ordered=True)
        encoded[column] = encoded[column].astype(ordered_dtype)

    return encoded

def encode_time(df_orig):
    """
    Return a copy of df_orig where YrSold and MoSold are replaced by a single
    integer column YrMoSold: the first day of the sale month, parsed as a
    datetime and cast to int.
    """
    # year alone, or month alone, don't do much
    # year + month together have more value
    result = df_orig.copy()
    year_text = result['YrSold'].astype(int).astype(str)
    month_text = result['MoSold'].astype(int).astype(str)
    first_of_month = pd.to_datetime(year_text + '-' + month_text + '-01')
    result['YrMoSold'] = first_of_month.astype(int)
    return result.drop(columns=['YrSold', 'MoSold'])

def load_data():
    """
    Read the train and test CSV files, preprocess them together, and split
    them back apart.

    The two splits are concatenated, passed through clean(), encode() and
    encode_time(), and then separated again by their original index labels.

    Returns a tuple:
    - df_train, df_test: the preprocessed splits
    - features_numeric: list of numeric column names (with 'YrMoSold'
      replacing 'MoSold' and 'YrSold')
    - features_ordinal: dict of column name -> ordered levels, each list
      starting with "NA"
    - features_nominative: list of unordered categorical column names
    """
    # Read data
    data_dir = Path("../input/house-prices-advanced-regression-techniques/")
    df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
    df_test = pd.read_csv(data_dir / "test.csv", index_col="Id")
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test])

    # The nominative (unordered) categorical features
    # NOTE(review): "CentralAir" is listed here and in features_ordinal below;
    # encode() applies the ordinal cast last, so the column ends up ordered.
    # Confirm whether it should be removed from this list.
    features_nominative = [
        "MSSubClass",
        "MSZoning",
        "Alley",
        "LotShape",
        "LotConfig",
        "LandSlope",
        "Neighborhood",
        "Condition1",
        "Condition2",
        "BldgType",
        "HouseStyle",
        "RoofStyle",
        "RoofMatl",
        "Exterior1st",
        "Exterior2nd",
        "MasVnrType",
        "Foundation",
        "BsmtFinType1",
        "BsmtFinType2",
        "Heating",
        "CentralAir",
        # "BsmtFullBath",
        # "BsmtHalfBath",
        # "BedroomAbvGr",
        # "KitchenAbvGr",
        "GarageType",
        "MiscFeature",
        "SaleType",
        "SaleCondition",
        # "Fence",
    ]

    # The numeric features, cast to float by encode().
    # FirstFlrSF, SecondFlrSF and ThreeSeasonPorch are the names produced by
    # the renaming in clean().
    features_numeric = [
        "LotFrontage",
        "LotArea",
        "YearBuilt",
        "YearRemodAdd",
        "MasVnrArea",
        "BsmtFinSF1",
        "BsmtFinSF2",
        "BsmtUnfSF",
        "TotalBsmtSF",
        "FirstFlrSF",
        "SecondFlrSF",
        "LowQualFinSF",
        "GrLivArea",
        "BsmtFullBath",
        "BsmtHalfBath",
        "FullBath",
        "HalfBath",
        "BedroomAbvGr",
        "KitchenAbvGr",
        "TotRmsAbvGrd",
        "Fireplaces",
        "GarageYrBlt",
        "GarageCars",
        "GarageArea",
        "WoodDeckSF",
        "OpenPorchSF",
        "EnclosedPorch",
        "ThreeSeasonPorch",
        "ScreenPorch",
        "PoolArea",
        "MiscVal",
        "MoSold",
        "YrSold",
    ]

    # The ordinal (ordered) categorical features

    # Pandas calls the categories "levels"
    five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
    # NOTE(review): despite the name this holds eleven integer levels, 0 to 10
    ten_levels = list(range(11))

    # Each list is ordered from the lowest level to the highest
    features_ordinal = {
        "OverallQual": ten_levels,
        "OverallCond": ten_levels,
        "ExterQual": five_levels,
        "ExterCond": five_levels,
        "BsmtQual": five_levels,
        "BsmtCond": five_levels,
        "HeatingQC": five_levels,
        "KitchenQual": five_levels,
        "FireplaceQu": five_levels,
        "GarageQual": five_levels,
        "GarageCond": five_levels,
        "PoolQC": five_levels,
        "Street": ["Grvl", "Pave"],
        "LandContour": ["Bnk", "Lvl", "Low", "HLS"],
        "BsmtExposure": ["No", "Mn", "Av", "Gd"],
        # "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        # "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "Functional": ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
        "GarageFinish": ["Unf", "RFn", "Fin"],
        "PavedDrive": ["N", "P", "Y"],
        "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
        "CentralAir": ["N", "Y"],
        "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
        "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
    }

    # Add a "NA" level for missing values, as the lowest level of every feature
    features_ordinal = {key: ["NA"] + value for key, value in features_ordinal.items()}

    # Preprocessing
    df = clean(df)
    df = encode(df, features_numeric, features_ordinal, features_nominative)

    # encode_time() drops MoSold and YrSold and adds YrMoSold, so the numeric
    # feature list is updated to match
    df = encode_time(df)
    features_numeric.remove('MoSold')
    features_numeric.remove('YrSold')
    features_numeric = features_numeric + ['YrMoSold']

    # Reform splits
    # NOTE(review): selecting by index label assumes the "Id" values of train
    # and test do not overlap - confirm against the data
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]
    return df_train, df_test, features_numeric, features_ordinal, features_nominative


df_train, df_test, features_numeric, features_ordinal, features_nominative = load_data()

# Peek at the values
display(df_train)
# display(df_test)

# Display information about dtypes and missing values.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the output.
df_train.info()
# df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 79 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MSSubClass        1460 non-null   category
 1   MSZoning          1460 non-null   category
 2   LotFrontage       1201 non-null   float64 
 3   LotArea           1460 non-null   float64 
 4   Street            1460 non-null   category
 5   Alley             91 non-null     category
 6   LotShape          1460 non-null   category
 7   LandContour       1460 non-null   category
 8   Utilities         1460 non-null   category
 9   LotConfig         1460 non-null   category
 10  LandSlope         1460 non-null   category
 11  Neighborhood      1460 non-null   category
 12  Condition1        1460 non-null   category
 13  Condition2        1460 non-null   category
 14  BldgType          1460 non-null   category
 15  HouseStyle        1460 non-null   category
 16  OverallQual       1460 non-null   category
 17  OverallCond       1460 non-null   category
 18  YearBuilt         1460 non-null   float64 
 19  YearRemodAdd      1460 non-null   float64 
 20  RoofStyle         1460 non-null   category
 21  RoofMatl          1460 non-null   category
 22  Exterior1st       1460 non-null   category
 23  Exterior2nd       1460 non-null   category
 24  MasVnrType        588 non-null    category
 25  MasVnrArea        1452 non-null   float64 
 26  ExterQual         1460 non-null   category
 27  ExterCond         1460 non-null   category
 28  Foundation        1460 non-null   category
 29  BsmtQual          1423 non-null   category
 30  BsmtCond          1423 non-null   category
 31  BsmtExposure      1422 non-null   category
 32  BsmtFinType1      1423 non-null   category
 33  BsmtFinSF1        1460 non-null   float64 
 34  BsmtFinType2      1422 non-null   category
 35  BsmtFinSF2        1460 non-null   float64 
 36  BsmtUnfSF         1460 non-null   float64 
 37  TotalBsmtSF       1460 non-null   float64 
 38  Heating           1460 non-null   category
 39  HeatingQC         1460 non-null   category
 40  CentralAir        1460 non-null   category
 41  Electrical        1459 non-null   category
 42  FirstFlrSF        1460 non-null   float64 
 43  SecondFlrSF       1460 non-null   float64 
 44  LowQualFinSF      1460 non-null   float64 
 45  GrLivArea         1460 non-null   float64 
 46  BsmtFullBath      1460 non-null   float64 
 47  BsmtHalfBath      1460 non-null   float64 
 48  FullBath          1460 non-null   float64 
 49  HalfBath          1460 non-null   float64 
 50  BedroomAbvGr      1460 non-null   float64 
 51  KitchenAbvGr      1460 non-null   float64 
 52  KitchenQual       1460 non-null   category
 53  TotRmsAbvGrd      1460 non-null   float64 
 54  Functional        1460 non-null   category
 55  Fireplaces        1460 non-null   float64 
 56  FireplaceQu       770 non-null    category
 57  GarageType        1379 non-null   category
 58  GarageYrBlt       1460 non-null   float64 
 59  GarageFinish      1379 non-null   category
 60  GarageCars        1460 non-null   float64 
 61  GarageArea        1460 non-null   float64 
 62  GarageQual        1379 non-null   category
 63  GarageCond        1379 non-null   category
 64  PavedDrive        1460 non-null   category
 65  WoodDeckSF        1460 non-null   float64 
 66  OpenPorchSF       1460 non-null   float64 
 67  EnclosedPorch     1460 non-null   float64 
 68  ThreeSeasonPorch  1460 non-null   float64 
 69  ScreenPorch       1460 non-null   float64 
 70  PoolArea          1460 non-null   float64 
 71  PoolQC            7 non-null      category
 72  Fence             281 non-null    category
 73  MiscFeature       54 non-null     category
 74  MiscVal           1460 non-null   float64 
 75  SaleType          1460 non-null   category
 76  SaleCondition     1460 non-null   category
 77  SalePrice         1460 non-null   float64 
 78  YrMoSold          1460 non-null   int64   
dtypes: category(46), float64(32), int64(1)
memory usage: 467.5 KB

None

# Reload the preprocessed splits and the feature lists
df_train, df_test, features_numeric, features_ordinal, features_nominative = load_data()
# Training features and target: pop() removes SalePrice from X and returns it
X = df_train.copy()
y = X.pop("SalePrice")

# Same split for the test data
X_test = df_test.copy()
# should be NaN
y_test = X_test.pop('SalePrice')

# Histograms of all numeric training features, one subplot per feature
_ = X[features_numeric].hist(figsize=(12, 12))

# Bar charts of the level counts of every ordinal feature, on a near-square
# grid of subplots filled row by row.
ordinal_names = list(features_ordinal.keys())
n_features_ordinal = len(ordinal_names)
n_col_f_ord = math.ceil(np.sqrt(n_features_ordinal))
n_row_f_ord = math.ceil(n_features_ordinal / n_col_f_ord)
# squeeze=False keeps `ax` two-dimensional even with a single row or column
fig, ax = plt.subplots(nrows=n_row_f_ord, ncols=n_col_f_ord, figsize=(12, 12), squeeze=False)
for f_index, f_name in enumerate(ordinal_names):
    # Map the flat feature index to (row, column). The previous nested loops
    # indexed rows with a column-range counter, which only worked when the
    # grid happened to be square.
    i, j = divmod(f_index, n_col_f_ord)
    _ = X[f_name].value_counts(sort=False, dropna=False).plot.bar(ax=ax[i, j])
plt.show()

# Bar charts of the category counts of every nominative feature, on a
# near-square grid of subplots filled row by row.
nominative_names = list(features_nominative)
n_features_nominative = len(nominative_names)
n_col_f_nom = math.ceil(np.sqrt(n_features_nominative))
n_row_f_nom = math.ceil(n_features_nominative / n_col_f_nom)
# squeeze=False keeps `ax` two-dimensional even with a single row or column
fig, ax = plt.subplots(nrows=n_row_f_nom, ncols=n_col_f_nom, figsize=(12, 12), squeeze=False)
for f_index, f_name in enumerate(nominative_names):
    # Map the flat feature index to (row, column). The previous nested loops
    # indexed rows with a column-range counter, which only worked when the
    # grid happened to be square.
    i, j = divmod(f_index, n_col_f_nom)
    _ = X[f_name].value_counts(sort=False, dropna=False).plot.bar(ax=ax[i, j])
plt.show()

X.isna().mean().sort_values(ascending=False).head(20)

PoolQC          0.995205
MiscFeature     0.963014
Alley           0.937671
Fence           0.807534
MasVnrType      0.597260
FireplaceQu     0.472603
LotFrontage     0.177397
GarageFinish    0.055479
GarageType      0.055479
GarageCond      0.055479
GarageQual      0.055479
BsmtExposure    0.026027
BsmtFinType2    0.026027
BsmtCond        0.025342
BsmtFinType1    0.025342
BsmtQual        0.025342
MasVnrArea      0.005479
Electrical      0.000685
MSSubClass      0.000000
TotRmsAbvGrd    0.000000
dtype: float64

# Distribution of NaN: missingno matrix plot, restricted to the columns of X
# that contain at least one missing value

features_with_nan = X.columns[X.isna().any()].to_list()
_ = msno.matrix(X[features_with_nan], labels=True, figsize=(7, 5), fontsize=9, sparkline=False)

# Correlation of NaN: missingno heatmap over the same columns

_ = msno.heatmap(X[features_with_nan], figsize=(6, 6), fontsize=9, cbar=False)

# Pearson assumes normal distributions, use Kendall instead
corr_method = 'kendall'

# Transformer that swaps categorical columns for their integer codes
decode_categories = FunctionTransformer(func=cat_to_num)
decode_categories.set_output(transform='pandas')

# Decode the categories, fill the remaining missing values with -1,
# then correlate every pair of columns
X_codes = decode_categories.fit_transform(X).fillna(-1)
corr_matrix = X_codes.corr(method=corr_method, numeric_only=False)

fig, ax = plt.subplots(1, 1, figsize=(18, 18))
ax.set_title(f'pairwise correlation: {corr_method}')
ax = sns.heatmap(corr_matrix, cmap='icefire', cbar=True, square=True)

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

def mutual_information_wrapper(arr1, arr2):
    """
    Mutual information between two 1-D arrays, usable as the `method`
    callable of DataFrame.corr().

    arr1: 1-D ndarray, reshaped into a single-feature matrix
    arr2: 1-D ndarray, used as the regression target

    Returns a plain float. mutual_info_regression returns one score per
    feature (an array of length 1 here), while DataFrame.corr expects the
    callable to return a scalar, so the single score is unwrapped.
    """
    scores = mutual_info_regression(arr1.reshape(-1, 1), arr2, random_state=0)
    return float(scores[0])


# Pairwise mutual information of all features, drawn as a heatmap
fig, ax = plt.subplots(1, 1, figsize=(18, 18))
corr_method = mutual_information_wrapper
# corr_method is a function here: interpolating it directly put
# "<function ... at 0x...>" in the title, so show its name instead
ax.set_title(f'pairwise correlation: {corr_method.__name__}')
ax = sns.heatmap(
    decode_categories.fit_transform(X).fillna(-1).corr(method=corr_method, numeric_only=False),
    cbar=True,
    square=True,
)

def corrplot(df, method="kendall", annot=True, **kwargs):
    """
    Draw a clustered heatmap of the pairwise correlations of df's columns.

    df: dataframe; 'category' columns are replaced by their integer codes
        on a copy, the input is not modified
    method: correlation method passed to DataFrame.corr
    annot: passed to sns.clustermap
    kwargs: any further sns.clustermap arguments
    """
    numeric_view = df.copy()
    categorical_columns = numeric_view.select_dtypes(["category"]).columns
    for column in categorical_columns:
        numeric_view[column] = numeric_view[column].cat.codes
    correlations = numeric_view.corr(method)
    sns.clustermap(
        correlations,
        vmin=-1.0,
        vmax=1.0,
        cmap="icefire",
        method="complete",
        annot=annot,
        **kwargs,
    )


corrplot(X[features_numeric], annot=None)

# Target distribution: Q-Q plot on the left, histogram with a fitted normal
# curve on the right
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
sm.qqplot(y, line='s', ax=ax[0])
# Copy-on-write is switched off only around the distplot call and restored
# right after. NOTE(review): presumably distplot fails with copy-on-write
# enabled - confirm before removing the toggle.
pd.options.mode.copy_on_write = False
# NOTE(review): seaborn reports distplot as deprecated (to be removed in
# v0.14.0) and suggests displot or histplot instead
sns.distplot(y, fit=scipy.stats.norm, ax=ax[1])
pd.options.mode.copy_on_write = True
fig.suptitle('target distribution')
# NOTE(review): with the inline (non-GUI) backend this call only emits a
# warning that the figure cannot be shown
fig.show()

/tmp/ipykernel_35469/1812137861.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(y, fit=scipy.stats.norm, ax=ax[1])
/tmp/ipykernel_35469/1812137861.py:7: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

# Compare the category lists of every categorical column in train and test,
# to detect categories in test missing from train, if any
for c in df_train.select_dtypes(include='category').columns:
    if df_train[c].cat.categories.equals(df_test[c].cat.categories):
        continue
    print(f'different train/test categorical index: {c}')

class SKPipeDataViewer(BaseEstimator, TransformerMixin):
    """
    Print out the X dataframe within the pipeline.

    A pass-through debugging step: transform() shows X and returns it
    unchanged.

    show_dtypes: if truthy, also pretty-print the column dtypes
    show_na: if truthy, also print the number of missing values per column,
        largest first
    head_only: if truthy, show only X.head() and the top of the NA counts;
        otherwise show everything
    kwargs: extra values, stored as attributes of the instance
    """

    def __init__(self, show_dtypes=False, show_na=False, head_only=True, **kwargs):
        # The sklearn base classes accept no constructor arguments, so
        # forwarding kwargs to them raised TypeError whenever any were given.
        # The extras are only stored as attributes below.
        super().__init__()
        self.show_dtypes = show_dtypes
        self.show_na = show_na
        self.head_only = head_only
        for k, v in kwargs.items():
            setattr(self, k, v)

    def transform(self, X):
        """Display X (and optionally NA counts and dtypes); return X unchanged."""
        # Remember the dtypes of the last dataframe seen
        self.dtypes = X.dtypes.to_dict()
        display(X.head() if self.head_only else X)
        if self.show_na:
            na_counts = X.isna().sum().sort_values(ascending=False)
            print(na_counts.head() if self.head_only else na_counts)
        if self.show_dtypes:
            pprint(self.dtypes, indent=2)
        return X

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; returns self."""
        return self

    def set_output(self, transform=None):
        """
        Accept a set_output() request without changing anything: transform()
        always returns X as received. Returns self, like sklearn estimators do.
        """
        return self


def make_impute_standardize_num_ohe_cat(ohe_drop=None):
    """
    Build a ColumnTransformer with two branches:
    - numeric columns: mean imputation followed by standardization
    - all other columns: one-hot encoding

    ohe_drop: passed to OneHotEncoder as `drop`;
        set ohe_drop='first' for plain linear models, if any.

    Returns the transformer, configured for pandas output.
    """
    numeric_branch = Pipeline(
        [
            ('simple_imputer', SimpleImputer(strategy='mean')),
            ('standard_scaler', StandardScaler()),
        ]
    )
    categorical_branch = OneHotEncoder(
        sparse_output=False,
        dtype=int,
        handle_unknown='infrequent_if_exist',
        min_frequency=1,
        drop=ohe_drop,
    )

    numeric_columns = make_column_selector(dtype_include='number')
    other_columns = make_column_selector(dtype_exclude='number')

    column_transformer = ColumnTransformer(
        [
            ('impute_standardize_numeric', numeric_branch, numeric_columns),
            ('one_hot_encode_categorical', categorical_branch, other_columns),
        ],
        remainder='drop',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    column_transformer.set_output(transform='pandas')
    return column_transformer


def make_impute_num_ohe_cat_standardize(ohe_drop=None):
    """
    Build a two-step Pipeline:
    1. a ColumnTransformer that mean-imputes the numeric columns and
       one-hot encodes all other columns
    2. a StandardScaler applied to the whole output of step 1

    ohe_drop: passed to OneHotEncoder as `drop`;
        set ohe_drop='first' for plain linear models, if any.

    Returns the pipeline, configured for pandas output.
    """
    numeric_columns = make_column_selector(dtype_include='number')
    other_columns = make_column_selector(dtype_exclude='number')

    one_hot_encoder = OneHotEncoder(
        sparse_output=False,
        dtype=int,
        handle_unknown='infrequent_if_exist',
        min_frequency=1,
        drop=ohe_drop,
    )
    column_transformer = ColumnTransformer(
        [
            ('impute_numeric', SimpleImputer(strategy='mean'), numeric_columns),
            ('one_hot_encode_categorical', one_hot_encoder, other_columns),
        ],
        remainder='drop',
        n_jobs=1,
        verbose_feature_names_out=False,
    )

    standardized = Pipeline(
        [
            ('impute_numeric_OHE_categorical', column_transformer),
            ('standard_scaler', StandardScaler()),
        ]
    )
    standardized.set_output(transform='pandas')
    return standardized


def rebuild_xgboost(pipe_preproc, model_args=None):
    """
    Build the XGBoost pipeline: preprocessing, categorical columns replaced
    by their integer codes, then XGBRegressor.

    pipe_preproc: the preprocessing pipeline, used as the first step
    model_args: keyword arguments for XGBRegressor; when None, uses
        {'random_state': 0, 'verbosity': 0, 'n_jobs': 1}

    Returns the pipeline, configured for pandas output.
    """
    if model_args is None:
        # Built per call rather than sharing one mutable dict as the default
        model_args = {'random_state': 0, 'verbosity': 0, 'n_jobs': 1}
    # xgboost performs worse with undecoded categories
    steps = [
        ('preprocessing', pipe_preproc),
        ('categorical_to_numeric', FunctionTransformer(func=cat_to_num)),
        ('xgboost', XGBRegressor(**model_args)),
    ]
    pipe = Pipeline(steps=steps, verbose=False)
    pipe.set_output(transform='pandas')
    return pipe


def rebuild_lgbm(pipe_preproc, model_args=None):
    """
    Build the LightGBM pipeline: preprocessing, then LGBMRegressor.

    pipe_preproc: the preprocessing pipeline, used as the first step
    model_args: keyword arguments for LGBMRegressor; when None, uses
        {'random_state': 0, 'n_jobs': 1, 'verbose': -1}

    Returns the pipeline, configured for pandas output.
    """
    if model_args is None:
        # Built per call rather than sharing one mutable dict as the default
        model_args = {'random_state': 0, 'n_jobs': 1, 'verbose': -1}
    # lgbm can take categorical features, NaN-riddled features, etc.
    steps = [
        ('preprocessing', pipe_preproc),
        ('lgbm', LGBMRegressor(**model_args)),
    ]
    pipe = Pipeline(steps=steps, verbose=False)
    pipe.set_output(transform='pandas')
    return pipe


def rebuild_catboost(
    pipe_preproc,
    model_args=None,
):
    """
    Build the CatBoost pipeline: preprocessing, categorical columns replaced
    by their integer codes, then CatBoostRegressor.

    pipe_preproc: the preprocessing pipeline, used as the first step
    model_args: keyword arguments for CatBoostRegressor; when None, uses
        {'random_state': 0, 'logging_level': 'Silent', 'thread_count': 1}

    Returns the pipeline, configured for pandas output.
    """
    if model_args is None:
        # Built per call rather than sharing one mutable dict as the default
        model_args = {'random_state': 0, 'logging_level': 'Silent', 'thread_count': 1}
    steps = [
        ('preprocessing', pipe_preproc),
        ('categorical_to_numeric', FunctionTransformer(func=cat_to_num)),
        ('catboost', CatBoostRegressor(**model_args)),
    ]
    pipe = Pipeline(steps=steps, verbose=False)
    pipe.set_output(transform='pandas')
    return pipe


def rebuild_ridge(pipe_preproc, model_args=None):
    """
    Build the Ridge pipeline: preprocessing, imputation + standardization of
    numeric columns with one-hot encoding of the rest, then a Ridge regressor
    fitted on a quantile-transformed target.

    pipe_preproc: the preprocessing pipeline, used as the first step
    model_args: keyword arguments for Ridge; when None, uses
        {'random_state': 0}

    Returns the pipeline, configured for pandas output.
    """
    if model_args is None:
        # Built per call rather than sharing one mutable dict as the default
        model_args = {'random_state': 0}
    steps = [
        ('preprocessing', pipe_preproc),
        (
            'impute_standardize_numeric_OHE_categorical',
            make_impute_standardize_num_ohe_cat(),
        ),
        # Ridge benefits from a transformed target
        (
            'ridge',
            TransformedTargetRegressor(
                regressor=Ridge(**model_args),
                transformer=QuantileTransformer(output_distribution='normal', random_state=0),
            ),
        ),
    ]
    pipe = Pipeline(steps=steps, verbose=False)
    pipe.set_output(transform='pandas')
    return pipe


def rebuild_enet(pipe_preproc, model_args=None):
    """
    Build the ElasticNet pipeline: preprocessing, numeric imputation with
    one-hot encoding followed by standardization of everything, then
    ElasticNet on the untransformed target.

    pipe_preproc: the preprocessing pipeline, used as the first step
    model_args: keyword arguments for ElasticNet; when None, uses
        {'random_state': 0, 'max_iter': 10000}

    Returns the pipeline, configured for pandas output.
    """
    if model_args is None:
        # Built per call rather than sharing one mutable dict as the default
        model_args = {'random_state': 0, 'max_iter': 10000}
    steps = [
        ('preprocessing', pipe_preproc),
        (
            'impute_numeric_OHE_categorical_standardize',
            make_impute_num_ohe_cat_standardize(),
        ),
        # ElasticNet performs worse with a transformed target
        ('enet', ElasticNet(**model_args)),
    ]
    pipe = Pipeline(steps=steps, verbose=False)
    pipe.set_output(transform='pandas')
    return pipe


def rebuild_all_pipelines(steps_preproc):
    """
    Return two objects:
    - the base (preproc) pipeline with steps from the argument
    - a dictionary with all the model pipelines, keyed by model name

    Every model pipeline receives the same preproc pipeline object as its
    first step.
    """
    pipe_preproc = Pipeline(steps=steps_preproc, verbose=False)
    pipe_preproc.set_output(transform='pandas')

    # model name -> function that builds the pipeline for that model
    builders = {
        'xgboost': rebuild_xgboost,
        'lgbm': rebuild_lgbm,
        'catboost': rebuild_catboost,
        'ridge': rebuild_ridge,
        'enet': rebuild_enet,
    }
    pipe = {model_name: build(pipe_preproc) for model_name, build in builders.items()}

    return pipe_preproc, pipe

def _root_mean_squared_log_error(y_true, y_pred):
    """Square root of the mean squared log error (single-target RMSLE)."""
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


def test_performance(pipe, X, y, n_splits=num_folds, n_jobs=cpu_count):
    """
    pipe: a pipeline containing a model
    X, y: features and target to cross-validate on
    n_splits: number of (unshuffled) K-fold splits
    n_jobs: number of parallel jobs used by cross_val_score

    Returns model performance: the mean over the folds of the root mean
    squared log error. Any error raised while fitting or scoring a fold is
    propagated.
    """
    cv = KFold(n_splits=n_splits, shuffle=False)
    scores = cross_val_score(
        pipe,
        X,
        y,
        cv=cv,
        verbose=0,
        n_jobs=n_jobs,
        error_score='raise',
        # The root is taken explicitly: the `squared=False` argument of
        # mean_squared_log_error is deprecated in newer scikit-learn releases
        scoring=make_scorer(_root_mean_squared_log_error),
    )
    return np.mean(scores)


def test_all_models(test_name, pipe, X, y, n_splits=num_folds, n_jobs=cpu_count):
    """
    test_name: generic name for the test being run
    pipe: a dictionary with keys for model names, and values for pipelines with models
    X, y, n_splits, n_jobs: passed on to test_performance for every model

    If TEST_PIPELINES=True, all models are tested, performance is saved to disk.
    If TEST_PIPELINES=False, performance is read from disk and displayed (no tests are run).

    The results file is performance_files/performance_<test_name>.txt, one
    line per model: the name padded to a common width, then the mean score.
    """
    perf_dir = 'performance_files'
    os.makedirs(perf_dir, exist_ok=True)
    perf_path = perf_dir + '/performance_' + test_name + '.txt'

    if TEST_PIPELINES:
        # persist results to disk
        name_len_max = max(len(model_name) for model_name in pipe)
        with open(perf_path, 'w') as perf_file:
            for model_name, model_pipe in tqdm(pipe.items()):
                mean_score = test_performance(model_pipe, X, y, n_splits=n_splits, n_jobs=n_jobs)
                padded_name = model_name.ljust(name_len_max, " ")
                print(f'{padded_name} {str(mean_score)}', file=perf_file)
            # tqdm bug
            time.sleep(0.1)

    # read results from disk and display them
    print(test_name)
    with open(perf_path, 'r') as perf_file:
        print(perf_file.read())


def submission_file_name(model_name, test_name):
    """
    Return the path of the submission file for a model and a test,
    'submission_files/submission_<model_name>_<test_name>.csv',
    creating the 'submission_files' directory if it does not exist yet.
    """
    sub_dir = 'submission_files'
    os.makedirs(sub_dir, exist_ok=True)
    return ''.join((sub_dir, '/submission_', model_name, '_', test_name, '.csv'))


def make_submission(model, model_name, X_train, y_train, X_test, test_name):
    """
    Fit model on the training data, predict SalePrice for X_test, and write
    the predictions to the submission CSV for (model_name, test_name).

    The CSV has two columns, 'Id' (the index of X_test) and 'SalePrice'.
    The model is fitted on copies of the data passed in.
    """
    train_features = X_train.copy()
    train_target = y_train.copy()
    test_features = X_test.copy()

    model.fit(train_features, train_target)
    predictions = model.predict(test_features)
    # predictions = np.expm1(predictions)

    submission = pd.DataFrame({'Id': test_features.index, 'SalePrice': predictions})
    sf = submission_file_name(model_name, test_name)
    submission.to_csv(sf, index=False)
    print(f'file created {sf}')


def make_submission_all(pipe, X_train, y_train, X_test, test_name):
    """
    Write one submission file per pipeline in `pipe`.

    pipe: a dictionary with keys for model names, and values for pipelines with models

    Does nothing unless TEST_PIPELINES=True.
    """
    if not TEST_PIPELINES:
        return
    for model_name, model in pipe.items():
        make_submission(model, model_name, X_train, y_train, X_test, test_name)

# Baseline: the preprocessing stage is a single no-op 'passthrough' step.
steps_preproc = [
    ('passthrough', 'passthrough'),
    # ('pdv', SKPipeDataViewer()),
]

# rebuild_all_pipelines() is defined elsewhere; here it yields the preprocessing
# pipeline and a dictionary of per-model pipelines keyed by model name.
pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)

# show every model pipeline that was built
for k in list(pipe.keys()):
    display(pipe[k])

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('passthrough', 'passthrough')])),
                ('categorical_to_numeric',
                 FunctionTransformer(func=<function cat_to_num at 0x7ff036d65bc0>)),
                ('xgboost',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categoric...
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=100,
                              n_jobs=1, num_parallel_tree=None, predictor=None,
                              random_state=0, ...))])

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('passthrough', 'passthrough')])),
                ('categorical_to_numeric',
                 FunctionTransformer(func=<function cat_to_num at 0x7ff036d65bc0>)),
                ('xgboost',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categoric...
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=100,
                              n_jobs=1, num_parallel_tree=None, predictor=None,
                              random_state=0, ...))])

Pipeline(steps=[('passthrough', 'passthrough')])

passthrough

FunctionTransformer(func=<function cat_to_num at 0x7ff036d65bc0>)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=1, num_parallel_tree=None, predictor=None,
             random_state=0, ...)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('passthrough', 'passthrough')])),
                ('lgbm', LGBMRegressor(n_jobs=1, random_state=0, verbose=-1))])

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('passthrough', 'passthrough')])),
                ('lgbm', LGBMRegressor(n_jobs=1, random_state=0, verbose=-1))])

Pipeline(steps=[('passthrough', 'passthrough')])

# whether to actually re-run all performance tests
# if False, simply display most recent results
# (also gates make_submission_all(): no submission files are written when False)
TEST_PIPELINES = False

# module-level list of features to drop; empty for the baseline
features_to_drop = []

# baseline: no preprocessing beyond a no-op 'passthrough' step
steps_preproc = [
    ('passthrough', 'passthrough'),
    # ('pdv', SKPipeDataViewer()),
]

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# results are stored under the test name 'baseline'
test_all_models('baseline', pipe, X, y)

baseline
xgboost  0.1362956720421577
lgbm     0.1285616308772774
catboost 0.11849045145088107
ridge    0.1350399777015928
enet     0.1369316049020572

# write one submission file per baseline model (only when TEST_PIPELINES is True)
make_submission_all(pipe, X, y, X_test, 'baseline')

# voting ensemble over all baseline model pipelines
# n_jobs=1 on the ensemble itself; parallelism is requested via test_all_models() below
vr = VotingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    n_jobs=1,
)

# score the ensemble on its own, using the same harness as the individual models
test_all_models(
    'VotingRegressor_baseline',
    {'VotingRegressor': vr},
    X,
    y,
    n_splits=num_folds,
    n_jobs=cpu_count,
)

VotingRegressor_baseline
VotingRegressor 0.11698648818353263

# write the submission file for the voting ensemble (only when TEST_PIPELINES is True)
make_submission_all({'VotingRegressor': vr}, X, y, X_test, 'VotingRegressor_baseline')

# Stacking ensembles: every variant below stacks the same base pipelines from `pipe`
# and differs only in its final estimator. All of them use an unshuffled KFold with
# num_folds splits and n_jobs=1 on the stacker; parallelism is requested through the
# n_jobs argument of test_all_models() at the end of this cell.

# final estimator: standardize, then RidgeCV
sr_fe_pipe_ridgecv = Pipeline(
    [
        ('standard_scaler', StandardScaler()),
        ('ridgecv', RidgeCV()),
    ]
)

sr_ridge = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_ridgecv,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=1,
)

# final estimator: standardize, then LassoCV
sr_fe_pipe_lassocv = Pipeline(
    [
        ('standard_scaler', StandardScaler()),
        ('lassocv', LassoCV(n_jobs=1, random_state=0)),
    ]
)

sr_lasso = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_lassocv,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=1,
)

# final estimator: ElasticNetCV without scaling
sr_fe_pipe_enetcv = Pipeline(
    [
        # poor performance with SS enabled
        # ('standard_scaler', StandardScaler()),
        ('enetcv', ElasticNetCV(n_jobs=1, random_state=0)),
    ]
)

sr_enet = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_enetcv,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=1,
)

# final estimator: standardize, then k-nearest-neighbours regression
sr_fe_pipe_knnr = Pipeline(
    [
        ('standard_scaler', StandardScaler()),
        ('knnregressor', KNeighborsRegressor(n_jobs=1)),
    ]
)

sr_knn = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_knnr,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=1,
)

# final estimator: standardize, then an MLP trained on a quantile-transformed
# (normal output distribution) target
sr_fe_pipe_mlpr = Pipeline(
    [
        ('standard_scaler', StandardScaler()),
        (
            'transformed_target_mlpregressor',
            TransformedTargetRegressor(
                regressor=MLPRegressor(random_state=0),
                transformer=QuantileTransformer(output_distribution='normal', random_state=0),
            ),
        ),
    ]
)

sr_mlpr = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_mlpr,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=1,
)

# score all stacking variants under one test name
test_all_models(
    'StackingRegressor_baseline',
    {
        'StackingRegressor_Ridge': sr_ridge,
        'StackingRegressor_Lasso': sr_lasso,
        'StackingRegressor_ENet': sr_enet,
        'StackingRegressor_KNN': sr_knn,
        'StackingRegressor_MLPRegressor': sr_mlpr,
    },
    X,
    y,
    n_splits=num_folds,
    n_jobs=cpu_count,
)

StackingRegressor_baseline
StackingRegressor_Ridge        0.11854964144139617
StackingRegressor_Lasso        0.11942221026912289
StackingRegressor_ENet         0.11852654308298499
StackingRegressor_KNN          0.12415051057307841
StackingRegressor_MLPRegressor 0.1213023628997334

# Rebuild the stacking ensembles for writing submissions. They reuse the same base
# pipelines and final estimators as the cross-validated versions; the only change
# is n_jobs=cpu_count on the stacker itself (the scored versions used n_jobs=1).
sr_ridge = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_ridgecv,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=cpu_count,
)

sr_lasso = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_lassocv,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=cpu_count,
)

sr_enet = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_enetcv,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=cpu_count,
)

sr_knn = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_knnr,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=cpu_count,
)

sr_mlpr = StackingRegressor(
    estimators=[('pipe_' + m, pipe[m]) for m in pipe.keys()],
    final_estimator=sr_fe_pipe_mlpr,
    cv=KFold(n_splits=num_folds, shuffle=False),
    n_jobs=cpu_count,
)

# one submission file per stacking variant (only when TEST_PIPELINES is True)
make_submission_all(
    {
        'StackingRegressor_Ridge': sr_ridge,
        'StackingRegressor_Lasso': sr_lasso,
        'StackingRegressor_ENet': sr_enet,
        'StackingRegressor_KNN': sr_knn,
        'StackingRegressor_MLPRegressor': sr_mlpr,
    },
    X,
    y,
    X_test,
    'StackingRegressor_baseline',
)

def compute_log(X, features, drop_original=True):
    """
    Return a copy of X with a log1p-transformed '<name>_log' column per feature.

    X: DataFrame containing the columns listed in `features`
    features: names of the columns to transform
    drop_original: if True, the untransformed columns are removed from the result

    The input frame is not modified.
    """
    result = X.copy()
    for name in features:
        result[name + '_log'] = np.log1p(result[name])
    if not drop_original:
        return result
    return result.drop(columns=features)


def make_steps_log_transform(features, drop_now=False, enable=True):
    """
    Build the pipeline step that log-transforms `features`.

    features: names of the columns to transform with compute_log()
    drop_now: passed to compute_log() as drop_original
    enable: if False, no step is produced

    Returns (steps, drop_features). When enabled, drop_features is `features`
    itself, whatever the value of drop_now; when disabled, both lists are empty.
    """
    if not enable:
        return [], []

    log_step = (
        'log_transform',
        FunctionTransformer(
            func=compute_log,
            kw_args={'features': features, 'drop_original': drop_now},
        ),
    )
    return [log_step], features


# columns that receive a log1p-transformed companion column (see compute_log)
log_transform_features = [
    'LotFrontage',
    'LotArea',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'FirstFlrSF',
    'GrLivArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    'MiscVal',
]


# in the final version, original features will not be dropped here
# we will collect features to drop, and drop them at the end of the pipeline
# (the returned drop list is therefore ignored in this standalone test)
steps_preproc, _ = make_steps_log_transform(log_transform_features, drop_now=True, enable=True)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# pipe_preproc
# pipe_preproc.fit_transform(X, y)
# histogram of every column after preprocessing, restricted to the features_numeric columns
_ = pipe_preproc.fit_transform(X[features_numeric], y).hist(figsize=(12, 12))
test_all_models('log_transform', pipe, X, y)

log_transform
xgboost  0.13506784578614034
lgbm     0.1286483077106619
catboost 0.11837461919873822
ridge    0.12753219857904963
enet     0.13929755320744852

# two candidate transformers, both configured to return DataFrames
# (`pt` is created here but not used in this cell)
pt = PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
pt.set_output(transform='pandas')

qt = QuantileTransformer(output_distribution='normal', random_state=0)
qt.set_output(transform='pandas')

# copy-on-write is switched off only for the plotting call and restored right after
# NOTE(review): presumably a workaround for a plotting incompatibility -- confirm
pd.options.mode.copy_on_write = False
# horizontal box plots of the quantile-transformed numeric features
sns.boxplot(qt.fit_transform(X[features_numeric]), orient='h')
pd.options.mode.copy_on_write = True

def make_steps_numeric_quantile_transformer(enable=True):
    """
    Build the pipeline step that quantile-transforms the numeric features.

    enable: if False, no step is produced

    When enabled, the columns listed in features_numeric are mapped to a normal
    distribution and all remaining columns are passed through unchanged.
    Returns (steps, drop_features); drop_features is always empty.
    """
    if not enable:
        return [], []

    quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
    ct_qt = ColumnTransformer(
        [('quantile_transformer', quantile_transformer, features_numeric)],
        remainder='passthrough',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    return [('numeric_quantile_transformer', ct_qt)], []


# standalone test of the quantile-transformer step; its drop list is always empty
steps_preproc, _ = make_steps_numeric_quantile_transformer(enable=True)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# pipe_preproc.fit_transform(X, y)
test_all_models('numeric_quantile_transformer', pipe, X, y)

numeric_quantile_transformer
xgboost  0.137896618743097
lgbm     0.12744108790205297
catboost 0.11915351689825118
ridge    0.1290901148937026
enet     0.14338826663446375

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

# submissions for the pipelines currently held in `pipe`
# NOTE(review): the file label is 'normalize_features' although the preceding test
# was named 'numeric_quantile_transformer' -- confirm this is intended
make_submission_all(pipe, X, y, X_test, 'normalize_features')

# Exploratory PCA outlier detection on the training features.
# make_impute_standardize_num_ohe_cat() is defined elsewhere.
steps_outlier_pca = [('impute_num_ohe_cat_standard', make_impute_standardize_num_ohe_cat())]

pipe_outlier_pca = Pipeline(steps=steps_outlier_pca, verbose=False)
pipe_outlier_pca.set_output(transform='pandas')

pca_model = pca_obj(
    n_components=0.95,
    random_state=0,
    verbose=2,
)
X_for_pca = pipe_outlier_pca.fit_transform(X)
# display(X_for_pca)
# the result is indexed like a dictionary; only its 'outliers' entry is shown
pca_results = pca_model.fit_transform(X_for_pca)
display(pca_results['outliers'])
# display(pca_results['PC'])
# display(pca_results['explained_var'])

# copy-on-write is switched off only for the plotting call and restored right after
# NOTE(review): presumably a workaround for a plotting incompatibility -- confirm
pd.options.mode.copy_on_write = False
# TODO: make this image smaller
pca_model.biplot(
    SPE=False,
    HT2=True,
    density=True,
    title='Outliers marked using Hotellings T2 method.',
    figsize=(10, 10),
    fontsize=10,
    s=20,
)
pd.options.mode.copy_on_write = True

class pcaPandas(pca_obj):
    """
    Determine outliers in PCA space. Return outlier score.

    transform() and fit_transform() return only the 'y_score' outlier column of
    the underlying model, cast to float. By default the scores are a NumPy array;
    after set_output(transform='pandas') they are a one-column DataFrame named
    'PCA_outlier_score' that carries the index of the input.
    """

    def _format_outlier_scores(self, scores, X):
        """
        Convert the raw score column to the configured output container.

        scores: the 'y_score' column produced by the underlying model
        X: the input the scores were computed for; its index is only read when
            pandas output is selected, so array input keeps working otherwise
        """
        output = np.array(scores.astype(float))
        if getattr(self, 'output_format', None) == 'pandas':
            output = pd.DataFrame({'PCA_outlier_score': output}, index=X.index)
        return output

    def transform(self, X):
        """
        Project X with the fitted model and return its outlier scores.
        """
        X = X.copy()
        pca_transform = super().transform(X, verbose=0)
        outliers = super().compute_outliers(pca_transform, verbose=0)
        # the first element of the result holds the frame with the 'y_score' column
        return self._format_outlier_scores(outliers[0]['y_score'], X)

    def fit_transform(self, X, y=None):
        """
        Fit the model on X and return the outlier scores of X.

        y is accepted for pipeline compatibility and ignored.
        """
        X = X.copy()
        results = super().fit_transform(X, verbose=0)
        return self._format_outlier_scores(results['outliers']['y_score'], X)

    def set_output(self, transform=None):
        """
        Select the output container, following the scikit-learn set_output convention.

        transform: 'pandas' selects DataFrame output; any other non-None value
            selects array output; None leaves the current setting unchanged

        Returns self so calls can be chained.
        """
        if transform is not None:
            self.output_format = transform
        return self


def make_steps_pca_outliers_score(min_explained_variation=0.95, pca_score_distribution='uniform', var_cutoff=0.1):
    """
    Build the pipeline step that adds a PCA-based outlier score to the features.

    min_explained_variation: [0.0, 1.0], cumulative explained variation the PCA must reach
    pca_score_distribution: 'uniform', 'normal'; output distribution of the score
    var_cutoff: minimum explained variation that will be considered at all

    min_explained_variation < var_cutoff means disabled: no step is produced.
    Returns (steps, drop_features); drop_features is always empty.
    """
    # enforce a low cutoff
    if not (min_explained_variation >= var_cutoff):
        return [], []

    # work around a quirk in the original parameter values
    # (same parameter means many things in the original object)
    n_components = None if min_explained_variation == 1.0 else min_explained_variation
    pca_model = pcaPandas(
        n_components=n_components,
        random_state=0,
        verbose=0,
    )

    # preprocessing -> PCA outlier score -> score mapped to the requested distribution
    score_pipeline = Pipeline(
        [
            ('impute_num_ohe_cat_standard', make_impute_standardize_num_ohe_cat()),
            ('pca_model', pca_model),
            (
                'standard_scaler_out',
                QuantileTransformer(
                    output_distribution=pca_score_distribution,
                    random_state=0,
                ),
            ),
        ]
    )

    # both transformers select every column: one produces the score,
    # the other passes the original features through alongside it
    mark_outliers = ColumnTransformer(
        [
            (
                'mark_pca_outliers',
                score_pipeline,
                make_column_selector(dtype_exclude=None),
            ),
            (
                'passthrough_original_features',
                'passthrough',
                make_column_selector(dtype_exclude=None),
            ),
        ],
        remainder='drop',
        verbose_feature_names_out=False,
    )
    return [('mark_pca_outliers', mark_outliers)], []


# standalone test of the PCA outlier score step with its default parameters
steps_preproc, _ = make_steps_pca_outliers_score()

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# show the preprocessing pipeline and its output on the training data
display(pipe_preproc)
display(pipe_preproc.fit_transform(X, y))
# _ = pipe_preproc.fit(X, y)
# pipe_preproc.transform(X_test)
test_all_models('pca_outliers_score', pipe, X, y)

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

Pipeline(steps=[('mark_pca_outliers',
                 ColumnTransformer(transformers=[('mark_pca_outliers',
                                                  Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                   ColumnTransformer(n_jobs=1,
                                                                                     transformers=[('impute_standardize_numeric',
                                                                                                    Pipeline(steps=[('simple_imputer',
                                                                                                                     SimpleImputer()),
                                                                                                                    ('standard_scaler',
                                                                                                                     StandardScaler())]),
                                                                                                    <sklearn.compose._column_transformer.make_column_sel...
                                                                   <__main__.pcaPandas object at 0x7ff039906f50>),
                                                                  ('standard_scaler_out',
                                                                   QuantileTransformer(random_state=0))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff020f3e3d0>),
                                                 ('passthrough_original_features',
                                                  'passthrough',
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff039513350>)],
                                   verbose_feature_names_out=False))])

Pipeline(steps=[('mark_pca_outliers',
                 ColumnTransformer(transformers=[('mark_pca_outliers',
                                                  Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                   ColumnTransformer(n_jobs=1,
                                                                                     transformers=[('impute_standardize_numeric',
                                                                                                    Pipeline(steps=[('simple_imputer',
                                                                                                                     SimpleImputer()),
                                                                                                                    ('standard_scaler',
                                                                                                                     StandardScaler())]),
                                                                                                    <sklearn.compose._column_transformer.make_column_sel...
                                                                   <__main__.pcaPandas object at 0x7ff039906f50>),
                                                                  ('standard_scaler_out',
                                                                   QuantileTransformer(random_state=0))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff020f3e3d0>),
                                                 ('passthrough_original_features',
                                                  'passthrough',
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff039513350>)],
                                   verbose_feature_names_out=False))])

ColumnTransformer(transformers=[('mark_pca_outliers',
                                 Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                  ColumnTransformer(n_jobs=1,
                                                                    transformers=[('impute_standardize_numeric',
                                                                                   Pipeline(steps=[('simple_imputer',
                                                                                                    SimpleImputer()),
                                                                                                   ('standard_scaler',
                                                                                                    StandardScaler())]),
                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7ff00f2e1f90>),
                                                                                  ('one_h...
                                                  <__main__.pcaPandas object at 0x7ff039906f50>),
                                                 ('standard_scaler_out',
                                                  QuantileTransformer(random_state=0))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7ff020f3e3d0>),
                                ('passthrough_original_features', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7ff039513350>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7ff020f3e3d0>

ColumnTransformer(n_jobs=1,
                  transformers=[('impute_standardize_numeric',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7ff00f2e1f90>),
                                ('one_hot_encode_categorical',
                                 OneHotEncoder(dtype=<class 'int'>,
                                               handle_unknown='infrequent_if_exist',
                                               min_frequency=1,
                                               sparse_output=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7ff020f3dd10>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7ff00f2e1f90>

SimpleImputer()

StandardScaler()

<sklearn.compose._column_transformer.make_column_selector object at 0x7ff020f3dd10>

# Preprocessing used only for computing mutual information scores:
# numeric columns go through IterativeImputer, all other columns through cat_to_num()
# (defined elsewhere; presumably maps categories to numbers -- confirm).
steps_preproc = [
    (
        'impute_numeric_decode_categorical',
        ColumnTransformer(
            [
                (
                    'impute_numeric',
                    IterativeImputer(random_state=0),
                    make_column_selector(dtype_include='number'),
                ),
                (
                    'decode_categorical',
                    FunctionTransformer(func=cat_to_num),
                    make_column_selector(dtype_exclude='number'),
                ),
            ],
            remainder='passthrough',
            verbose_feature_names_out=False,
        ),
    ),
]

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)

# pipe_preproc
# pipe_preproc.fit_transform(X)

# score the preprocessed features against the target
mi_scores = make_mi_scores(pipe_preproc.fit_transform(X), y)
# last 30 entries; presumably the lowest scores if make_mi_scores sorts descending -- confirm
mi_scores.tail(30)

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

Fence               0.050665
Electrical          0.043887
BsmtFinType2        0.039734
LandContour         0.035137
ExterCond           0.034941
BsmtCond            0.034163
BsmtFullBath        0.031220
BsmtHalfBath        0.028547
Alley               0.024762
KitchenAbvGr        0.020035
Heating             0.018472
EnclosedPorch       0.015764
LotConfig           0.014472
RoofStyle           0.012409
Condition1          0.011470
MiscFeature         0.010775
ScreenPorch         0.008822
Functional          0.007986
LowQualFinSF        0.007507
Condition2          0.006244
BsmtFinSF2          0.003878
RoofMatl            0.001923
Utilities           0.001381
ThreeSeasonPorch    0.000627
Street              0.000000
YrMoSold            0.000000
PoolQC              0.000000
MiscVal             0.000000
PoolArea            0.000000
LandSlope           0.000000
Name: MI Scores, dtype: float64

# plot the mutual information scores computed above (helper from pipeline_pandas_utils)
plot_mi_scores(mi_scores)

/home/florin/kaggle/feature_engineering/pipeline_pandas_utils.py:62: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

def drop_uninformative(X, mi_scores, min_score):
    """
    Keep only the columns of X whose score is at least min_score.

    X: DataFrame containing the columns named in the index of mi_scores
    mi_scores: Series of scores indexed by column name
    min_score: inclusive lower bound for a column to be kept

    The result is a copy; its columns follow the order of mi_scores.
    """
    informative = mi_scores >= min_score
    keep_columns = mi_scores.index[informative].to_list()
    return X.copy()[keep_columns]


def make_steps_drop_uninformative(mi_scores, min_score=0.0001, drop_now=False):
    """
    Build the pipeline step that removes features with a score below min_score.

    mi_scores: Series of scores indexed by feature name
    min_score: features scoring below this value are listed for dropping
    drop_now: if True, the features are removed by the step itself;
        if False, the step is a placeholder and removal is left to the caller

    disable: min_score=0.0 (placeholder step, nothing listed for dropping)
    Returns (steps, drop_features).
    """
    later_step = ('drop_uninformative_later', 'passthrough')

    if not (min_score > 0.0):
        # intermediate step in a larger pipeline, but disabled
        return [later_step], []

    drop_features = mi_scores.index[mi_scores < min_score].to_list()
    if not drop_now:
        # run as intermediate step in a larger pipeline
        # features are dropped at the end of the pipeline
        return [later_step], drop_features

    # run as a standalone step, not in a big pipeline
    drop_step = (
        'drop_uninformative',
        FunctionTransformer(
            func=drop_uninformative,
            kw_args={'mi_scores': mi_scores, 'min_score': min_score},
        ),
    )
    return [drop_step], drop_features


# standalone test: features scoring below min_score are removed by the step itself
min_score = 0.0001
steps_preproc, _ = make_steps_drop_uninformative(mi_scores=mi_scores, min_score=min_score, drop_now=True)
# `_` holds the list of dropped feature names returned above
print(f'dropped features: {_}')
print()

# fix tqdm bug
time.sleep(0.1)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
display(pipe_preproc.fit_transform(X, y))
test_all_models('drop_uninformative', pipe, X, y)

dropped features: ['Street', 'YrMoSold', 'PoolQC', 'MiscVal', 'PoolArea', 'LandSlope']

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

drop_uninformative
xgboost  0.13448797551289685
lgbm     0.12835534176467855
catboost 0.11842801893224043
ridge    0.13393009886635232
enet     0.13439295721026112

def living_lot_ratio_feature(X):
    """
    Return X with an added 'LivLotRatio' column: GrLivArea divided by LotArea.

    The input frame is not modified.
    """
    ratio = X['GrLivArea'] / X['LotArea']
    return X.join(ratio.to_frame('LivLotRatio'))


def spaciousness_feature(X):
    """
    Return X with an added 'Spaciousness' column.

    Spaciousness is (FirstFlrSF + SecondFlrSF) divided by TotRmsAbvGrd.
    The input frame is not modified.
    """
    floor_area = X['FirstFlrSF'] + X['SecondFlrSF']
    per_room = floor_area / X['TotRmsAbvGrd']
    return X.join(per_room.to_frame('Spaciousness'))


def total_outside_sf_feature(X, features):
    """
    Return X with an added 'TotalOutsideSF' column: the row-wise sum of `features`.

    X: DataFrame containing the columns listed in `features`
    features: names of the columns to add up

    The total starts at 0.0, so the new column is floating point, and a missing
    value in any summed column makes that row's total missing as well.
    The input frame is not modified.
    """
    total = pd.Series(0.0, index=X.index)
    for name in features:
        total = total + X[name]
    return X.join(total.to_frame('TotalOutsideSF'))


def outdoor_area_type_count(X, features):
    """
    create feature that describes how many kinds of outdoor areas a dwelling has

    X: DataFrame containing the columns listed in `features`
    features: names of the outdoor area columns

    Adds a 'PorchTypes' column counting, per row, how many of `features`
    are greater than 0.0. The input frame is not modified.
    """
    has_area = X[features].gt(0.0)
    type_count = has_area.sum(axis=1)
    return X.join(type_count.to_frame('PorchTypes'))


def make_steps_mathematical_transforms(
    enable_living_lot_ratio=True,
    enable_spaciousness=True,
    enable_outside_sf_features=True,
    enable_outdoor_area_type_count=True,
):
    """
    Build the pipeline steps that add derived numeric features.

    Each flag switches one step on or off; enabled steps keep this order:
    living/lot ratio, spaciousness, total outside area, outdoor area type count.
    Returns (steps, drop_features); drop_features is always empty.
    """
    outside_sf_features = [
        'WoodDeckSF',
        'OpenPorchSF',
        'EnclosedPorch',
        'ThreeSeasonPorch',
        'ScreenPorch',
    ]
    # same columns, kept as a separate list object for the type-count step
    porch_features = list(outside_sf_features)

    steps = []
    if enable_living_lot_ratio:
        steps.append(('living_lot_ratio', FunctionTransformer(func=living_lot_ratio_feature)))
    if enable_spaciousness:
        steps.append(('spaciousness', FunctionTransformer(func=spaciousness_feature)))
    if enable_outside_sf_features:
        total_sf = FunctionTransformer(
            func=total_outside_sf_feature,
            kw_args={'features': outside_sf_features},
        )
        steps.append(('outside_sf_features', total_sf))
    if enable_outdoor_area_type_count:
        type_count = FunctionTransformer(
            func=outdoor_area_type_count,
            kw_args={'features': porch_features},
        )
        steps.append(('outdoor_area_type_count', type_count))

    return steps, []


# each one will be selected independently during hyperparameter optimization
# dropping columns, if any, will happen at the end of the pipeline
# here all four derived features are enabled (the function's defaults)
steps_preproc, _ = make_steps_mathematical_transforms()

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# pipe_preproc.fit_transform(X, y)
test_all_models('mathematical_transforms', pipe, X, y)

mathematical_transforms
xgboost  0.13524854842619866
lgbm     0.1272195075934009
catboost 0.11895582949591313
ridge    0.132938691129515
enet     0.13641504704995364

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

def area_to_linear(X, t_columns):
    """
    Return a copy of X with a square-root '<name>ToLinear' column per listed column.

    X: DataFrame containing the columns listed in `t_columns`
    t_columns: names of the numeric columns to take the square root of

    The original columns are kept and the input frame is not modified.
    """
    result = X.copy()
    for name in t_columns:
        result[name + 'ToLinear'] = np.sqrt(result[name])
    return result


def make_steps_area_to_linear(
    lot_area='enable',
    total_bsmt_sf='enable',
    first_flr_sf='enable',
    gr_liv_area='enable',
):
    """
    Build the optional square-root ("area to linear") pipeline steps.

    Each argument controls one column (LotArea, TotalBsmtSF, FirstFlrSF,
    GrLivArea) and is interpreted as:
      'enable'      - add the step for that column
      'enable_drop' - add the step and list the source column for dropping
      anything else - skip the column

    Returns (steps, drop_features): a list of (name, FunctionTransformer)
    tuples and the list of source columns flagged for dropping.
    """
    column_modes = (
        ('LotArea', lot_area),
        ('TotalBsmtSF', total_bsmt_sf),
        ('FirstFlrSF', first_flr_sf),
        ('GrLivArea', gr_liv_area),
    )
    steps = []
    drop_features = []
    for column, mode in column_modes:
        if mode not in ['enable_drop', 'enable']:
            continue
        transformer = FunctionTransformer(func=area_to_linear, kw_args={'t_columns': [column]})
        steps.append(('area_to_linear_' + column, transformer))
        if mode == 'enable_drop':
            drop_features.append(column)
    return steps, drop_features


# Evaluate the area-to-linear steps with their defaults (every column 'enable').
steps_preproc, _ = make_steps_area_to_linear()
# `_` holds the drop list returned by the factory; print it for inspection.
print(f'drop: {_}')
# NOTE(review): the short sleep presumably lets the print land before later output -- confirm.
time.sleep(0.1)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# pipe_preproc.fit_transform(X, y)
# Reports one score per model under the given label (see the cell output).
test_all_models('area_to_linear', pipe, X, y)

drop: []
area_to_linear
xgboost  0.1362956720421577
lgbm     0.1285616308772774
catboost 0.11901901583192231
ridge    0.12810528192248716
enet     0.13590565572345223

def do_coalesce_mssubclass(X):
    """
    Add an 'MSClass' column that groups MSSubClass codes into coarser classes.

    MSSubClass is converted to strings and every code listed in the table
    below is replaced by its integer class id; codes that are not listed are
    left as their string form.

    Returns a copy of X with the extra 'MSClass' column; X is not modified.
    """
    # (sub-class codes, class id), in the order the lookup table is built
    grouped_codes = (
        (('20', '30', '40', '45', '50'), 1),
        (('60', '70', '75'), 2),
        (('80', '85'), 3),
        (('90',), 4),
        (('120', '150'), 1),
        (('160',), 2),
        (('180',), 5),
        (('190',), 6),
    )
    code_to_class = {code: class_id for codes, class_id in grouped_codes for code in codes}

    result = X.copy()
    result['MSClass'] = result['MSSubClass'].astype(str).replace(code_to_class)
    return result


def make_steps_coalesce_mssubclass(action='enable'):
    """
    Build the optional MSSubClass-coalescing step.

    action: 'disable', 'enable_drop', 'enable'
      'enable'      - add the step
      'enable_drop' - add the step and flag 'MSSubClass' for dropping
      anything else - no step, nothing dropped

    Returns (steps, drop_features).
    """
    enabled = action in ['enable_drop', 'enable']
    steps = (
        [('coalesce_mssubclass', FunctionTransformer(func=do_coalesce_mssubclass))]
        if enabled
        else []
    )
    drop_features = ['MSSubClass'] if action == 'enable_drop' else []
    return steps, drop_features


# Evaluate the MSSubClass-coalescing step with its default action ('enable').
steps_preproc, _ = make_steps_coalesce_mssubclass()
# `_` holds the drop list returned by the factory; print it for inspection.
print(f'drop: {_}')
# NOTE(review): the short sleep presumably lets the print land before later output -- confirm.
time.sleep(0.1)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# Reports one score per model under the given label (see the cell output).
test_all_models('coalesce_mssubclass', pipe, X, y)

drop: []
coalesce_mssubclass
xgboost  0.13694397079904713
lgbm     0.12844654422256963
catboost 0.11845392386178624
ridge    0.13507635191339812
enet     0.1369613493191226

# One regression panel of SalePrice against GrLivArea per BldgType level,
# wrapped at three panels per row. The result is bound to `_`, presumably to
# keep the notebook from echoing the returned grid object.
_ = sns.lmplot(
    x='GrLivArea',
    y="SalePrice",
    hue="BldgType",
    col="BldgType",
    data=df,
    scatter_kws={"edgecolor": 'w'},
    col_wrap=3,
    height=3,
)

/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)

def do_GrLivArea_BldgType_interaction(X):
    """
    Add BldgType x GrLivArea interaction columns.

    BldgType is one-hot encoded (first level dropped, columns prefixed
    'Bldg_') and every dummy column is multiplied row-wise by GrLivArea.

    X: DataFrame containing 'BldgType' and 'GrLivArea'; it is not modified.

    Returns a new DataFrame: X with the interaction columns joined on.
    """
    # Derive everything from the frame being transformed. Reading a global
    # DataFrame here would ignore the rows actually passed in (CV folds, test
    # data) and join unrelated or missing values onto them.
    dummies = pd.get_dummies(X['BldgType'], prefix='Bldg', drop_first=True)
    interactions = dummies.mul(X['GrLivArea'], axis=0)
    return X.join(interactions)


def make_steps_GrLivArea_BldgType_interaction(action='enable'):
    """
    Build the optional GrLivArea x BldgType interaction step.

    action:
      'enable'      - add the step
      'enable_drop' - add the step and flag 'BldgType' and 'GrLivArea' for dropping
      anything else - no step, nothing dropped

    Returns (steps, drop_features).
    """
    enabled = action in ['enable_drop', 'enable']
    steps = (
        [
            (
                'GrLivArea_BldgType_interaction',
                FunctionTransformer(func=do_GrLivArea_BldgType_interaction),
            )
        ]
        if enabled
        else []
    )
    drop_features = ['BldgType', 'GrLivArea'] if action == 'enable_drop' else []
    return steps, drop_features


# Evaluate the GrLivArea x BldgType interaction step with its default action ('enable').
steps_preproc, _ = make_steps_GrLivArea_BldgType_interaction()
# `_` holds the drop list returned by the factory; print it for inspection.
print(f'drop: {_}')

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# Reports one score per model under the given label (see the cell output).
test_all_models('GrLivArea_BldgType_interaction', pipe, X, y)

drop: []
GrLivArea_BldgType_interaction
xgboost  0.13428976367516854
lgbm     0.12761072141786642
catboost 0.11887978008118195
ridge    0.13469351280934772
enet     0.13685634524485973

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

def decode_cat_columns(X, t_columns):
    """
    Return a new DataFrame holding only the integer codes of the given
    categorical columns.

    X: DataFrame whose t_columns are all of categorical dtype.
    t_columns: names of the columns to decode.

    The result shares X's index and has one column per entry of t_columns.
    """
    codes_by_column = {name: X[name].cat.codes for name in t_columns}
    return pd.DataFrame(codes_by_column, index=X.index)


# Draw four quality/condition pairplots and tile them into one 2x2 figure.
# Copy-on-write is switched off for the duration of this cell and restored at the
# end. NOTE(review): the reason is not visible here -- presumably a seaborn/pandas
# incompatibility; confirm.
pd.options.mode.copy_on_write = False

fig, ax = plt.subplots(2, 2, figsize=(8, 8))

# Each pairplot is its own figure, so it is saved to a PNG and closed;
# the PNGs are loaded back into the 2x2 grid below.
fig1 = sns.pairplot(decode_cat_columns(X, ['OverallQual', 'OverallCond']), kind='reg', diag_kind='kde')
fig1.savefig('qual_cond1.png')
plt.close(fig1.fig)
fig2 = sns.pairplot(decode_cat_columns(X, ['GarageQual', 'GarageCond']), kind='reg', diag_kind='kde')
fig2.savefig('qual_cond2.png')
plt.close(fig2.fig)
fig3 = sns.pairplot(decode_cat_columns(X, ['ExterQual', 'ExterCond']), kind='reg', diag_kind='kde')
fig3.savefig('qual_cond3.png')
plt.close(fig3.fig)
fig4 = sns.pairplot(decode_cat_columns(X, ['BsmtQual', 'BsmtCond']), kind='reg', diag_kind='kde')
fig4.savefig('qual_cond4.png')
plt.close(fig4.fig)

# Show the saved images as plain pictures (axes hidden).
ax[0, 0].imshow(PIL.Image.open('qual_cond1.png'))
ax[0, 0].set_axis_off()
ax[0, 1].imshow(PIL.Image.open('qual_cond2.png'))
ax[0, 1].set_axis_off()
ax[1, 0].imshow(PIL.Image.open('qual_cond3.png'))
ax[1, 0].set_axis_off()
ax[1, 1].imshow(PIL.Image.open('qual_cond4.png'))
ax[1, 1].set_axis_off()

plt.tight_layout()
# Under the inline (non-GUI) notebook backend this call only emits a UserWarning.
fig.show()

pd.options.mode.copy_on_write = True

/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
/tmp/ipykernel_35469/3093716740.py:39: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

def decode_two_multiply(X, t_columns, n_column):
    """
    Add the product of two columns as a new column.

    X: input DataFrame; it is not modified.
    t_columns: exactly two column names. A categorical column contributes its
        integer codes, any other column contributes its values as they are.
    n_column: name of the product column to add.

    Returns a new DataFrame: X with the n_column joined on.
    """
    assert len(t_columns) == 2, f't_columns must contain 2 columns, got {len(t_columns)} instead'

    factors = pd.DataFrame(index=X.index)
    for name in t_columns:
        series = X[name]
        factors[name] = series.cat.codes if series.dtype.name == 'category' else series

    factors[n_column] = factors[t_columns[0]] * factors[t_columns[1]]
    # only the product is joined back; the decoded inputs are discarded
    product_only = factors.drop(columns=t_columns)
    return X.join(product_only)


def make_steps_qual_cond_interactions(
    garage_interaction=True,
    exterior_interaction=True,
    overall_interaction=True,
    basement_interaction=True,
):
    """
    Build the optional quality x condition product steps.

    Each flag enables one step that multiplies a quality column by its
    condition column (via decode_two_multiply) into a new column:
      garage_interaction   -> 'GarageOverall'
      exterior_interaction -> 'ExterOverall'
      overall_interaction  -> 'OverallOverall'
      basement_interaction -> 'BsmtOverall'

    Returns (steps, drop_features); drop_features is always empty.
    """
    # (flag, step name, [quality column, condition column], product column)
    candidates = (
        (garage_interaction, 'garage_interaction', ['GarageQual', 'GarageCond'], 'GarageOverall'),
        (exterior_interaction, 'exterior_interaction', ['ExterQual', 'ExterCond'], 'ExterOverall'),
        (overall_interaction, 'overall_interaction', ['OverallQual', 'OverallCond'], 'OverallOverall'),
        (basement_interaction, 'basement_interaction', ['BsmtQual', 'BsmtCond'], 'BsmtOverall'),
    )
    steps = []
    for enabled, step_name, columns, product_name in candidates:
        if not enabled:
            continue
        transformer = FunctionTransformer(
            func=decode_two_multiply,
            kw_args={'t_columns': columns, 'n_column': product_name},
        )
        steps.append((step_name, transformer))
    drop_features = []
    return steps, drop_features


# Evaluate the quality x condition interaction steps with all four enabled (the defaults).
# The drop list returned by the factory is discarded.
steps_preproc, _ = make_steps_qual_cond_interactions()

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# pipe_preproc
# pipe_preproc.fit_transform(X, y)
# Reports one score per model under the given label (see the cell output).
test_all_models('qual_cond_interactions', pipe, X, y)

qual_cond_interactions
xgboost  0.1333236043664156
lgbm     0.12858964798910738
catboost 0.11860501415397526
ridge    0.1337616024689054
enet     0.13529024677410023

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

# Draw two numeric-vs-categorical pairplots and place them side by side.
# Copy-on-write is switched off for the duration of this cell and restored at the
# end. NOTE(review): the reason is not visible here -- confirm.
pd.options.mode.copy_on_write = False

fig, ax = plt.subplots(1, 2, figsize=(8, 4))

# Basement: quality codes against log1p of the basement area.
X_inter = X[['BsmtQual', 'TotalBsmtSF']].copy()
X_inter['BsmtQual'] = X_inter['BsmtQual'].cat.codes
X_inter['TotalBsmtSF'] = np.log1p(X_inter['TotalBsmtSF'])
# The pairplot is its own figure: save it, close it, then show the PNG in the grid.
fig1 = sns.pairplot(X_inter, kind='reg', diag_kind='kde')
fig1.savefig('bsmt_qual_sf.png')
plt.close(fig1.fig)
ax[0].imshow(PIL.Image.open('bsmt_qual_sf.png'))
ax[0].set_axis_off()

# Pool: quality codes against the raw pool area (no log transform here).
X_inter = X[['PoolArea', 'PoolQC']].copy()
X_inter['PoolQC'] = X_inter['PoolQC'].cat.codes
fig2 = sns.pairplot(X_inter, kind='reg', diag_kind='kde')
fig2.savefig('pool_area_qc.png')
plt.close(fig2.fig)
ax[1].imshow(PIL.Image.open('pool_area_qc.png'))
ax[1].set_axis_off()

plt.tight_layout()
# Under the inline (non-GUI) notebook backend this call only emits a UserWarning.
fig.show()

pd.options.mode.copy_on_write = True

/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
/tmp/ipykernel_35469/2766856711.py:23: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

def make_steps_num_cat_interactions(basement_num_cat=True, pool_num_cat=True):
    """
    Build the optional numeric x categorical product steps.

    basement_num_cat: add 'BsmtNumCat' = BsmtQual * TotalBsmtSF
    pool_num_cat: add 'PoolNumCat' = PoolArea * PoolQC
    (both computed by decode_two_multiply)

    Returns (steps, drop_features); drop_features is always empty.
    """
    # (flag, step name, the two input columns, product column)
    candidates = (
        (basement_num_cat, 'basement_num_cat_interaction', ['BsmtQual', 'TotalBsmtSF'], 'BsmtNumCat'),
        (pool_num_cat, 'pool_num_cat_interaction', ['PoolArea', 'PoolQC'], 'PoolNumCat'),
    )
    steps = []
    for enabled, step_name, columns, product_name in candidates:
        if not enabled:
            continue
        transformer = FunctionTransformer(
            func=decode_two_multiply,
            kw_args={'t_columns': columns, 'n_column': product_name},
        )
        steps.append((step_name, transformer))
    drop_features = []
    return steps, drop_features


# Evaluate the numeric x categorical interaction steps with both enabled (the defaults).
# The drop list returned by the factory is discarded.
steps_preproc, _ = make_steps_num_cat_interactions()

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# pipe_preproc
# pipe_preproc.fit_transform(X, y)
# Reports one score per model under the given label (see the cell output).
test_all_models('num_cat_interactions', pipe, X, y)

num_cat_interactions
xgboost  0.13565366574084264
lgbm     0.1281065622360887
catboost 0.11797581822666771
ridge    0.1347210528506853
enet     0.13675698301327843

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

def group_transforms(X, feature, group_by):
    """
    Add each row's deviation from the median of its group.

    X: input DataFrame; it is not modified.
    feature: numeric column to compare against its group median.
    group_by: column whose values define the groups.

    Returns a new DataFrame: X plus a column named
    '<feature>_by_<group_by>_median_diff' holding feature minus the group median.
    The medians are computed from the rows of X passed in.
    """
    new_col = feature + '_by_' + group_by + '_median_diff'
    group_median = X.groupby(group_by)[feature].transform('median')
    deviation = (X[feature] - group_median).rename(new_col)
    return X.join(deviation.to_frame())


def make_steps_median_diff_groupby(
    group_LotArea_by_MSZoning=True,
    group_GrLivArea_by_MSSubClass=True,
    group_LotArea_by_MSSubClass=True,
    group_GrLivArea_by_HouseStyle=True,
):
    """
    Build the optional "difference from group median" steps.

    Each flag enables one group_transforms step for the feature / grouping
    pair named in the flag (e.g. group_LotArea_by_MSZoning -> LotArea grouped
    by MSZoning).

    Returns (steps, drop_features); drop_features is always empty.
    """
    # (flag, step name, feature, grouping column)
    candidates = (
        (group_LotArea_by_MSZoning, 'LotArea_by_MSZoning_median_diff_norm', 'LotArea', 'MSZoning'),
        (group_GrLivArea_by_MSSubClass, 'GrLivArea_by_MSSubClass_median_diff', 'GrLivArea', 'MSSubClass'),
        (group_LotArea_by_MSSubClass, 'LotArea_by_MSSubClass_median_diff_norm', 'LotArea', 'MSSubClass'),
        (group_GrLivArea_by_HouseStyle, 'GrLivArea_by_HouseStyle_median_diff_norm', 'GrLivArea', 'HouseStyle'),
    )
    steps = []
    for enabled, step_name, feature, group_by in candidates:
        if not enabled:
            continue
        transformer = FunctionTransformer(
            func=group_transforms,
            kw_args={'feature': feature, 'group_by': group_by},
        )
        steps.append((step_name, transformer))
    drop_features = []
    return steps, drop_features


# Evaluate the median-difference steps with all four enabled (the defaults).
# The drop list returned by the factory is discarded.
steps_preproc, _ = make_steps_median_diff_groupby()

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# display(pipe_preproc.fit_transform(X, y))
# pipe_preproc.fit_transform(X, y).iloc[:, -1].info()
# Reports one score per model under the given label (see the cell output).
test_all_models('median_diff_groupby', pipe, X, y)

median_diff_groupby
xgboost  0.13375412867552253
lgbm     0.12622006464898997
catboost 0.11940600745204362
ridge    0.13519220023606957
enet     0.1375736604535861

# square-footage features
# Used as the input columns for the k-means evaluation helpers and for the
# cluster-label / cluster-distance pipeline steps defined below.
cluster_area_features = [
    "LotArea",
    "TotalBsmtSF",
    "FirstFlrSF",
    "SecondFlrSF",
    "GrLivArea",
]


def cluster_evaluate_elbow(X, features, n_max=20):
    """
    Plot a k-means elbow curve (distortion metric) for the given features.

    X: DataFrame holding the data.
    features: columns of X to cluster on; missing values are mean-imputed and
        every column is standardised (centred, divided by its std) first.
    n_max: upper end of the k range handed to KElbowVisualizer as k=(2, n_max).

    Returns the elbow value located by the visualizer (its `elbow_value_`
    attribute), so callers no longer have to read it off the plot.
    """
    imputed = SimpleImputer(strategy='mean').fit_transform(X.loc[:, features])
    X_scaled = (imputed - imputed.mean(axis=0)) / imputed.std(axis=0)
    kmeans = KMeans(n_init=50, random_state=0)
    ke_vis = KElbowVisualizer(kmeans, k=(2, n_max), metric='distortion')
    ke_vis.fit(X_scaled)
    ke_vis.show()
    # previously stored in an unused local and thrown away
    return ke_vis.elbow_value_


def cluster_evaluate_silhouette(X, features, elbow_val, delta):
    """
    Draw silhouette plots for k-means with n_clusters around elbow_val and
    print the n_clusters with the highest silhouette score.

    X: DataFrame holding the data.
    features: columns of X to cluster on; missing values are mean-imputed and
        every column is standardised (centred, divided by its std) first.
    elbow_val, delta: n_clusters runs over range(elbow_val - delta, elbow_val + delta),
        i.e. 2 * delta values laid out on a grid of delta rows by 2 columns.

    Returns None; the best n_clusters and score are printed.
    """
    imputed = SimpleImputer(strategy='mean').fit_transform(X.loc[:, features])
    X_scaled = (imputed - imputed.mean(axis=0)) / imputed.std(axis=0)

    first_k = elbow_val - delta
    # squeeze=False keeps `ax` two-dimensional even when delta == 1; without it
    # a single row comes back as a 1-D array and ax[row, col] fails.
    fig, ax = plt.subplots(nrows=delta, ncols=2, figsize=(9, round(3 * delta)), squeeze=False)
    sil_score_best = 0.0
    i_best = 0
    for i in range(first_k, elbow_val + delta):
        # fill the grid row by row, two plots per row
        row, col = divmod(i - first_k, 2)
        kmeans = KMeans(n_clusters=i, n_init=50, random_state=0)
        ks_vis = SilhouetteVisualizer(kmeans, ax=ax[row, col])
        ks_vis.fit(X_scaled)
        sil_score = ks_vis.silhouette_score_
        if sil_score > sil_score_best:
            sil_score_best = sil_score
            i_best = i
        ax[row, col].set_title(f'n_clusters={i} score={sil_score:.4f}')
    print(f'best values: n_clusters={i_best} score={sil_score_best}')
    fig.show()

# Elbow plot for k-means on the square-footage features (default n_max).
cluster_evaluate_elbow(X, cluster_area_features)

# Silhouette plots for n_clusters in range(9 - 7, 9 + 7), i.e. 2..15.
# NOTE(review): 9 is presumably the elbow read off the plot above -- confirm.
cluster_evaluate_silhouette(X, cluster_area_features, 9, 7)

best values: n_clusters=5 score=0.41659716411855224

/tmp/ipykernel_35469/4119913658.py:45: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

def extract_cluster_labels(X, column_name):
    """
    Turn a per-row table of values into a single label column holding, for
    each row, the position of its smallest value.

    X: DataFrame with one column per cluster (in the pipelines below it is
        the output of the preceding KMeans step).
    column_name: name of the single output column.

    Returns a one-column DataFrame indexed like X.
    """
    # One vectorised argmin over the whole array instead of a Python-level
    # call per row; the resulting positions are the same.
    labels = X.to_numpy().argmin(axis=1)
    return pd.DataFrame({column_name: labels}, index=X.index)


def make_steps_cluster_labels_area_feature(n_clusters=6):
    """
    Build the optional step that adds a k-means cluster label computed from
    cluster_area_features as the 'AreaFeaturesCluster' column.

    n_clusters: number of k-means clusters; n_clusters=1 means disabled
        (any value below 2 yields no step).

    Returns (steps, drop_features); drop_features is always empty.
    """
    steps = []
    drop_features = []
    if not n_clusters >= 2:
        return steps, drop_features

    # impute -> scale -> k-means -> reduce the k-means output to one label column
    label_pipeline = Pipeline(
        [
            ('simple_imputer', SimpleImputer(strategy='mean')),
            ('standard_scaler', StandardScaler()),
            (
                'kmeans_clustering',
                KMeans(
                    n_clusters=n_clusters,
                    n_init='auto',
                    random_state=0,
                ),
            ),
            (
                'extract_labels',
                FunctionTransformer(
                    func=extract_cluster_labels,
                    kw_args={'column_name': 'AreaFeaturesCluster'},
                ),
            ),
        ]
    )
    # the area columns feed the clustering and are also passed through untouched
    add_labels = ColumnTransformer(
        [
            ('create_cluster_labels', label_pipeline, cluster_area_features),
            ('passthrough_cluster_features', 'passthrough', cluster_area_features),
        ],
        remainder='passthrough',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    steps = [('cluster_labels_area_features', add_labels)]
    return steps, drop_features


# Evaluate the cluster-label step with 6 clusters; the returned drop list is discarded.
steps_preproc, _ = make_steps_cluster_labels_area_feature(n_clusters=6)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# Render the preprocessing pipeline in the notebook for inspection.
display(pipe_preproc)
# pipe_preproc.fit_transform(X, y)
test_all_models('cluster_labels_area_feature', pipe, X, y)

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

Pipeline(steps=[('cluster_labels_area_features',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('create_cluster_labels',
                                                  Pipeline(steps=[('simple_imputer',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('kmeans_clustering',
                                                                   KMeans(n_clusters=6,
                                                                          n_init='auto',
                                                                          random_state=0)),
                                                                  ('extract_labels',
                                                                   FunctionTransformer(func=<function extract_cluster_labels at 0x7ff00ecd7560>,
                                                                                       kw_args={'column_name': 'AreaFeaturesCluster'}))]),
                                                  ['LotArea', 'TotalBsmtSF',
                                                   'FirstFlrSF', 'SecondFlrSF',
                                                   'GrLivArea']),
                                                 ('passthrough_cluster_features',
                                                  'passthrough',
                                                  ['LotArea', 'TotalBsmtSF',
                                                   'FirstFlrSF', 'SecondFlrSF',
                                                   'GrLivArea'])],
                                   verbose_feature_names_out=False))])

Pipeline(steps=[('cluster_labels_area_features',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('create_cluster_labels',
                                                  Pipeline(steps=[('simple_imputer',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('kmeans_clustering',
                                                                   KMeans(n_clusters=6,
                                                                          n_init='auto',
                                                                          random_state=0)),
                                                                  ('extract_labels',
                                                                   FunctionTransformer(func=<function extract_cluster_labels at 0x7ff00ecd7560>,
                                                                                       kw_args={'column_name': 'AreaFeaturesCluster'}))]),
                                                  ['LotArea', 'TotalBsmtSF',
                                                   'FirstFlrSF', 'SecondFlrSF',
                                                   'GrLivArea']),
                                                 ('passthrough_cluster_features',
                                                  'passthrough',
                                                  ['LotArea', 'TotalBsmtSF',
                                                   'FirstFlrSF', 'SecondFlrSF',
                                                   'GrLivArea'])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(n_jobs=1, remainder='passthrough',
                  transformers=[('create_cluster_labels',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler()),
                                                 ('kmeans_clustering',
                                                  KMeans(n_clusters=6,
                                                         n_init='auto',
                                                         random_state=0)),
                                                 ('extract_labels',
                                                  FunctionTransformer(func=<function extract_cluster_labels at 0x7ff00ecd7560>,
                                                                      kw_args={'column_name': 'AreaFeaturesCluster'}))]),
                                 ['LotArea', 'TotalBsmtSF', 'FirstFlrSF',
                                  'SecondFlrSF', 'GrLivArea']),
                                ('passthrough_cluster_features', 'passthrough',
                                 ['LotArea', 'TotalBsmtSF', 'FirstFlrSF',
                                  'SecondFlrSF', 'GrLivArea'])],
                  verbose_feature_names_out=False)

['LotArea', 'TotalBsmtSF', 'FirstFlrSF', 'SecondFlrSF', 'GrLivArea']

SimpleImputer()

StandardScaler()

KMeans(n_clusters=6, n_init='auto', random_state=0)

FunctionTransformer(func=<function extract_cluster_labels at 0x7ff00ecd7560>,
                    kw_args={'column_name': 'AreaFeaturesCluster'})

['LotArea', 'TotalBsmtSF', 'FirstFlrSF', 'SecondFlrSF', 'GrLivArea']

# NOTE(review): make_submission_all is defined elsewhere; presumably it fits on
# (X, y) and writes predictions for X_test under the given label -- confirm.
make_submission_all(pipe, X, y, X_test, 'cluster_labels_area_feature')

# Visualise the clusters: SalePrice against each area feature, coloured by cluster label.
Xy = pipe_preproc.fit_transform(X, y)
# Categorical dtype so that hue is treated as discrete labels.
Xy["AreaFeaturesCluster"] = Xy['AreaFeaturesCluster'].astype("category")
Xy["SalePrice"] = y
_ = sns.relplot(
    x="value",
    y="SalePrice",
    hue="AreaFeaturesCluster",
    col="var",
    height=4,
    aspect=1,
    facet_kws={'sharex': False},
    col_wrap=3,
    # long format: one row per (house, area feature) pair, one facet per feature
    data=Xy.melt(
        value_vars=cluster_area_features,
        id_vars=["SalePrice", "AreaFeaturesCluster"],
        var_name='var',
    ),
)

/home/florin/.local/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)

def rename_cluster_columns(X, prefix, numeric=True):
    """
    Rename every column of X to '<prefix>_<suffix>'.

    X: input DataFrame.
    prefix: string prepended to each new column name.
    numeric: when True the suffix is the column's position (0, 1, ...);
        when False it is the existing column name (which must be a string).

    Returns a renamed DataFrame.
    """
    current = X.columns.to_list()
    if numeric:
        mapping = {old: prefix + '_' + str(position) for position, old in enumerate(current)}
    else:
        mapping = {old: prefix + '_' + old for old in current}
    return X.rename(columns=mapping)


def make_steps_cluster_distances_area_features(n_clusters=6):
    """
    Build the optional step that adds one k-means output column per cluster,
    computed from cluster_area_features and renamed to
    'AreaFeatDistToCluster_<i>'.

    n_clusters: number of k-means clusters; n_clusters=1 means disabled
        (any value below 2 yields no step).

    Returns (steps, drop_features); drop_features is always empty.
    """
    steps = []
    drop_features = []
    if not n_clusters >= 2:
        return steps, drop_features

    # impute -> scale -> k-means -> give the k-means output columns stable names
    distance_pipeline = Pipeline(
        [
            ('simple_imputer', SimpleImputer(strategy='mean')),
            ('standard_scaler', StandardScaler()),
            (
                'kmeans_clustering',
                KMeans(
                    n_clusters=n_clusters,
                    n_init='auto',
                    random_state=0,
                ),
            ),
            (
                'rename_cluster_columns',
                FunctionTransformer(
                    func=rename_cluster_columns,
                    kw_args={'prefix': 'AreaFeatDistToCluster'},
                ),
            ),
        ]
    )
    # the area columns feed the clustering and are also passed through untouched
    add_distances = ColumnTransformer(
        [
            ('create_cluster_distances', distance_pipeline, cluster_area_features),
            ('passthrough_cluster_features', 'passthrough', cluster_area_features),
        ],
        remainder='passthrough',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    steps += [('cluster_distances_area_features', add_distances)]
    return steps, drop_features


# Evaluate the cluster-distance step with 6 clusters; the returned drop list is discarded.
steps_preproc, _ = make_steps_cluster_distances_area_features(n_clusters=6)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# display(pipe_preproc)
# pipe_preproc.fit_transform(X, y)
# Reports one score per model under the given label (see the cell output).
test_all_models('cluster_distances_area_features', pipe, X, y)

cluster_distances_area_features
xgboost  0.13731106727665227
lgbm     0.12690222447929816
catboost 0.11707804266761493
ridge    0.13254908258731002
enet     0.13698341814245335

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

# NOTE(review): make_submission_all is defined elsewhere; presumably it fits on
# (X, y) and writes predictions for X_test under the given label -- confirm.
make_submission_all(pipe, X, y, X_test, 'cluster_distances_area_features')

def apply_pca(X_orig):
    """
    Fit a full PCA on X_orig and return the model, the scores and the loadings.

    X_orig: numeric DataFrame (the caller below passes imputed, standardised data).

    Returns (pca, X_pca, loadings):
      pca      - the fitted PCA estimator
      X_pca    - scores, indexed like X_orig, columns 'PC1', 'PC2', ...
      loadings - one row per original feature, one column per component
    """
    frame = X_orig.copy()
    pca = PCA(random_state=0)
    scores = pca.fit_transform(frame)

    component_names = ["PC" + str(k) for k in range(1, scores.shape[1] + 1)]
    X_pca = pd.DataFrame(scores, index=frame.index, columns=component_names)

    # components_ has one row per component; transpose so rows are features
    loadings = pd.DataFrame(pca.components_.T, index=frame.columns, columns=component_names)
    return pca, X_pca, loadings


# Preparation for PCA on all numeric features: mean-impute, then standardise.
steps_pca_all = [
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ('standard_scaler', StandardScaler()),
]

pipe_pca_all = Pipeline(steps=steps_pca_all, verbose=False)
# Ask the pipeline for DataFrame output; apply_pca uses .index and .columns of its input.
pipe_pca_all.set_output(transform='pandas')


pca, X_pca, loadings = apply_pca(pipe_pca_all.fit_transform(X[features_numeric]))
# NOTE(review): plot_variance is defined elsewhere; presumably it plots the explained
# variance of the first 16 components -- confirm.
_ = plot_variance(pca, 16)

# Bare expressions: in a notebook each one displays the object as the cell result.
X_pca

loadings

# Heatmap of the loadings (features x components), colour scale centred at 0.
# sns.heatmap is given no `ax`, so it draws on the current axes, i.e. the one just created.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ax = sns.heatmap(loadings, cmap='icefire', center=0.0, square=True, cbar=False)

def make_steps_replace_numeric_with_pca_features(pca_features_count=10, drop_now=False):
    """
    Build the optional step that adds the leading PCA components of the
    numeric features.

    pca_features_count: number of leading components to keep;
        pca_features_count=0 means disabled (any value below 1 yields no step).
    drop_now: when True the original numeric features are consumed by the PCA
        branch and do not appear in the output. When False they are also
        passed through, and are returned in drop_features so that they can be
        dropped later.

    Returns (steps, drop_features). drop_features is a fresh list, never the
    module-level features_numeric object itself.
    """
    steps = []
    drop_features = []
    if pca_features_count >= 1:
        # Assumes the PCA step names its output columns pca0, pca1, ...;
        # the slice keeps at most the first pca_features_count of them.
        pca_features_list = ['pca' + str(i) for i in range(len(features_numeric))]
        ct_list = [
            (
                'add_pca_features',
                Pipeline(
                    [
                        ('impute_numeric', SimpleImputer(strategy='mean')),
                        ('standard_scaler', StandardScaler()),
                        ('pca', PCA(random_state=0)),
                        (
                            'filter_pca_features',
                            ColumnTransformer(
                                [
                                    (
                                        'passthrough',
                                        'passthrough',
                                        pca_features_list[:pca_features_count],
                                    )
                                ],
                                remainder='drop',
                                verbose_feature_names_out=False,
                            ),
                        ),
                    ]
                ),
                features_numeric,
            ),
        ]
        if not drop_now:
            ct_list += [('numeric_features', 'passthrough', features_numeric)]
            # Return a copy: handing out the shared list would let a caller's
            # in-place edit (e.g. `drop_features += [...]`) alter features_numeric.
            drop_features = list(features_numeric)
        steps += [
            (
                'add_pca_features',
                ColumnTransformer(
                    ct_list,
                    remainder='passthrough',
                    verbose_feature_names_out=False,
                ),
            )
        ]
    return steps, drop_features


# Evaluate replacing the numeric features by their first 10 PCA components.
# drop_now=True: the numeric originals are not passed through by this step.
steps_preproc, _ = make_steps_replace_numeric_with_pca_features(pca_features_count=10, drop_now=True)

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# display(pipe_preproc)
# display(pipe_preproc.fit_transform(X, y))
# Reports one score per model under the given label (see the cell output).
test_all_models('replace_numeric_with_pca_features', pipe, X, y)

replace_numeric_with_pca_features
xgboost  0.13312909019040395
lgbm     0.12747933412692666
catboost 0.12319170278761034
ridge    0.13565631495796443
enet     0.15047298891518832

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

# NOTE(review): make_submission_all is defined elsewhere; presumably it fits on
# (X, y) and writes predictions for X_test under the given label -- confirm.
make_submission_all(pipe, X, y, X_test, 'replace_numeric_with_pca_features')

# Elbow plot for k-means on the first five principal components.
cluster_evaluate_elbow(X_pca, ['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])

# Silhouette plots for n_clusters in range(9 - 5, 9 + 5), i.e. 4..13.
cluster_evaluate_silhouette(X_pca, ['PC1', 'PC2', 'PC3', 'PC4', 'PC5'], 9, 5)

best values: n_clusters=11 score=0.2575858589959137

/tmp/ipykernel_35469/4119913658.py:45: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()

def make_steps_cluster_labels_pca(cluster_labels_pca_features_count=10, n_clusters_labels_pca=9):
    """
    Build the pipeline step that clusters the rows with KMeans in PCA space.

    The numeric features are mean-imputed, standardized and projected with
    PCA; only the leading PCA columns are kept and handed to KMeans, whose
    output goes through `extract_cluster_labels` with
    column_name='PCACluster'. The numeric features themselves, and every
    other column, are passed through unchanged.

    n_clusters_labels_pca: how many clusters to create
    cluster_labels_pca_features_count: how many PCA features to use for cluster creation

    n_clusters_labels_pca=1 means disabled (no-op)
    cluster_labels_pca_features_count <= len(features_numeric)

    Returns a (steps, drop_features) tuple; drop_features is always empty here.
    """
    # Fewer than two clusters: nothing to add.
    if not (n_clusters_labels_pca >= 2):
        return [], []

    # Column names PCA is expected to emit (pca0, pca1, ...), one per numeric feature.
    component_names = [f'pca{i}' for i in range(len(features_numeric))]

    # Keep only the leading components; every other PCA column is dropped.
    keep_leading_components = ColumnTransformer(
        [
            (
                'passthrough',
                'passthrough',
                component_names[:cluster_labels_pca_features_count],
            )
        ],
        remainder='drop',
        verbose_feature_names_out=False,
    )
    kmeans = KMeans(
        n_clusters=n_clusters_labels_pca,
        n_init='auto',
        random_state=0,
    )
    to_label_column = FunctionTransformer(
        func=extract_cluster_labels,
        kw_args={'column_name': 'PCACluster'},
    )
    pca_kmeans = Pipeline(
        [
            ('impute_numeric', SimpleImputer(strategy='mean')),
            ('standard_scaler', StandardScaler()),
            ('pca', PCA(random_state=0)),
            ('filter_pca_features', keep_leading_components),
            ('kmeans_clustering', kmeans),
            ('extract_labels', to_label_column),
        ]
    )

    # The numeric features feed the clustering branch and are also kept as-is.
    add_cluster_labels = ColumnTransformer(
        [
            ('pca_kmeans', pca_kmeans, features_numeric),
            ('features_numeric_passthrough', 'passthrough', features_numeric),
        ],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )
    return [('cluster_labels_pca', add_cluster_labels)], []


# Experiment: add the PCA/KMeans cluster feature using the function's default
# settings; the returned drop list is discarded.
steps_preproc, _ = make_steps_cluster_labels_pca()

# Rebuild the pipelines from these steps, show the preprocessing pipeline and
# evaluate every model under the 'cluster_labels_pca' label.
pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
display(pipe_preproc)
# pipe_preproc.fit_transform(X, y)
test_all_models('cluster_labels_pca', pipe, X, y)

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

Pipeline(steps=[('cluster_labels_pca',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pca_kmeans',
                                                  Pipeline(steps=[('impute_numeric',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(random_state=0)),
                                                                  ('filter_pca_features',
                                                                   ColumnTransformer(transformers=[('passthrough',
                                                                                                    'passthrough',
                                                                                                    ['pca0',
                                                                                                     'pca1',
                                                                                                     'pca2',
                                                                                                     'pca3',
                                                                                                     'pca4',
                                                                                                     'pca5'...
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', 'FirstFlrSF',
                                                   'SecondFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'BedroomAbvGr',
                                                   'KitchenAbvGr',
                                                   'TotRmsAbvGrd', 'Fireplaces',
                                                   'GarageYrBlt', 'GarageCars',
                                                   'GarageArea', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'EnclosedPorch',
                                                   'ThreeSeasonPorch',
                                                   'ScreenPorch', 'PoolArea', ...])],
                                   verbose_feature_names_out=False))])

Pipeline(steps=[('cluster_labels_pca',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pca_kmeans',
                                                  Pipeline(steps=[('impute_numeric',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(random_state=0)),
                                                                  ('filter_pca_features',
                                                                   ColumnTransformer(transformers=[('passthrough',
                                                                                                    'passthrough',
                                                                                                    ['pca0',
                                                                                                     'pca1',
                                                                                                     'pca2',
                                                                                                     'pca3',
                                                                                                     'pca4',
                                                                                                     'pca5'...
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', 'FirstFlrSF',
                                                   'SecondFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'BedroomAbvGr',
                                                   'KitchenAbvGr',
                                                   'TotRmsAbvGrd', 'Fireplaces',
                                                   'GarageYrBlt', 'GarageCars',
                                                   'GarageArea', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'EnclosedPorch',
                                                   'ThreeSeasonPorch',
                                                   'ScreenPorch', 'PoolArea', ...])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('pca_kmeans',
                                 Pipeline(steps=[('impute_numeric',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler()),
                                                 ('pca', PCA(random_state=0)),
                                                 ('filter_pca_features',
                                                  ColumnTransformer(transformers=[('passthrough',
                                                                                   'passthrough',
                                                                                   ['pca0',
                                                                                    'pca1',
                                                                                    'pca2',
                                                                                    'pca3',
                                                                                    'pca4',
                                                                                    'pca5',
                                                                                    'pca6',
                                                                                    'pca7',
                                                                                    'pca8',
                                                                                    'pca9'])],
                                                                    verbos...
                                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                  'FirstFlrSF', 'SecondFlrSF', 'LowQualFinSF',
                                  'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
                                  'FullBath', 'HalfBath', 'BedroomAbvGr',
                                  'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
                                  'GarageYrBlt', 'GarageCars', 'GarageArea',
                                  'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                                  'ThreeSeasonPorch', 'ScreenPorch', 'PoolArea', ...])],
                  verbose_feature_names_out=False)

['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'FirstFlrSF', 'SecondFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ThreeSeasonPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrMoSold']

SimpleImputer()

StandardScaler()

PCA(random_state=0)

ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                 ['pca0', 'pca1', 'pca2', 'pca3', 'pca4',
                                  'pca5', 'pca6', 'pca7', 'pca8', 'pca9'])],
                  verbose_feature_names_out=False)

['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7', 'pca8', 'pca9']

def make_steps_cluster_distances_pca(cluster_distances_pca_features_count=10, n_clusters_distances_pca=7):
    """
    Build the pipeline step that adds KMeans cluster-distance features computed in PCA space.

    The numeric features are mean-imputed, standardized and projected with
    PCA; only the leading PCA columns are kept and handed to KMeans (used as
    a transformer), whose output columns go through `rename_cluster_columns`
    with prefix='PCADistToCluster'. The numeric features themselves, and
    every other column, are passed through unchanged.

    n_clusters_distances_pca: how many clusters to create
    cluster_distances_pca_features_count: how many PCA features to use for clustering

    n_clusters_distances_pca=1 means disabled
    1 <= cluster_distances_pca_features_count <= len(features_numeric)

    Returns a (steps, drop_features) tuple; drop_features is always empty here.
    """
    # Fewer than two clusters: nothing to add.
    if not (n_clusters_distances_pca >= 2):
        return [], []

    # Column names PCA is expected to emit (pca0, pca1, ...), one per numeric feature.
    component_names = [f'pca{i}' for i in range(len(features_numeric))]

    # Keep only the leading components; every other PCA column is dropped.
    keep_leading_components = ColumnTransformer(
        [
            (
                'passthrough',
                'passthrough',
                component_names[:cluster_distances_pca_features_count],
            )
        ],
        remainder='drop',
        verbose_feature_names_out=False,
    )
    kmeans = KMeans(
        n_clusters=n_clusters_distances_pca,
        n_init='auto',
        random_state=0,
    )
    rename_outputs = FunctionTransformer(
        func=rename_cluster_columns,
        kw_args={'prefix': 'PCADistToCluster'},
    )
    pca_kmeans = Pipeline(
        [
            ('impute_numeric', SimpleImputer(strategy='mean')),
            ('standard_scaler', StandardScaler()),
            ('pca', PCA(random_state=0)),
            ('filter_pca_features', keep_leading_components),
            ('kmeans_clustering', kmeans),
            ('rename_cluster_columns', rename_outputs),
        ]
    )

    # The numeric features feed the clustering branch and are also kept as-is.
    add_cluster_distances = ColumnTransformer(
        [
            ('pca_kmeans', pca_kmeans, features_numeric),
            ('features_numeric_passthrough', 'passthrough', features_numeric),
        ],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )
    return [('cluster_distances_pca', add_cluster_distances)], []


# Experiment: add the PCA/KMeans cluster-distance features with the function's
# default settings. The steps must be rebuilt here: without this call the cell
# silently reuses `steps_preproc` from the previous experiment and reports that
# pipeline's scores under the 'cluster_distances_pca' label.
steps_preproc, _ = make_steps_cluster_distances_pca()

# Rebuild the pipelines from these steps, show the preprocessing pipeline and
# evaluate every model.
pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
display(pipe_preproc)
# display(pipe_preproc.fit_transform(X, y))
test_all_models('cluster_distances_pca', pipe, X, y)

Pipeline(steps=[('cluster_labels_pca',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pca_kmeans',
                                                  Pipeline(steps=[('impute_numeric',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(random_state=0)),
                                                                  ('filter_pca_features',
                                                                   ColumnTransformer(transformers=[('passthrough',
                                                                                                    'passthrough',
                                                                                                    ['pca0',
                                                                                                     'pca1',
                                                                                                     'pca2',
                                                                                                     'pca3',
                                                                                                     'pca4',
                                                                                                     'pca5'...
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', 'FirstFlrSF',
                                                   'SecondFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'BedroomAbvGr',
                                                   'KitchenAbvGr',
                                                   'TotRmsAbvGrd', 'Fireplaces',
                                                   'GarageYrBlt', 'GarageCars',
                                                   'GarageArea', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'EnclosedPorch',
                                                   'ThreeSeasonPorch',
                                                   'ScreenPorch', 'PoolArea', ...])],
                                   verbose_feature_names_out=False))])

Pipeline(steps=[('cluster_labels_pca',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pca_kmeans',
                                                  Pipeline(steps=[('impute_numeric',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(random_state=0)),
                                                                  ('filter_pca_features',
                                                                   ColumnTransformer(transformers=[('passthrough',
                                                                                                    'passthrough',
                                                                                                    ['pca0',
                                                                                                     'pca1',
                                                                                                     'pca2',
                                                                                                     'pca3',
                                                                                                     'pca4',
                                                                                                     'pca5'...
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', 'FirstFlrSF',
                                                   'SecondFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'BedroomAbvGr',
                                                   'KitchenAbvGr',
                                                   'TotRmsAbvGrd', 'Fireplaces',
                                                   'GarageYrBlt', 'GarageCars',
                                                   'GarageArea', 'WoodDeckSF',
                                                   'OpenPorchSF',
                                                   'EnclosedPorch',
                                                   'ThreeSeasonPorch',
                                                   'ScreenPorch', 'PoolArea', ...])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('pca_kmeans',
                                 Pipeline(steps=[('impute_numeric',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler()),
                                                 ('pca', PCA(random_state=0)),
                                                 ('filter_pca_features',
                                                  ColumnTransformer(transformers=[('passthrough',
                                                                                   'passthrough',
                                                                                   ['pca0',
                                                                                    'pca1',
                                                                                    'pca2',
                                                                                    'pca3',
                                                                                    'pca4',
                                                                                    'pca5',
                                                                                    'pca6',
                                                                                    'pca7',
                                                                                    'pca8',
                                                                                    'pca9'])],
                                                                    verbos...
                                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                  'FirstFlrSF', 'SecondFlrSF', 'LowQualFinSF',
                                  'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
                                  'FullBath', 'HalfBath', 'BedroomAbvGr',
                                  'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
                                  'GarageYrBlt', 'GarageCars', 'GarageArea',
                                  'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                                  'ThreeSeasonPorch', 'ScreenPorch', 'PoolArea', ...])],
                  verbose_feature_names_out=False)

['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'FirstFlrSF', 'SecondFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ThreeSeasonPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrMoSold']

SimpleImputer()

StandardScaler()

PCA(random_state=0)

ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                 ['pca0', 'pca1', 'pca2', 'pca3', 'pca4',
                                  'pca5', 'pca6', 'pca7', 'pca8', 'pca9'])],
                  verbose_feature_names_out=False)

['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7', 'pca8', 'pca9']

passthrough

# Show the first 10 rows of the MI scores of the PCA components in X_pca
# against the target y (presumably mutual information — see make_mi_scores).
make_mi_scores(X_pca, y).head(10)

PC1     0.957850
PC4     0.113449
PC2     0.108403
PC3     0.106048
PC5     0.066863
PC14    0.061428
PC20    0.056117
PC28    0.054208
PC26    0.052675
PC24    0.048837
Name: MI Scores, dtype: float64

# The four basement square-footage columns examined together with PCA.
bsmt_sf = [
    "TotalBsmtSF",
    "BsmtFinSF2",
    "BsmtFinSF1",
    "BsmtUnfSF",
]

# Preprocess only these columns with pipe_pca_all, then run PCA on the result;
# apply_pca returns the fitted PCA object, the transformed data and the loadings.
pca_corr_mat_bsmt_sf, X_pca_corr_mat_bsmt_sf, loadings_corr_mat_bsmt_sf = apply_pca(
    pipe_pca_all.fit_transform(X[bsmt_sf])
)
# NOTE(review): the meaning of the second argument (16) is defined by
# plot_variance, which is not visible here — confirm.
_ = plot_variance(pca_corr_mat_bsmt_sf, 16)

# Display the loadings as the cell output.
loadings_corr_mat_bsmt_sf

def make_steps_basement_sf_pca(enable=True, drop_now=False, pca_features_count=3):
    """
    Build the pipeline step that adds PCA features of the basement square-footage columns.

    The four basement columns (TotalBsmtSF, BsmtFinSF2, BsmtFinSF1,
    BsmtUnfSF) are mean-imputed, standardized and projected with PCA; the
    leading `pca_features_count` components are kept and go through
    `rename_cluster_columns` with prefix='BsmtSfPCA'. All other columns are
    passed through unchanged.

    enable: False means disabled (no-op, returns two empty lists)
    drop_now: if True the original basement columns are not passed through;
        if False they are kept next to the PCA features and returned in
        drop_features so the caller can drop them later
    pca_features_count: how many leading PCA components to keep
        (1 <= pca_features_count <= 4); the default of 3 is the value that
        used to be hard-coded

    Returns a (steps, drop_features) tuple.
    Raises ValueError if enabled and pca_features_count is out of range.
    """
    steps = []
    drop_features = []
    if enable:
        bsmt_sf = ["TotalBsmtSF", "BsmtFinSF2", "BsmtFinSF1", "BsmtUnfSF"]
        # PCA yields one component per input column, so more cannot be selected.
        if not 1 <= pca_features_count <= len(bsmt_sf):
            raise ValueError(
                f'pca_features_count must be between 1 and {len(bsmt_sf)}, got {pca_features_count}'
            )
        # Names of the leading PCA output columns (pca0, pca1, ...) to keep.
        bsmt_sf_pca_features_list = [f'pca{i}' for i in range(pca_features_count)]
        ct_steps = [
            (
                'create_pca_features',
                Pipeline(
                    [
                        ('impute_numeric', SimpleImputer(strategy='mean')),
                        ('standard_scaler', StandardScaler()),
                        ('pca', PCA(random_state=0)),
                        (
                            'filter_pca_features',
                            ColumnTransformer(
                                [
                                    (
                                        'passthrough',
                                        'passthrough',
                                        bsmt_sf_pca_features_list,
                                    )
                                ],
                                remainder='drop',
                                verbose_feature_names_out=False,
                            ),
                        ),
                        (
                            # Step name kept as-is: it is part of the pipeline's parameter names.
                            'rename_cluster_columns',
                            FunctionTransformer(
                                func=rename_cluster_columns,
                                kw_args={'prefix': 'BsmtSfPCA'},
                            ),
                        ),
                    ]
                ),
                bsmt_sf,
            ),
        ]
        if not drop_now:
            # Keep the originals for now and report them for a later drop.
            ct_steps += [('orig_features_passthrough', 'passthrough', bsmt_sf)]
            drop_features += bsmt_sf
        steps += [
            (
                'basement_sf_pca',
                ColumnTransformer(
                    ct_steps,
                    remainder='passthrough',
                    verbose_feature_names_out=False,
                ),
            )
        ]
    return steps, drop_features


# Experiment: replace the basement square-footage columns with their PCA
# features. With drop_now=True the original columns are not passed through;
# the returned drop list is discarded.
steps_preproc, _ = make_steps_basement_sf_pca(drop_now=True)

# Rebuild the pipelines from these steps, show the preprocessing pipeline and
# evaluate every model under the 'basement_sf_pca' label.
pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
display(pipe_preproc)
# print(f'drop: {_}')
# display(pipe_preproc.fit_transform(X, y))
test_all_models('basement_sf_pca', pipe, X, y)

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

Pipeline(steps=[('basement_sf_pca',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('create_pca_features',
                                                  Pipeline(steps=[('impute_numeric',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(random_state=0)),
                                                                  ('filter_pca_features',
                                                                   ColumnTransformer(transformers=[('passthrough',
                                                                                                    'passthrough',
                                                                                                    ['pca0',
                                                                                                     'pca1',
                                                                                                     'pca2'])],
                                                                                     verbose_feature_names_out=False)),
                                                                  ('rename_cluster_columns',
                                                                   FunctionTransformer(func=<function rename_cluster_columns at 0x7ff00c120680>,
                                                                                       kw_args={'prefix': 'BsmtSfPCA'}))]),
                                                  ['TotalBsmtSF', 'BsmtFinSF2',
                                                   'BsmtFinSF1',
                                                   'BsmtUnfSF'])],
                                   verbose_feature_names_out=False))])

Pipeline(steps=[('basement_sf_pca',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('create_pca_features',
                                                  Pipeline(steps=[('impute_numeric',
                                                                   SimpleImputer()),
                                                                  ('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(random_state=0)),
                                                                  ('filter_pca_features',
                                                                   ColumnTransformer(transformers=[('passthrough',
                                                                                                    'passthrough',
                                                                                                    ['pca0',
                                                                                                     'pca1',
                                                                                                     'pca2'])],
                                                                                     verbose_feature_names_out=False)),
                                                                  ('rename_cluster_columns',
                                                                   FunctionTransformer(func=<function rename_cluster_columns at 0x7ff00c120680>,
                                                                                       kw_args={'prefix': 'BsmtSfPCA'}))]),
                                                  ['TotalBsmtSF', 'BsmtFinSF2',
                                                   'BsmtFinSF1',
                                                   'BsmtUnfSF'])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('create_pca_features',
                                 Pipeline(steps=[('impute_numeric',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler()),
                                                 ('pca', PCA(random_state=0)),
                                                 ('filter_pca_features',
                                                  ColumnTransformer(transformers=[('passthrough',
                                                                                   'passthrough',
                                                                                   ['pca0',
                                                                                    'pca1',
                                                                                    'pca2'])],
                                                                    verbose_feature_names_out=False)),
                                                 ('rename_cluster_columns',
                                                  FunctionTransformer(func=<function rename_cluster_columns at 0x7ff00c120680>,
                                                                      kw_args={'prefix': 'BsmtSfPCA'}))]),
                                 ['TotalBsmtSF', 'BsmtFinSF2', 'BsmtFinSF1',
                                  'BsmtUnfSF'])],
                  verbose_feature_names_out=False)

['TotalBsmtSF', 'BsmtFinSF2', 'BsmtFinSF1', 'BsmtUnfSF']

SimpleImputer()

StandardScaler()

PCA(random_state=0)

ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                 ['pca0', 'pca1', 'pca2'])],
                  verbose_feature_names_out=False)

['pca0', 'pca1', 'pca2']

def make_steps_target_encoder(enable=True):
    """
    Build the pipeline step that target-encodes the 'Neighborhood' feature.

    enable: when falsy, no step is produced.

    Returns: (steps, drop_features). steps is either empty or holds a single
    'target_encoder' ColumnTransformer; drop_features is always empty here.
    """
    if not enable:
        return [], []

    # location, location, location
    te_features = ['Neighborhood']

    # encode the selected columns, then pass the result through
    # rename_cluster_columns with the 'TargetEncoded' prefix
    encode_and_rename = Pipeline(
        [
            ('encoder', TargetEncoder(min_samples_leaf=20, smoothing=10)),
            (
                'rename_columns',
                FunctionTransformer(
                    func=rename_cluster_columns,
                    kw_args={'prefix': 'TargetEncoded', 'numeric': False},
                ),
            ),
        ]
    )

    # the second transformer passes the same columns through untouched, so the
    # original feature is emitted next to its encoded version
    target_encoder = ColumnTransformer(
        [
            ('encoder', encode_and_rename, te_features),
            ('passthrough_transformed', 'passthrough', te_features),
        ],
        remainder='passthrough',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    return [('target_encoder', target_encoder)], []


# Try the target-encoder step in isolation: build the pipelines around it,
# show the preprocessing pipeline and its transformed output, then score it
# with every model. The drop-feature list is ignored (it is always empty
# for this step).
steps_preproc, _ = make_steps_target_encoder()

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
display(pipe_preproc)
display(pipe_preproc.fit_transform(X, y))
test_all_models('target_encoder', pipe, X, y)

Pipeline(steps=[('target_encoder',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('encoder',
                                                  Pipeline(steps=[('encoder',
                                                                   TargetEncoder()),
                                                                  ('rename_columns',
                                                                   FunctionTransformer(func=<function rename_cluster_columns at 0x7ff00c120680>,
                                                                                       kw_args={'numeric': False,
                                                                                                'prefix': 'TargetEncoded'}))]),
                                                  ['Neighborhood']),
                                                 ('passthrough_transformed',
                                                  'passthrough',
                                                  ['Neighborhood'])],
                                   verbose_feature_names_out=False))])

Pipeline(steps=[('target_encoder',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('encoder',
                                                  Pipeline(steps=[('encoder',
                                                                   TargetEncoder()),
                                                                  ('rename_columns',
                                                                   FunctionTransformer(func=<function rename_cluster_columns at 0x7ff00c120680>,
                                                                                       kw_args={'numeric': False,
                                                                                                'prefix': 'TargetEncoded'}))]),
                                                  ['Neighborhood']),
                                                 ('passthrough_transformed',
                                                  'passthrough',
                                                  ['Neighborhood'])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(n_jobs=1, remainder='passthrough',
                  transformers=[('encoder',
                                 Pipeline(steps=[('encoder', TargetEncoder()),
                                                 ('rename_columns',
                                                  FunctionTransformer(func=<function rename_cluster_columns at 0x7ff00c120680>,
                                                                      kw_args={'numeric': False,
                                                                               'prefix': 'TargetEncoded'}))]),
                                 ['Neighborhood']),
                                ('passthrough_transformed', 'passthrough',
                                 ['Neighborhood'])],
                  verbose_feature_names_out=False)

['Neighborhood']

TargetEncoder()

FunctionTransformer(func=<function rename_cluster_columns at 0x7ff00c120680>,
                    kw_args={'numeric': False, 'prefix': 'TargetEncoded'})

['Neighborhood']

passthrough

passthrough

# Uses the `pipe` built in the target-encoder cell; make_submission_all is
# defined elsewhere -- presumably it produces submissions tagged 'target_encoder'.
make_submission_all(pipe, X, y, X_test, 'target_encoder')

def make_steps_drop_frequent_nan_features(max_acceptable_nan_proportion=0.1, drop_now=False):
    """
    Build the step that removes features whose NaN proportion is too high.

    max_acceptable_nan_proportion: features with a larger NaN proportion are
        selected for removal; 1.0 means disabled
    drop_now: if True, the columns are dropped by this very step; otherwise
        the step is a 'passthrough' placeholder and the columns are reported
        back to the caller via drop_features

    Reads the module-level `features_nan_proportion` (NaN proportion indexed
    by feature name).

    Returns: (steps, drop_features)
    """
    if not max_acceptable_nan_proportion < 1.0:
        # disabled: nothing to add, nothing to drop
        return [], []

    too_many_nans = features_nan_proportion > max_acceptable_nan_proportion
    features_to_drop = features_nan_proportion[too_many_nans].index.to_list()

    if not drop_now:
        # keep a named placeholder step and let the caller drop the columns later
        return [('drop_frequent_nan_features', 'passthrough')], features_to_drop

    dropper = ColumnTransformer(
        [('drop', 'drop', features_to_drop)],
        remainder='passthrough',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    return [('drop_frequent_nan_features', dropper)], []


# Per-feature proportion of missing values, highest first. This module-level
# Series is read by make_steps_drop_frequent_nan_features().
features_nan_proportion = X.isna().mean().sort_values(ascending=False)

# With drop_now=True the columns are dropped inside the step itself, so the
# returned drop-feature list (bound to `_`) is empty.
steps_preproc, _ = make_steps_drop_frequent_nan_features(drop_now=True)
print(f'drop: {_}')

# Build the pipelines around this single step, show the preprocessing
# pipeline and its transformed output, then score it with every model.
pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
display(pipe_preproc)
display(pipe_preproc.fit_transform(X, y))
test_all_models('drop_frequent_nan_features', pipe, X, y)

drop: []

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

Pipeline(steps=[('drop_frequent_nan_features',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('drop', 'drop',
                                                  ['PoolQC', 'MiscFeature',
                                                   'Alley', 'Fence',
                                                   'MasVnrType', 'FireplaceQu',
                                                   'LotFrontage'])],
                                   verbose_feature_names_out=False))])

Pipeline(steps=[('drop_frequent_nan_features',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('drop', 'drop',
                                                  ['PoolQC', 'MiscFeature',
                                                   'Alley', 'Fence',
                                                   'MasVnrType', 'FireplaceQu',
                                                   'LotFrontage'])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(n_jobs=1, remainder='passthrough',
                  transformers=[('drop', 'drop',
                                 ['PoolQC', 'MiscFeature', 'Alley', 'Fence',
                                  'MasVnrType', 'FireplaceQu',
                                  'LotFrontage'])],
                  verbose_feature_names_out=False)

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu', 'LotFrontage']

drop

passthrough

drop_frequent_nan_features
xgboost  0.13321589484241608
lgbm     0.12747931598754508
catboost 0.11781978430774054
ridge    0.13457851578174262
enet     0.13356035530108173

# Uses the `pipe` built in the drop-frequent-NaN cell; make_submission_all is
# defined elsewhere -- presumably it produces submissions tagged
# 'drop_frequent_nan_features'.
make_submission_all(pipe, X, y, X_test, 'drop_frequent_nan_features')

def make_steps_simple_imputer(enable=True):
    """
    Build the imputation step: median for numeric columns, most frequent
    value for all other columns.

    enable: when falsy, no step is produced.

    Returns: (steps, drop_features); drop_features is always empty here.
    """
    if not enable:
        return [], []

    # columns are routed by dtype: 'number' goes to the numeric imputer,
    # everything else goes to the categorical imputer
    numeric_branch = (
        'numeric_imputer',
        SimpleImputer(strategy='median'),
        make_column_selector(dtype_include='number'),
    )
    categorical_branch = (
        'categorical_imputer',
        SimpleImputerKeepCategories(strategy='most_frequent'),
        make_column_selector(dtype_exclude='number'),
    )
    imputer = ColumnTransformer(
        [numeric_branch, categorical_branch],
        remainder='drop',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    return [('simple_imputer_split_num_cat', imputer)], []


# Try the simple-imputer step in isolation and score it with every model.
# The drop-feature list is ignored (it is always empty for this step).
steps_preproc, _ = make_steps_simple_imputer()

pipe_preproc, pipe = rebuild_all_pipelines(steps_preproc)
# Optional inspection of the preprocessing pipeline and its output:
# display(pipe_preproc)
# pipe_preproc.fit_transform(X, y).info()
test_all_models('simple_imputer', pipe, X, y)

simple_imputer
xgboost  0.13568360140025726
lgbm     0.1277483757240745
catboost 0.11963081619388051
ridge    0.13577055969382246
enet     0.13689558352362224

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(

def rebuild_pipe_preproc_from_args(pipe_args):
    """
    Assemble the complete preprocessing pipeline from a dict of flags.

    pipe_args: flags to enable/disable (or parametrize) the various pipeline
        steps; each step has been tested individually above.

    Every make_steps_*() builder returns (steps, drop_features). The steps are
    chained in the order the builders are called below; the drop_features are
    accumulated and applied once, in a final 'drop_features_final' step.

    Returns: a pipeline with all data processing steps enabled by the args,
    configured to output pandas DataFrames.
    """
    all_steps = []
    pending_drops = []

    def collect(builder_result):
        # fold one builder's (steps, drop_features) into the accumulators
        new_steps, new_drops = builder_result
        all_steps.extend(new_steps)
        pending_drops.extend(new_drops)

    collect(
        make_steps_pca_outliers_score(
            min_explained_variation=pipe_args['pca_outliers_compon_min_expl_var'],
            pca_score_distribution=pipe_args['pca_outliers_score_distribution'],
        )
    )
    collect(
        make_steps_mathematical_transforms(
            enable_living_lot_ratio=pipe_args['living_lot_ratio'],
            enable_spaciousness=pipe_args['spaciousness'],
            enable_outside_sf_features=pipe_args['outside_sf_features'],
            enable_outdoor_area_type_count=pipe_args['outdoor_area_type_count'],
        )
    )
    collect(
        make_steps_area_to_linear(
            lot_area=pipe_args['a2l_lot_area'],
            total_bsmt_sf=pipe_args['a2l_total_bsmt_sf'],
            first_flr_sf=pipe_args['a2l_first_flr_sf'],
            gr_liv_area=pipe_args['a2l_gr_liv_area'],
        )
    )
    collect(make_steps_coalesce_mssubclass(action=pipe_args['coalesce_mssubclass']))
    collect(make_steps_GrLivArea_BldgType_interaction(action=pipe_args['GrLivArea_BldgType_interaction']))
    collect(
        make_steps_qual_cond_interactions(
            garage_interaction=pipe_args['garage_interaction'],
            exterior_interaction=pipe_args['exterior_interaction'],
            overall_interaction=pipe_args['overall_interaction'],
            basement_interaction=pipe_args['basement_interaction'],
        )
    )
    collect(
        make_steps_num_cat_interactions(
            basement_num_cat=pipe_args['basement_num_cat_interaction'],
            pool_num_cat=pipe_args['pool_num_cat_interaction'],
        )
    )
    collect(
        make_steps_median_diff_groupby(
            group_LotArea_by_MSZoning=pipe_args['group_LotArea_by_MSZoning'],
            group_GrLivArea_by_MSSubClass=pipe_args['group_GrLivArea_by_MSSubClass'],
            group_LotArea_by_MSSubClass=pipe_args['group_LotArea_by_MSSubClass'],
            group_GrLivArea_by_HouseStyle=pipe_args['group_GrLivArea_by_HouseStyle'],
        )
    )
    collect(make_steps_cluster_labels_area_feature(n_clusters=pipe_args['area_features_cluster_labels']))
    collect(make_steps_cluster_distances_area_features(n_clusters=pipe_args['area_features_cluster_distances']))
    collect(make_steps_replace_numeric_with_pca_features(pca_features_count=pipe_args['pca_replace_num_features']))
    collect(
        make_steps_cluster_labels_pca(
            cluster_labels_pca_features_count=pipe_args['cluster_labels_pca_features_count'],
            n_clusters_labels_pca=pipe_args['n_clusters_labels_pca'],
        )
    )
    collect(
        make_steps_cluster_distances_pca(
            cluster_distances_pca_features_count=pipe_args['cluster_distances_pca_features_count'],
            n_clusters_distances_pca=pipe_args['n_clusters_distances_pca'],
        )
    )
    collect(make_steps_basement_sf_pca(enable=pipe_args['basement_sf_pca']))
    collect(make_steps_target_encoder(enable=pipe_args['target_encoder']))
    collect(make_steps_drop_frequent_nan_features(max_acceptable_nan_proportion=pipe_args['max_acceptable_nan']))
    collect(make_steps_drop_uninformative(mi_scores=mi_scores, min_score=pipe_args['min_mutual_info_score']))
    collect(make_steps_simple_imputer(enable=pipe_args['simple_imputer']))

    # at most one of the two numeric transforms is applied; any other value
    # (e.g. None) adds nothing
    numeric_transform = pipe_args['log_quantile_transform_numeric']
    if numeric_transform == 'log':
        collect(make_steps_log_transform(log_transform_features, enable=True))
    elif numeric_transform == 'quantile':
        collect(make_steps_numeric_quantile_transformer(enable=True))

    # remove duplicates
    unique_drops = list(set(pending_drops))
    # the drop list was collected at all steps above, this is where it is applied
    final_dropper = ColumnTransformer(
        [('drop', 'drop', unique_drops)],
        remainder='passthrough',
        n_jobs=1,
        verbose_feature_names_out=False,
    )
    all_steps.append(('drop_features_final', final_dropper))

    pipe_preproc = Pipeline(steps=all_steps, verbose=False)
    pipe_preproc.set_output(transform='pandas')
    return pipe_preproc

def rebuild_model_pipe_from_args(model_name, pipe_preproc, model_args):
    """
    model_name: string with name of model; one of 'xgboost', 'lgbm',
        'catboost', 'ridge', 'enet'
    pipe_preproc: a fully built pipeline with data processing steps
    model_args: model-specific parameters (used by Optuna)

    Returns a complete pipeline, including the regression model.

    Raises ValueError if model_name is not one of the supported names
    (previously this surfaced as an UnboundLocalError on the return line).
    """
    if model_name == 'xgboost':
        return rebuild_xgboost(pipe_preproc, model_args=model_args)
    if model_name == 'lgbm':
        return rebuild_lgbm(pipe_preproc, model_args=model_args)
    if model_name == 'catboost':
        return rebuild_catboost(pipe_preproc, model_args=model_args)
    if model_name == 'ridge':
        return rebuild_ridge(pipe_preproc, model_args=model_args)
    if model_name == 'enet':
        return rebuild_enet(pipe_preproc, model_args=model_args)
    raise ValueError(
        f"unknown model_name {model_name!r}; expected one of "
        "'xgboost', 'lgbm', 'catboost', 'ridge', 'enet'"
    )

def make_pipe_args(trial):
    """
    Sample the preprocessing configuration for one Optuna trial.

    trial: an Optuna trial with flags to enable/disable various data processing steps

    Reads the module-level `features_numeric` to bound the PCA feature counts.

    Returns pipe_args for rebuild_pipe_preproc_from_args(). Parameters are
    suggested in a fixed order, and each dict key equals its Optuna parameter name.
    """

    def on_off(name):
        return trial.suggest_categorical(name, [False, True])

    def tri_state(name):
        return trial.suggest_categorical(name, ['disable', 'enable_drop', 'enable'])

    pipe_args = {}

    # min_explained_variation < 0.1 means disabled
    pipe_args['pca_outliers_compon_min_expl_var'] = trial.suggest_float('pca_outliers_compon_min_expl_var', 0.0, 1.0)
    pipe_args['pca_outliers_score_distribution'] = trial.suggest_categorical(
        'pca_outliers_score_distribution', ['uniform', 'normal']
    )

    for name in (
        'living_lot_ratio',
        'spaciousness',
        'outside_sf_features',
        'outdoor_area_type_count',
    ):
        pipe_args[name] = on_off(name)

    for name in (
        'a2l_lot_area',
        'a2l_total_bsmt_sf',
        'a2l_first_flr_sf',
        'a2l_gr_liv_area',
        'coalesce_mssubclass',
        'GrLivArea_BldgType_interaction',
    ):
        pipe_args[name] = tri_state(name)

    for name in (
        'garage_interaction',
        'exterior_interaction',
        'overall_interaction',
        'basement_interaction',
        'basement_num_cat_interaction',
        'pool_num_cat_interaction',
        'group_LotArea_by_MSZoning',
        'group_GrLivArea_by_MSSubClass',
        'group_LotArea_by_MSSubClass',
        'group_GrLivArea_by_HouseStyle',
    ):
        pipe_args[name] = on_off(name)

    # area_features_cluster_labels=1 means disabled
    pipe_args['area_features_cluster_labels'] = trial.suggest_int('area_features_cluster_labels', 1, 30)
    # area_features_cluster_distances=1 means disabled
    pipe_args['area_features_cluster_distances'] = trial.suggest_int('area_features_cluster_distances', 1, 30)

    numeric_count = len(features_numeric)
    # pca_replace_num_features=0 means disabled
    pipe_args['pca_replace_num_features'] = trial.suggest_int('pca_replace_num_features', 0, numeric_count)
    # n_clusters_labels_pca=1 means disabled
    pipe_args['n_clusters_labels_pca'] = trial.suggest_int('n_clusters_labels_pca', 1, 30)
    pipe_args['cluster_labels_pca_features_count'] = trial.suggest_int(
        'cluster_labels_pca_features_count', 1, numeric_count
    )
    # n_clusters_distances_pca=1 means disabled
    pipe_args['n_clusters_distances_pca'] = trial.suggest_int('n_clusters_distances_pca', 1, 30)
    pipe_args['cluster_distances_pca_features_count'] = trial.suggest_int(
        'cluster_distances_pca_features_count', 1, numeric_count
    )

    for name in ('basement_sf_pca', 'target_encoder'):
        pipe_args[name] = on_off(name)

    # max_acceptable_nan=1.0 means disabled
    pipe_args['max_acceptable_nan'] = trial.suggest_float('max_acceptable_nan', 0.0001, 1.0, log=True)
    # min_mutual_info_score=0.0 means disabled
    pipe_args['min_mutual_info_score'] = trial.suggest_float('min_mutual_info_score', 0.0, 0.01)
    pipe_args['simple_imputer'] = on_off('simple_imputer')
    pipe_args['log_quantile_transform_numeric'] = trial.suggest_categorical(
        'log_quantile_transform_numeric', [None, 'log', 'quantile']
    )
    return pipe_args

def make_model_args(trial, study_name):
    """
    Sample the model hyper-parameters for one Optuna trial.

    trial: an Optuna trial with model parameters to try
    study_name: the name of the model or study

    Returns: model_args for rebuild_model_pipe_from_args(). An unrecognized
    study_name yields an empty dict and suggests nothing.
    """
    if study_name == 'xgboost':
        params = {
            'random_state': 0,
            'verbosity': 0,
            'n_jobs': 1,
            'n_estimators': trial.suggest_int('xgboost_n_estimators', 500, 10000, log=False),
            'max_depth': trial.suggest_int('xgboost_max_depth', 2, 6),
            'learning_rate': trial.suggest_float('xgboost_learning_rate', 0.002, 0.1, log=True),
            'gamma': trial.suggest_float('xgboost_gamma', 0.0, 2.0),
            'min_child_weight': trial.suggest_int('xgboost_min_child_weight', 1, 7),
            'subsample': trial.suggest_float('xgboost_subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('xgboost_colsample_bytree', 0.1, 1.0, log=True),
            'reg_alpha': trial.suggest_float('xgboost_reg_alpha', 0.0, 2.0),
            'reg_lambda': trial.suggest_float('xgboost_reg_lambda', 0.0, 15.0),
            'num_parallel_tree': trial.suggest_int('xgboost_num_parallel_tree', 0, 6),
        }
        # a sampled 0 is passed on as None
        if params['num_parallel_tree'] == 0:
            params['num_parallel_tree'] = None
        return params

    if study_name == 'lgbm':
        params = {
            'random_state': 0,
            'n_jobs': 1,
            'verbose': -1,
            'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.005, 0.1, log=True),
            'reg_alpha': trial.suggest_float('lgbm_reg_alpha', 0.0, 20.0, log=False),
            'reg_lambda': trial.suggest_float('lgbm_reg_lambda', 0.0, 10.0, log=False),
            'max_depth': trial.suggest_int('lgbm_max_depth', 3, 8),
        }
        # num_leaves is sampled as a fraction of 2**max_depth
        leaves_fraction = trial.suggest_float('lgbm_num_leaves_frac_max_depth_exp', 0.2, 0.8)
        params['num_leaves'] = round(leaves_fraction * (2 ** params['max_depth']))
        params.update(
            {
                'n_estimators': trial.suggest_int('lgbm_n_estimators', 500, 5000),
                'bagging_freq': trial.suggest_int('lgbm_bagging_freq', 0, 10),
                'bagging_fraction': trial.suggest_float('lgbm_bagging_fraction', 0.5, 1.0),
                'feature_fraction': trial.suggest_float('lgbm_feature_fraction', 0.1, 1.0, log=True),
                'max_bin': trial.suggest_int('lgbm_max_bin', 200, 2000),
                'min_data_in_leaf': trial.suggest_int('lgbm_min_data_in_leaf', 1, 10),
                'min_split_gain': trial.suggest_float('lgbm_min_split_gain', 1e-5, 1.0, log=True),
            }
        )
        return params

    if study_name == 'catboost':
        return {
            'random_state': 0,
            'logging_level': 'Silent',
            'thread_count': 1,
            'iterations': trial.suggest_int('catboost_iterations', 500, 10000),
            'learning_rate': trial.suggest_float('catboost_learning_rate', 0.001, 0.1, log=True),
            'l2_leaf_reg': trial.suggest_int('catboost_l2_leaf_reg', 2, 8),
            'random_strength': trial.suggest_float('catboost_random_strength', 0.01, 10.0, log=True),
            'early_stopping_rounds': trial.suggest_int('catboost_early_stopping_rounds', 4, 20),
            'depth': trial.suggest_int('catboost_depth', 2, 8),
            'subsample': trial.suggest_float('catboost_subsample', 0.3, 1.0),
            'bagging_temperature': trial.suggest_float('catboost_bagging_temperature', 0.01, 100.0, log=True),
            'colsample_bylevel': trial.suggest_float('catboost_colsample_bylevel', 0.02, 1.0, log=True),
            'min_data_in_leaf': trial.suggest_int('catboost_min_data_in_leaf', 1, 120),
        }

    if study_name == 'ridge':
        return {
            'random_state': 0,
            'alpha': trial.suggest_float('ridge_alpha', 3.0, 16.0),
        }

    if study_name == 'enet':
        return {
            'random_state': 0,
            'max_iter': 10000,
            'selection': trial.suggest_categorical('enet_selection', ['cyclic', 'random']),
            'alpha': trial.suggest_float('enet_alpha', 0.5, 5.0, log=True),
            'l1_ratio': trial.suggest_float('enet_l1_ratio', 0.2, 0.9, log=True),
        }

    return {}

def objective(trial, study_name):
    """
    Optuna objective function for single model pipelines.

    trial: the Optuna trial to sample parameters from
    study_name: the name of the model being tuned

    Returns: the result of test_performance() for the pipeline built from
    the sampled preprocessing flags and model parameters.
    """
    # sample the preprocessing flags first, then the model parameters
    preproc_flags = make_pipe_args(trial)
    hyper_params = make_model_args(trial, study_name)

    full_pipe = rebuild_model_pipe_from_args(
        model_name=study_name,
        pipe_preproc=rebuild_pipe_preproc_from_args(pipe_args=preproc_flags),
        model_args=hyper_params,
    )
    score = test_performance(pipe=full_pipe, X=X, y=y, n_splits=num_folds, n_jobs=cpu_count)
    return score

# A MySQL server (its URL given via the OPTUNA_URL environment variable) is
# used for distributed optimization; otherwise, just use a local SQLite file.
OPTUNA_URL = os.environ.get('OPTUNA_URL', 'sqlite:///optuna.db')


# one Optuna study per baseline model, keyed by model name
study = {}
baseline_models = ['xgboost', 'lgbm', 'catboost', 'ridge', 'enet']
# studies listed here are deleted and started from scratch
# optuna_reset_list = baseline_models
optuna_reset_list = []

for m in baseline_models:
    if m in optuna_reset_list:
        print(f'delete study {m}')
        try:
            optuna.delete_study(
                study_name=m,
                storage=OPTUNA_URL,
            )
        except Exception as err:
            # Deleting is best-effort (e.g. the study may not exist yet), but
            # report why it failed instead of swallowing everything, and let
            # KeyboardInterrupt / SystemExit propagate.
            print(f'could not delete study {m}: {err!r}')
time.sleep(0.1)
for m in baseline_models:
    # load_if_exists=True resumes a study already present in the storage
    study[m] = optuna.create_study(
        direction='minimize',
        study_name=m,
        storage=OPTUNA_URL,
        load_if_exists=True,
    )

[I 2023-08-14 13:19:14,276] Using an existing study with name 'xgboost' instead of creating a new one.
[I 2023-08-14 13:19:14,308] Using an existing study with name 'lgbm' instead of creating a new one.
[I 2023-08-14 13:19:14,340] Using an existing study with name 'catboost' instead of creating a new one.
[I 2023-08-14 13:19:14,371] Using an existing study with name 'ridge' instead of creating a new one.
[I 2023-08-14 13:19:14,399] Using an existing study with name 'enet' instead of creating a new one.

# Run the hyper-parameter search for every baseline model. The optimize()
# call is commented out, so as written this loop only prints the model names;
# uncomment it to actually run (or resume) the studies.
for m in baseline_models:
    print()
    print(m)
    # study[m].optimize(lambda trial: objective(trial, m), n_trials=3000, catch=(ValueError,), show_progress_bar=False)

xgboost

lgbm

catboost

ridge

enet

# Best parameters per model. They are cached in 'best_params_<model>.json':
# if the file is missing it is created from the study's best trial, then the
# file is (always) read back, so an existing file takes precedence.
best_params = {}
for m in baseline_models:
    print()
    print(m)
    bp_file = 'best_params_' + m + '.json'
    if not os.path.exists(bp_file):
        # fetch the best trial BEFORE reporting it; the original printed
        # best_trial first, which raised NameError (or showed the previous
        # model's trial) whenever a file had to be created
        best_trial = study[m].best_trial
        print(f'best trial: {best_trial.number}, best score: {best_trial.value}')
        with open(bp_file, 'w') as bpf:
            bpf.write(json.dumps(best_trial.params, indent=2))
    with open(bp_file, 'r') as bpf:
        best_params[m] = json.load(bpf)
        print(best_params[m])

xgboost
{'pca_outliers_compon_min_expl_var': 0.6792160610201244, 'pca_outliers_score_distribution': 'uniform', 'living_lot_ratio': True, 'spaciousness': True, 'outside_sf_features': True, 'outdoor_area_type_count': False, 'a2l_lot_area': 'disable', 'a2l_total_bsmt_sf': 'enable_drop', 'a2l_first_flr_sf': 'enable', 'a2l_gr_liv_area': 'disable', 'coalesce_mssubclass': 'enable_drop', 'GrLivArea_BldgType_interaction': 'disable', 'garage_interaction': True, 'exterior_interaction': False, 'overall_interaction': True, 'basement_interaction': False, 'basement_num_cat_interaction': False, 'pool_num_cat_interaction': True, 'group_LotArea_by_MSZoning': True, 'group_GrLivArea_by_MSSubClass': False, 'group_LotArea_by_MSSubClass': True, 'group_GrLivArea_by_HouseStyle': True, 'area_features_cluster_labels': 5, 'area_features_cluster_distances': 22, 'pca_replace_num_features': 0, 'n_clusters_labels_pca': 10, 'cluster_labels_pca_features_count': 8, 'n_clusters_distances_pca': 26, 'cluster_distances_pca_features_count': 2, 'basement_sf_pca': True, 'target_encoder': True, 'max_acceptable_nan': 0.9898168862678122, 'min_mutual_info_score': 0.006684153644376341, 'simple_imputer': False, 'log_quantile_transform_numeric': None, 'xgboost_n_estimators': 9167, 'xgboost_max_depth': 3, 'xgboost_learning_rate': 0.007954996213047077, 'xgboost_gamma': 1.8878965142297428, 'xgboost_min_child_weight': 1, 'xgboost_subsample': 0.8057864465614245, 'xgboost_colsample_bytree': 0.11573611124523174, 'xgboost_reg_alpha': 1.9207869752515219, 'xgboost_reg_lambda': 2.8588908361664043, 'xgboost_num_parallel_tree': 3}

lgbm
{'pca_outliers_compon_min_expl_var': 0.4693924952169375, 'pca_outliers_score_distribution': 'uniform', 'living_lot_ratio': True, 'spaciousness': False, 'outside_sf_features': True, 'outdoor_area_type_count': False, 'a2l_lot_area': 'enable_drop', 'a2l_total_bsmt_sf': 'enable_drop', 'a2l_first_flr_sf': 'enable_drop', 'a2l_gr_liv_area': 'enable', 'coalesce_mssubclass': 'enable_drop', 'GrLivArea_BldgType_interaction': 'enable', 'garage_interaction': False, 'exterior_interaction': False, 'overall_interaction': True, 'basement_interaction': False, 'basement_num_cat_interaction': False, 'pool_num_cat_interaction': False, 'group_LotArea_by_MSZoning': True, 'group_GrLivArea_by_MSSubClass': False, 'group_LotArea_by_MSSubClass': False, 'group_GrLivArea_by_HouseStyle': False, 'area_features_cluster_labels': 3, 'area_features_cluster_distances': 1, 'pca_replace_num_features': 4, 'n_clusters_labels_pca': 28, 'cluster_labels_pca_features_count': 13, 'n_clusters_distances_pca': 1, 'cluster_distances_pca_features_count': 31, 'basement_sf_pca': False, 'target_encoder': False, 'max_acceptable_nan': 0.9905825130421669, 'min_mutual_info_score': 0.007685542570961947, 'simple_imputer': False, 'log_quantile_transform_numeric': 'quantile', 'lgbm_learning_rate': 0.02165737167195487, 'lgbm_reg_alpha': 10.850386698476989, 'lgbm_reg_lambda': 0.2304919757106442, 'lgbm_max_depth': 4, 'lgbm_num_leaves_frac_max_depth_exp': 0.5030905420520119, 'lgbm_n_estimators': 3351, 'lgbm_bagging_freq': 9, 'lgbm_bagging_fraction': 0.7690824854888628, 'lgbm_feature_fraction': 0.423111768380243, 'lgbm_max_bin': 1974, 'lgbm_min_data_in_leaf': 1, 'lgbm_min_split_gain': 0.031341064357919134}

catboost
{'pca_outliers_compon_min_expl_var': 0.3521045184497489, 'pca_outliers_score_distribution': 'normal', 'living_lot_ratio': True, 'spaciousness': False, 'outside_sf_features': True, 'outdoor_area_type_count': False, 'a2l_lot_area': 'enable_drop', 'a2l_total_bsmt_sf': 'disable', 'a2l_first_flr_sf': 'disable', 'a2l_gr_liv_area': 'disable', 'coalesce_mssubclass': 'disable', 'GrLivArea_BldgType_interaction': 'disable', 'garage_interaction': False, 'exterior_interaction': True, 'overall_interaction': True, 'basement_interaction': True, 'basement_num_cat_interaction': True, 'pool_num_cat_interaction': False, 'group_LotArea_by_MSZoning': True, 'group_GrLivArea_by_MSSubClass': False, 'group_LotArea_by_MSSubClass': True, 'group_GrLivArea_by_HouseStyle': True, 'area_features_cluster_labels': 9, 'area_features_cluster_distances': 15, 'pca_replace_num_features': 0, 'n_clusters_labels_pca': 11, 'cluster_labels_pca_features_count': 30, 'n_clusters_distances_pca': 7, 'cluster_distances_pca_features_count': 14, 'basement_sf_pca': False, 'target_encoder': True, 'max_acceptable_nan': 0.11352702557833981, 'min_mutual_info_score': 0.007539091012842199, 'simple_imputer': False, 'log_quantile_transform_numeric': None, 'catboost_iterations': 7657, 'catboost_learning_rate': 0.013038680410097994, 'catboost_l2_leaf_reg': 4, 'catboost_random_strength': 0.0479014817267622, 'catboost_early_stopping_rounds': 14, 'catboost_depth': 5, 'catboost_subsample': 0.6192770345762606, 'catboost_bagging_temperature': 0.09690984798868421, 'catboost_colsample_bylevel': 0.1313914681422102, 'catboost_min_data_in_leaf': 13}

ridge
{'pca_outliers_compon_min_expl_var': 0.643179173453164, 'pca_outliers_score_distribution': 'uniform', 'living_lot_ratio': True, 'spaciousness': False, 'outside_sf_features': False, 'outdoor_area_type_count': False, 'a2l_lot_area': 'enable_drop', 'a2l_total_bsmt_sf': 'enable_drop', 'a2l_first_flr_sf': 'enable_drop', 'a2l_gr_liv_area': 'disable', 'coalesce_mssubclass': 'enable', 'GrLivArea_BldgType_interaction': 'disable', 'garage_interaction': True, 'exterior_interaction': True, 'overall_interaction': True, 'basement_interaction': True, 'basement_num_cat_interaction': True, 'pool_num_cat_interaction': False, 'group_LotArea_by_MSZoning': False, 'group_GrLivArea_by_MSSubClass': False, 'group_LotArea_by_MSSubClass': False, 'group_GrLivArea_by_HouseStyle': False, 'area_features_cluster_labels': 6, 'area_features_cluster_distances': 21, 'pca_replace_num_features': 6, 'n_clusters_labels_pca': 9, 'cluster_labels_pca_features_count': 20, 'n_clusters_distances_pca': 29, 'cluster_distances_pca_features_count': 1, 'basement_sf_pca': False, 'target_encoder': True, 'max_acceptable_nan': 0.00048401658660185334, 'min_mutual_info_score': 0.006594125449177525, 'simple_imputer': False, 'log_quantile_transform_numeric': 'log', 'ridge_alpha': 10.996218159097683}

enet
{'pca_outliers_compon_min_expl_var': 0.15676597280159815, 'pca_outliers_score_distribution': 'uniform', 'living_lot_ratio': False, 'spaciousness': True, 'outside_sf_features': False, 'outdoor_area_type_count': False, 'a2l_lot_area': 'enable', 'a2l_total_bsmt_sf': 'enable_drop', 'a2l_first_flr_sf': 'disable', 'a2l_gr_liv_area': 'enable', 'coalesce_mssubclass': 'enable_drop', 'GrLivArea_BldgType_interaction': 'enable_drop', 'garage_interaction': True, 'exterior_interaction': False, 'overall_interaction': True, 'basement_interaction': True, 'basement_num_cat_interaction': True, 'pool_num_cat_interaction': False, 'group_LotArea_by_MSZoning': True, 'group_GrLivArea_by_MSSubClass': False, 'group_LotArea_by_MSSubClass': False, 'group_GrLivArea_by_HouseStyle': False, 'area_features_cluster_labels': 9, 'area_features_cluster_distances': 7, 'pca_replace_num_features': 0, 'n_clusters_labels_pca': 13, 'cluster_labels_pca_features_count': 14, 'n_clusters_distances_pca': 28, 'cluster_distances_pca_features_count': 6, 'basement_sf_pca': True, 'target_encoder': True, 'max_acceptable_nan': 0.0345505313614619, 'min_mutual_info_score': 0.0017260615859535154, 'simple_imputer': False, 'log_quantile_transform_numeric': 'log', 'enet_selection': 'cyclic', 'enet_alpha': 1.300552633069198, 'enet_l1_ratio': 0.44339141668657467}

# Hyperparameter importances are slow to compute, so they are cached on disk as one
# CSV per model. A model's cache is rebuilt when the model is listed in
# optuna_reset_list (i.e. Optuna was re-run for it) or when its CSV does not exist yet.

# optuna_reset_list = []
param_importance = {}
for m in tqdm(baseline_models):
    pi_file = f'param_importance_{m}.csv'

    # fast path: reuse the cached table
    if m not in optuna_reset_list and os.path.exists(pi_file):
        param_importance[m] = pd.read_csv(pi_file)
        continue

    # slow path: recompute from the study, then refresh the cache
    pi_dict = optuna.importance.get_param_importances(study[m])
    param_importance[m] = pd.DataFrame(
        {
            'Hyperparameter': pi_dict.keys(),
            'Importance for Objective Value': pi_dict.values(),
        }
    )
    param_importance[m].to_csv(pi_file, index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1703.20it/s]

# For each model, render three static PNG figures: the hyperparameter importances,
# the optimization history, and slice plots for the first five hyperparameters
# listed in param_importance[m].
for m in baseline_models:
    print('-------------------------------------------------------------------------------')

    # bar chart of importances, first listed hyperparameter at the top
    fig1 = px.bar(
        data_frame=param_importance[m],
        x='Importance for Objective Value',
        y='Hyperparameter',
        title=f'{m} - Hyperparameter Importances',
    )
    fig1.update_layout(
        yaxis={'autorange': 'reversed'},
        width=1000,
        height=1000,
    )

    fig2 = optuna.visualization.plot_optimization_history(study[m])
    fig2.update_layout(
        title=f'{m} - Optimization History Plot',
        autosize=False,
        width=1000,
        height=360,
    )

    fig3 = optuna.visualization.plot_slice(
        study[m],
        params=param_importance[m]['Hyperparameter'].head(5).to_list(),
    )
    fig3.update_layout(
        title=f'{m} - Slice Plot',
        autosize=False,
        width=1200,
        height=360,
    )

    fig1.show('png')
    fig2.show('png')
    fig3.show('png')

-------------------------------------------------------------------------------

-------------------------------------------------------------------------------

-------------------------------------------------------------------------------

-------------------------------------------------------------------------------

-------------------------------------------------------------------------------

# Slice plot for a window of five xgboost hyperparameters, taken in the order they
# are stored in param_importance; raise i_min to move the window further down the list.
m = 'xgboost'
i_min = 0
p_list = param_importance[m]['Hyperparameter'].iloc[i_min : i_min + 5].to_list()
print(p_list)
fig = optuna.visualization.plot_slice(study[m], params=p_list)
fig.update_layout(
    title=f'{m} - Slice Plot',
    width=1300,
    height=400,
)
fig.show('png')

['xgboost_n_estimators', 'xgboost_reg_lambda', 'xgboost_learning_rate', 'pca_replace_num_features', 'target_encoder']

# Draw the slice plot of a single pipeline hyperparameter once per model,
# so its relation to the objective value can be compared across models.
p = 'pca_replace_num_features'
for m in baseline_models:
    fig = optuna.visualization.plot_slice(
        study[m],
        params=[p],
    )
    fig.update_layout(
        title=f'{m} - Slice Plot',
        autosize=False,
        width=400,
        height=350,
    )
    fig.show('png')

def visualize_model_parameters(m, log_x=False):
    """Scatter-plot each model-specific hyperparameter of a study against the objective value.

    One subplot is drawn for every column of ``study[m].trials_dataframe()`` whose
    name starts with ``'params_' + m``. Subplots are laid out on a grid three
    columns wide; grid cells without a parameter are hidden. The list of plotted
    column names is printed after the figure is shown.

    Parameters
    ----------
    m : str
        Key into the module-level ``study`` mapping; also the prefix used to
        select the model's own hyperparameter columns.
    log_x : bool, default False
        If True, every parameter column whose dtype is not ``object`` is drawn
        with a logarithmic x axis that reuses the strictly positive tick
        positions of the linear axis, labelled as plain numbers.
    """
    trials_df = study[m].trials_dataframe()
    # keep only the columns holding this model's own hyperparameters
    param_columns = [c for c in trials_df.columns.to_list() if c.startswith('params_' + m)]
    if not param_columns:
        # plt.subplots() rejects a grid with zero rows; nothing to draw
        print(param_columns)
        return

    n_col = 3
    n_row = int(np.ceil(len(param_columns) / n_col))
    # squeeze=False keeps the axes array two-dimensional even for a single row,
    # so one indexing scheme works for every grid size
    _, ax = plt.subplots(nrows=n_row, ncols=n_col, figsize=(4 * n_col, 3 * n_row), squeeze=False)
    axes = ax.ravel()  # row-major order: left to right, then top to bottom

    y = trials_df['value'].to_list()  # objective value, shared by every subplot
    for ax_curr, c in zip(axes, param_columns):
        ax_curr.scatter(x=trials_df[c].to_list(), y=y, s=5)
        if log_x and trials_df[c].dtype != 'object':
            # capture the linear ticks before switching scale; zero and negative
            # positions cannot be placed on a log axis, so they are dropped
            x_ticks = [t for t in ax_curr.get_xticks() if t > 0.0]
            ax_curr.set_xscale('log')
            ax_curr.get_xaxis().set_major_locator(FixedLocator(x_ticks))
            ax_curr.get_xaxis().set_major_formatter(ScalarFormatter())
        ax_curr.set_title(c)

    # hide the grid cells that did not receive a parameter
    for ax_unused in axes[len(param_columns):]:
        ax_unused.set_visible(False)

    # plt.show() works with non-GUI (inline) backends, where Figure.show() only
    # emits a UserWarning
    plt.show()
    print(param_columns)

# Scatter plots of the xgboost hyperparameters against the objective value (linear x axes).
visualize_model_parameters('xgboost', log_x=False)

/tmp/ipykernel_35469/4190299772.py:26: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

['params_xgboost_colsample_bytree', 'params_xgboost_gamma', 'params_xgboost_learning_rate', 'params_xgboost_max_depth', 'params_xgboost_min_child_weight', 'params_xgboost_n_estimators', 'params_xgboost_num_parallel_tree', 'params_xgboost_reg_alpha', 'params_xgboost_reg_lambda', 'params_xgboost_subsample']

# Scatter plots of the lgbm hyperparameters against the objective value (linear x axes).
visualize_model_parameters('lgbm', log_x=False)

/tmp/ipykernel_35469/4190299772.py:26: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

['params_lgbm_bagging_fraction', 'params_lgbm_bagging_freq', 'params_lgbm_feature_fraction', 'params_lgbm_learning_rate', 'params_lgbm_max_bin', 'params_lgbm_max_depth', 'params_lgbm_min_data_in_leaf', 'params_lgbm_min_split_gain', 'params_lgbm_n_estimators', 'params_lgbm_num_leaves_frac_max_depth_exp', 'params_lgbm_reg_alpha', 'params_lgbm_reg_lambda']

# Scatter plots of the catboost hyperparameters against the objective value;
# log_x=True puts non-object parameter columns on a logarithmic x axis.
visualize_model_parameters('catboost', log_x=True)

/tmp/ipykernel_35469/4190299772.py:26: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

['params_catboost_bagging_temperature', 'params_catboost_colsample_bylevel', 'params_catboost_depth', 'params_catboost_early_stopping_rounds', 'params_catboost_iterations', 'params_catboost_l2_leaf_reg', 'params_catboost_learning_rate', 'params_catboost_min_data_in_leaf', 'params_catboost_random_strength', 'params_catboost_subsample']

# Scatter plots of the ridge hyperparameters against the objective value (linear x axes).
visualize_model_parameters('ridge', log_x=False)

/tmp/ipykernel_35469/4190299772.py:26: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

['params_ridge_alpha']

# Scatter plots of the enet hyperparameters against the objective value;
# log_x=True puts non-object parameter columns on a logarithmic x axis.
visualize_model_parameters('enet', log_x=True)

/tmp/ipykernel_35469/4190299772.py:26: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

['params_enet_alpha', 'params_enet_l1_ratio', 'params_enet_selection']

# Constructor arguments that are fixed per model rather than taken from best_params.
fixed_model_args = {
    'xgboost': {'random_state': 0, 'verbosity': 0, 'n_jobs': 1},
    'lgbm': {'random_state': 0, 'n_jobs': 1, 'verbose': -1},
    'catboost': {
        'random_state': 0,
        'logging_level': 'Silent',
        'thread_count': 1,
    },
    'ridge': {'random_state': 0},
    'enet': {'random_state': 0, 'max_iter': 10000},
}

# Rebuild one full pipeline (preprocessing + estimator) per model from its best parameters.
best_model = {}
for m in baseline_models:
    best_pipe_args = {}
    # copy, so the lookup table above is never mutated
    best_model_args = dict(fixed_model_args.get(m, {}))

    # Keys prefixed with '<model>_' are estimator arguments (stored without the prefix);
    # every other key configures the preprocessing pipeline.
    for k, v in best_params[m].items():
        if not k.startswith(m + '_'):
            best_pipe_args[k] = v
            continue
        best_model_args[k.partition('_')[2]] = v

    # TODO: fix these exceptions here and in make_model_args()
    if m == 'xgboost' and best_model_args['num_parallel_tree'] == 0:
        # a value of 0 is passed on to the estimator as None
        best_model_args['num_parallel_tree'] = None
    if m == 'lgbm':
        # num_leaves is stored as a fraction of 2 ** max_depth; convert it to the
        # integer the estimator expects and drop the helper key
        leaves_frac = best_model_args['num_leaves_frac_max_depth_exp']
        best_model_args['num_leaves'] = round(leaves_frac * (2 ** best_model_args['max_depth']))
        del best_model_args['num_leaves_frac_max_depth_exp']

    print(best_model_args)
    pipe_preproc = rebuild_pipe_preproc_from_args(pipe_args=best_pipe_args)
    best_model[m] = rebuild_model_pipe_from_args(
        model_name=m,
        pipe_preproc=pipe_preproc,
        model_args=best_model_args,
    )

{'random_state': 0, 'verbosity': 0, 'n_jobs': 1, 'n_estimators': 9167, 'max_depth': 3, 'learning_rate': 0.007954996213047077, 'gamma': 1.8878965142297428, 'min_child_weight': 1, 'subsample': 0.8057864465614245, 'colsample_bytree': 0.11573611124523174, 'reg_alpha': 1.9207869752515219, 'reg_lambda': 2.8588908361664043, 'num_parallel_tree': 3}
{'random_state': 0, 'n_jobs': 1, 'verbose': -1, 'learning_rate': 0.02165737167195487, 'reg_alpha': 10.850386698476989, 'reg_lambda': 0.2304919757106442, 'max_depth': 4, 'n_estimators': 3351, 'bagging_freq': 9, 'bagging_fraction': 0.7690824854888628, 'feature_fraction': 0.423111768380243, 'max_bin': 1974, 'min_data_in_leaf': 1, 'min_split_gain': 0.031341064357919134, 'num_leaves': 8}
{'random_state': 0, 'logging_level': 'Silent', 'thread_count': 1, 'iterations': 7657, 'learning_rate': 0.013038680410097994, 'l2_leaf_reg': 4, 'random_strength': 0.0479014817267622, 'early_stopping_rounds': 14, 'depth': 5, 'subsample': 0.6192770345762606, 'bagging_temperature': 0.09690984798868421, 'colsample_bylevel': 0.1313914681422102, 'min_data_in_leaf': 13}
{'random_state': 0, 'alpha': 10.996218159097683}
{'random_state': 0, 'max_iter': 10000, 'selection': 'cyclic', 'alpha': 1.300552633069198, 'l1_ratio': 0.44339141668657467}

/home/florin/.local/lib/python3.11/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning:

With transform="pandas", `func` should return a DataFrame to follow the set_output API.

# Notebook cell expression: display the rebuilt xgboost pipeline.
best_model['xgboost']

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                              gpu_id=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None,
                              learning_rate=0.007954996213047077, max_bin=None,
                              max_cat_threshold=None, max_cat_to_onehot=None,
                              max_delta_step=None, max_depth=3, max_leaves=None,
                              min_child_weight=1, missing=nan,
                              monotone_constraints=None, n_estimators=9167,
                              n_jobs=1, num_parallel_tree=3, predictor=None,
                              random_state=0, ...))])

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                              gpu_id=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None,
                              learning_rate=0.007954996213047077, max_bin=None,
                              max_cat_threshold=None, max_cat_to_onehot=None,
                              max_delta_step=None, max_depth=3, max_leaves=None,
                              min_child_weight=1, missing=nan,
                              monotone_constraints=None, n_estimators=9167,
                              n_jobs=1, num_parallel_tree=3, predictor=None,
                              random_state=0, ...))])

Pipeline(steps=[('mark_pca_outliers',
                 ColumnTransformer(transformers=[('mark_pca_outliers',
                                                  Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                   ColumnTransformer(n_jobs=1,
                                                                                     transformers=[('impute_standardize_numeric',
                                                                                                    Pipeline(steps=[('simple_imputer',
                                                                                                                     SimpleImputer()),
                                                                                                                    ('standard_scaler',
                                                                                                                     StandardScaler())]),
                                                                                                    <sklearn.compose._column_transformer.make_column_sel...
                ('drop_uninformative_later', 'passthrough'),
                ('drop_features_final',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('drop', 'drop',
                                                  ['ThreeSeasonPorch',
                                                   'TotalBsmtSF', 'Utilities',
                                                   'Street', 'Condition2',
                                                   'RoofMatl', 'LandSlope',
                                                   'YrMoSold', 'BsmtFinSF2',
                                                   'PoolQC', 'BsmtUnfSF',
                                                   'PoolArea', 'BsmtFinSF1',
                                                   'MSSubClass', 'MiscVal'])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(transformers=[('mark_pca_outliers',
                                 Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                  ColumnTransformer(n_jobs=1,
                                                                    transformers=[('impute_standardize_numeric',
                                                                                   Pipeline(steps=[('simple_imputer',
                                                                                                    SimpleImputer()),
                                                                                                   ('standard_scaler',
                                                                                                    StandardScaler())]),
                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5121d90>),
                                                                                  ('one_h...
                                                  <__main__.pcaPandas object at 0x7fefd52b04d0>),
                                                 ('standard_scaler_out',
                                                  QuantileTransformer(random_state=0))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefdd9afe50>),
                                ('passthrough_original_features', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52e9ad0>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefdd9afe50>

ColumnTransformer(n_jobs=1,
                  transformers=[('impute_standardize_numeric',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5121d90>),
                                ('one_hot_encode_categorical',
                                 OneHotEncoder(dtype=<class 'int'>,
                                               handle_unknown='infrequent_if_exist',
                                               min_frequency=1,
                                               sparse_output=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7ff035f8dd10>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5121d90>

SimpleImputer()

StandardScaler()

<sklearn.compose._column_transformer.make_column_selector object at 0x7ff035f8dd10>

# Notebook cell expression: display the rebuilt lgbm pipeline.
best_model['lgbm']

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                ('lgbm',
                 LGBMRegressor(bagging_fraction=0.7690824854888628,
                               bagging_freq=9,
                               feature_fraction=0.423111768380243,
                               learning_rate=0.02165737167195487, max_bin=1974,
                               max_depth=4, min_data_in_leaf=1,
                               min_split_gain=0.031341064357919134,
                               n_estimators=3351, n_jobs=1, num_leaves=8,
                               random_state=0, reg_alpha=10.850386698476989,
                               reg_lambda=0.2304919757106442, verbose=-1))])

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                ('lgbm',
                 LGBMRegressor(bagging_fraction=0.7690824854888628,
                               bagging_freq=9,
                               feature_fraction=0.423111768380243,
                               learning_rate=0.02165737167195487, max_bin=1974,
                               max_depth=4, min_data_in_leaf=1,
                               min_split_gain=0.031341064357919134,
                               n_estimators=3351, n_jobs=1, num_leaves=8,
                               random_state=0, reg_alpha=10.850386698476989,
                               reg_lambda=0.2304919757106442, verbose=-1))])

Pipeline(steps=[('mark_pca_outliers',
                 ColumnTransformer(transformers=[('mark_pca_outliers',
                                                  Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                   ColumnTransformer(n_jobs=1,
                                                                                     transformers=[('impute_standardize_numeric',
                                                                                                    Pipeline(steps=[('simple_imputer',
                                                                                                                     SimpleImputer()),
                                                                                                                    ('standard_scaler',
                                                                                                                     StandardScaler())]),
                                                                                                    <sklearn.compose._column_transformer.make_column_sel...
                                                   'OpenPorchSF', 'BsmtFinSF1',
                                                   'KitchenAbvGr', 'Utilities',
                                                   'Street', 'MasVnrArea',
                                                   'WoodDeckSF', 'HalfBath',
                                                   'ScreenPorch', 'LotArea',
                                                   'LandSlope', 'YrMoSold',
                                                   'LowQualFinSF', 'PoolArea',
                                                   'BsmtUnfSF', 'Fireplaces',
                                                   'MiscVal', 'LotFrontage',
                                                   'YearBuilt', 'TotRmsAbvGrd',
                                                   'GarageCars', 'TotalBsmtSF',
                                                   'Condition2', 'RoofMatl',
                                                   'GrLivArea', 'FullBath', ...])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(transformers=[('mark_pca_outliers',
                                 Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                  ColumnTransformer(n_jobs=1,
                                                                    transformers=[('impute_standardize_numeric',
                                                                                   Pipeline(steps=[('simple_imputer',
                                                                                                    SimpleImputer()),
                                                                                                   ('standard_scaler',
                                                                                                    StandardScaler())]),
                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515d150>),
                                                                                  ('one_h...
                                                  <__main__.pcaPandas object at 0x7fefd7181d10>),
                                                 ('standard_scaler_out',
                                                  QuantileTransformer(random_state=0))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515c390>),
                                ('passthrough_original_features', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515f990>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515c390>

ColumnTransformer(n_jobs=1,
                  transformers=[('impute_standardize_numeric',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515d150>),
                                ('one_hot_encode_categorical',
                                 OneHotEncoder(dtype=<class 'int'>,
                                               handle_unknown='infrequent_if_exist',
                                               min_frequency=1,
                                               sparse_output=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515c9d0>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515d150>

SimpleImputer()

StandardScaler()

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd515c9d0>

# Notebook cell expression: display the rebuilt catboost pipeline.
best_model['catboost']

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                                                                    'LandSlope',
                                                                    'YrMoSold',
                                                                    'LowQualFinSF',
                                                                    'PoolArea',
                                                                    'FireplaceQu',
                                                                    'MiscVal',
                                                                    'LotFrontage',
                                                                    'MiscFeature',
                                                                    'Condition2',
                                                                    'RoofMatl',
                                                                    'Fence',
                                                                    'MasVnrType',
                                                                    'PoolQC'])],
                                                    verbose_feature_names_out=False))])),
                ('categorical_to_numeric',
                 FunctionTransformer(func=<function cat_to_num at 0x7ff036d65bc0>)),
                ('catboost',
                 <catboost.core.CatBoostRegressor object at 0x7fefd5160210>)])

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                                                                    'LandSlope',
                                                                    'YrMoSold',
                                                                    'LowQualFinSF',
                                                                    'PoolArea',
                                                                    'FireplaceQu',
                                                                    'MiscVal',
                                                                    'LotFrontage',
                                                                    'MiscFeature',
                                                                    'Condition2',
                                                                    'RoofMatl',
                                                                    'Fence',
                                                                    'MasVnrType',
                                                                    'PoolQC'])],
                                                    verbose_feature_names_out=False))])),
                ('categorical_to_numeric',
                 FunctionTransformer(func=<function cat_to_num at 0x7ff036d65bc0>)),
                ('catboost',
                 <catboost.core.CatBoostRegressor object at 0x7fefd5160210>)])

Pipeline(steps=[('mark_pca_outliers',
                 ColumnTransformer(transformers=[('mark_pca_outliers',
                                                  Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                   ColumnTransformer(n_jobs=1,
                                                                                     transformers=[('impute_standardize_numeric',
                                                                                                    Pipeline(steps=[('simple_imputer',
                                                                                                                     SimpleImputer()),
                                                                                                                    ('standard_scaler',
                                                                                                                     StandardScaler())]),
                                                                                                    <sklearn.compose._column_transformer.make_column_sel...
                ('drop_features_final',
                 ColumnTransformer(n_jobs=1, remainder='passthrough',
                                   transformers=[('drop', 'drop',
                                                  ['ThreeSeasonPorch', 'Alley',
                                                   'BsmtFinSF2', 'Utilities',
                                                   'Street', 'LotArea',
                                                   'LandSlope', 'YrMoSold',
                                                   'LowQualFinSF', 'PoolArea',
                                                   'FireplaceQu', 'MiscVal',
                                                   'LotFrontage', 'MiscFeature',
                                                   'Condition2', 'RoofMatl',
                                                   'Fence', 'MasVnrType',
                                                   'PoolQC'])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(transformers=[('mark_pca_outliers',
                                 Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                  ColumnTransformer(n_jobs=1,
                                                                    transformers=[('impute_standardize_numeric',
                                                                                   Pipeline(steps=[('simple_imputer',
                                                                                                    SimpleImputer()),
                                                                                                   ('standard_scaler',
                                                                                                    StandardScaler())]),
                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f2350>),
                                                                                  ('one_h...
                                                  <__main__.pcaPandas object at 0x7fefd529b850>),
                                                 ('standard_scaler_out',
                                                  QuantileTransformer(output_distribution='normal',
                                                                      random_state=0))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f21d0>),
                                ('passthrough_original_features', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f1d50>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f21d0>

ColumnTransformer(n_jobs=1,
                  transformers=[('impute_standardize_numeric',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f2350>),
                                ('one_hot_encode_categorical',
                                 OneHotEncoder(dtype=<class 'int'>,
                                               handle_unknown='infrequent_if_exist',
                                               min_frequency=1,
                                               sparse_output=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f2f10>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f2350>

SimpleImputer()

StandardScaler()

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd52f2f10>

# Echo the tuned ridge pipeline so the notebook renders its structure.
best_model['ridge']

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                                                                handle_unknown='infrequent_if_exist',
                                                                min_frequency=1,
                                                                sparse_output=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff00ccd8250>)],
                                   verbose_feature_names_out=False)),
                ('ridge',
                 TransformedTargetRegressor(regressor=Ridge(alpha=10.996218159097683,
                                                            random_state=0),
                                            transformer=QuantileTransformer(output_distribution='normal',
                                                                            random_state=0)))])

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                                                                handle_unknown='infrequent_if_exist',
                                                                min_frequency=1,
                                                                sparse_output=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff00ccd8250>)],
                                   verbose_feature_names_out=False)),
                ('ridge',
                 TransformedTargetRegressor(regressor=Ridge(alpha=10.996218159097683,
                                                            random_state=0),
                                            transformer=QuantileTransformer(output_distribution='normal',
                                                                            random_state=0)))])

Pipeline(steps=[('mark_pca_outliers',
                 ColumnTransformer(transformers=[('mark_pca_outliers',
                                                  Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                   ColumnTransformer(n_jobs=1,
                                                                                     transformers=[('impute_standardize_numeric',
                                                                                                    Pipeline(steps=[('simple_imputer',
                                                                                                                     SimpleImputer()),
                                                                                                                    ('standard_scaler',
                                                                                                                     StandardScaler())]),
                                                                                                    <sklearn.compose._column_transformer.make_column_sel...
                                                   'GarageCond', 'GarageFinish',
                                                   'BsmtHalfBath',
                                                   'BsmtExposure', 'BsmtFinSF2',
                                                   'OpenPorchSF', 'BsmtFinSF1',
                                                   'BsmtCond', 'KitchenAbvGr',
                                                   'Utilities', 'Street',
                                                   'MasVnrArea', 'WoodDeckSF',
                                                   'HalfBath', 'ScreenPorch',
                                                   'LotArea', 'LandSlope',
                                                   'YrMoSold', 'LowQualFinSF',
                                                   'PoolArea', 'BsmtUnfSF',
                                                   'FireplaceQu', 'Fireplaces',
                                                   'MiscVal', 'LotFrontage',
                                                   'YearBuilt', ...])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(transformers=[('mark_pca_outliers',
                                 Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                  ColumnTransformer(n_jobs=1,
                                                                    transformers=[('impute_standardize_numeric',
                                                                                   Pipeline(steps=[('simple_imputer',
                                                                                                    SimpleImputer()),
                                                                                                   ('standard_scaler',
                                                                                                    StandardScaler())]),
                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5177690>),
                                                                                  ('one_h...
                                                  <__main__.pcaPandas object at 0x7fefd5110810>),
                                                 ('standard_scaler_out',
                                                  QuantileTransformer(random_state=0))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5176990>),
                                ('passthrough_original_features', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5175f90>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5176990>

ColumnTransformer(n_jobs=1,
                  transformers=[('impute_standardize_numeric',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5177690>),
                                ('one_hot_encode_categorical',
                                 OneHotEncoder(dtype=<class 'int'>,
                                               handle_unknown='infrequent_if_exist',
                                               min_frequency=1,
                                               sparse_output=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5175b90>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5177690>

SimpleImputer()

StandardScaler()

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5175b90>

# Echo the tuned elastic-net pipeline so the notebook renders its structure.
best_model['enet']

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                                                                   OneHotEncoder(dtype=<class 'int'>,
                                                                                 handle_unknown='infrequent_if_exist',
                                                                                 min_frequency=1,
                                                                                 sparse_output=False),
                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fefe649e3d0>)],
                                                    verbose_feature_names_out=False)),
                                 ('standard_scaler', StandardScaler())])),
                ('enet',
                 ElasticNet(alpha=1.300552633069198,
                            l1_ratio=0.44339141668657467, max_iter=10000,
                            random_state=0))])

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('mark_pca_outliers',
                                  ColumnTransformer(transformers=[('mark_pca_outliers',
                                                                   Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                                    ColumnTransformer(n_jobs=1,
                                                                                                      transformers=[('impute_standardize_numeric',
                                                                                                                     Pipeline(steps=[('simple_imputer',
                                                                                                                                      SimpleImputer()),
                                                                                                                                     ('standard_scaler',
                                                                                                                                      StandardScaler())]),
                                                                                                                     <sklearn.compose._c...
                                                                   OneHotEncoder(dtype=<class 'int'>,
                                                                                 handle_unknown='infrequent_if_exist',
                                                                                 min_frequency=1,
                                                                                 sparse_output=False),
                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fefe649e3d0>)],
                                                    verbose_feature_names_out=False)),
                                 ('standard_scaler', StandardScaler())])),
                ('enet',
                 ElasticNet(alpha=1.300552633069198,
                            l1_ratio=0.44339141668657467, max_iter=10000,
                            random_state=0))])

Pipeline(steps=[('mark_pca_outliers',
                 ColumnTransformer(transformers=[('mark_pca_outliers',
                                                  Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                                   ColumnTransformer(n_jobs=1,
                                                                                     transformers=[('impute_standardize_numeric',
                                                                                                    Pipeline(steps=[('simple_imputer',
                                                                                                                     SimpleImputer()),
                                                                                                                    ('standard_scaler',
                                                                                                                     StandardScaler())]),
                                                                                                    <sklearn.compose._column_transformer.make_column_sel...
                                                   'BsmtFinSF2', 'OpenPorchSF',
                                                   'BsmtFinSF1', 'Utilities',
                                                   'Street', 'MasVnrArea',
                                                   'WoodDeckSF', 'LandSlope',
                                                   'LotArea', 'YrMoSold',
                                                   'PoolArea', 'BsmtUnfSF',
                                                   'FireplaceQu', 'MiscVal',
                                                   'GarageQual', 'LotFrontage',
                                                   'TotalBsmtSF', 'MiscFeature',
                                                   'Fence', 'GrLivArea',
                                                   'GarageType',
                                                   'EnclosedPorch',
                                                   'MSSubClass', 'FirstFlrSF',
                                                   'MasVnrType', ...])],
                                   verbose_feature_names_out=False))])

ColumnTransformer(transformers=[('mark_pca_outliers',
                                 Pipeline(steps=[('impute_num_ohe_cat_standard',
                                                  ColumnTransformer(n_jobs=1,
                                                                    transformers=[('impute_standardize_numeric',
                                                                                   Pipeline(steps=[('simple_imputer',
                                                                                                    SimpleImputer()),
                                                                                                   ('standard_scaler',
                                                                                                    StandardScaler())]),
                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5190190>),
                                                                                  ('one_h...
                                                  <__main__.pcaPandas object at 0x7fefd5207790>),
                                                 ('standard_scaler_out',
                                                  QuantileTransformer(random_state=0))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5192610>),
                                ('passthrough_original_features', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5192ad0>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5192610>

ColumnTransformer(n_jobs=1,
                  transformers=[('impute_standardize_numeric',
                                 Pipeline(steps=[('simple_imputer',
                                                  SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5190190>),
                                ('one_hot_encode_categorical',
                                 OneHotEncoder(dtype=<class 'int'>,
                                               handle_unknown='infrequent_if_exist',
                                               min_frequency=1,
                                               sparse_output=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5190150>)],
                  verbose_feature_names_out=False)

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5190190>

SimpleImputer()

StandardScaler()

<sklearn.compose._column_transformer.make_column_selector object at 0x7fefd5190150>

# Score every tuned single-model pipeline on X, y; the helper prints the
# given label followed by one score per model.
test_all_models('Pipeline_Single_Model', pipe=best_model, X=X, y=y)

Pipeline_Single_Model
xgboost  0.10898826066371538
lgbm     0.11154989023087401
catboost 0.10978151743588008
ridge    0.10915505517186783
enet     0.1267509045592845

# NOTE: this binds a second name to the same dict (no copy is made), so the
# fit below also fits the pipelines reachable through best_model.
best_model_fi = best_model
# per-model feature-importance tables, filled in by the cells below
fi = {}

# Fit each pipeline on X, y so the fitted attributes of its final estimator
# (importances / coefficients) can be read afterwards.
for m in tqdm(baseline_models):
    best_model_fi[m].fit(X, y)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:31<00:00,  6.34s/it]

# Feature importances reported by the fitted XGBoost estimator, sorted from
# least to most important.
# Pipeline[-1] is the public way to reach the final step; `_final_estimator`
# is a private scikit-learn attribute and may change without notice.
xgboost_estimator = best_model_fi['xgboost'][-1]
fi['xgboost'] = pd.DataFrame(
    {'importance': xgboost_estimator.feature_importances_},
    index=xgboost_estimator.feature_names_in_,
).sort_values(by='importance')

_ = fi['xgboost'].plot(kind='barh', legend=False, figsize=(8, 16))

print('features dropped by the pipeline optimization:')
# transformers[0] is the (name, transformer, columns) tuple of the
# 'drop_features_final' step; element 2 is its list of dropped columns.
(
    best_model_fi['xgboost']
    .named_steps['preprocessing']
    .named_steps['drop_features_final']
    .transformers[0][2]
)

features dropped by the pipeline optimization:

['ThreeSeasonPorch',
 'TotalBsmtSF',
 'Utilities',
 'Street',
 'Condition2',
 'RoofMatl',
 'LandSlope',
 'YrMoSold',
 'BsmtFinSF2',
 'PoolQC',
 'BsmtUnfSF',
 'PoolArea',
 'BsmtFinSF1',
 'MSSubClass',
 'MiscVal']

# Feature importances reported by the fitted LightGBM estimator, sorted from
# least to most important.
# Pipeline[-1] is the public way to reach the final step; `_final_estimator`
# is a private scikit-learn attribute and may change without notice.
lgbm_estimator = best_model_fi['lgbm'][-1]
fi['lgbm'] = pd.DataFrame(
    {'importance': lgbm_estimator.feature_importances_},
    index=lgbm_estimator.feature_name_,
).sort_values(by='importance')

_ = fi['lgbm'].plot(kind='barh', legend=False, figsize=(8, 8))

print('features dropped by the pipeline optimization:')
# transformers[0] is the (name, transformer, columns) tuple of the
# 'drop_features_final' step; element 2 is its list of dropped columns.
(
    best_model_fi['lgbm']
    .named_steps['preprocessing']
    .named_steps['drop_features_final']
    .transformers[0][2]
)

features dropped by the pipeline optimization:

['ThreeSeasonPorch',
 'SecondFlrSF',
 'BsmtHalfBath',
 'BsmtFinSF2',
 'OpenPorchSF',
 'BsmtFinSF1',
 'KitchenAbvGr',
 'Utilities',
 'Street',
 'MasVnrArea',
 'WoodDeckSF',
 'HalfBath',
 'ScreenPorch',
 'LotArea',
 'LandSlope',
 'YrMoSold',
 'LowQualFinSF',
 'PoolArea',
 'BsmtUnfSF',
 'Fireplaces',
 'MiscVal',
 'LotFrontage',
 'YearBuilt',
 'TotRmsAbvGrd',
 'GarageCars',
 'TotalBsmtSF',
 'Condition2',
 'RoofMatl',
 'GrLivArea',
 'FullBath',
 'EnclosedPorch',
 'BsmtFullBath',
 'MSSubClass',
 'FirstFlrSF',
 'GarageYrBlt',
 'GarageArea',
 'BedroomAbvGr',
 'YearRemodAdd',
 'PoolQC']

# Feature importances reported by the fitted CatBoost estimator, sorted from
# least to most important.
# Pipeline[-1] is the public way to reach the final step; `_final_estimator`
# is a private scikit-learn attribute and may change without notice.
catboost_estimator = best_model_fi['catboost'][-1]
fi['catboost'] = pd.DataFrame(
    {'importance': catboost_estimator.feature_importances_},
    index=catboost_estimator.feature_names_,
).sort_values(by='importance')

_ = fi['catboost'].plot(kind='barh', legend=False, figsize=(8, 13))

print('features dropped by the pipeline optimization:')
# transformers[0] is the (name, transformer, columns) tuple of the
# 'drop_features_final' step; element 2 is its list of dropped columns.
(
    best_model_fi['catboost']
    .named_steps['preprocessing']
    .named_steps['drop_features_final']
    .transformers[0][2]
)

features dropped by the pipeline optimization:

['ThreeSeasonPorch',
 'Alley',
 'BsmtFinSF2',
 'Utilities',
 'Street',
 'LotArea',
 'LandSlope',
 'YrMoSold',
 'LowQualFinSF',
 'PoolArea',
 'FireplaceQu',
 'MiscVal',
 'LotFrontage',
 'MiscFeature',
 'Condition2',
 'RoofMatl',
 'Fence',
 'MasVnrType',
 'PoolQC']

# features are scaled before the model, so the absolute value of each
# coefficient is used as its importance
# The final step is a TransformedTargetRegressor; the fitted Ridge lives in
# its `regressor_` attribute. Pipeline[-1] replaces the private
# `_final_estimator` attribute with the public indexing API.
ridge_regressor = best_model_fi['ridge'][-1].regressor_
fi['ridge'] = pd.DataFrame(
    {'importance': np.abs(ridge_regressor.coef_)},
    index=ridge_regressor.feature_names_in_,
).sort_values(by='importance', ascending=False)

# plot only the 100 largest coefficients, re-sorted ascending for the bar chart
fi_ridge = fi['ridge'].head(100)
_ = fi_ridge.sort_values(by='importance').plot(kind='barh', legend=False, figsize=(8, 16))

print('features dropped by the pipeline optimization:')
# transformers[0] is the (name, transformer, columns) tuple of the
# 'drop_features_final' step; element 2 is its list of dropped columns.
(
    best_model_fi['ridge']
    .named_steps['preprocessing']
    .named_steps['drop_features_final']
    .transformers[0][2]
)

features dropped by the pipeline optimization:

['ThreeSeasonPorch',
 'BsmtFinType1',
 'SecondFlrSF',
 'Alley',
 'GarageCond',
 'GarageFinish',
 'BsmtHalfBath',
 'BsmtExposure',
 'BsmtFinSF2',
 'OpenPorchSF',
 'BsmtFinSF1',
 'BsmtCond',
 'KitchenAbvGr',
 'Utilities',
 'Street',
 'MasVnrArea',
 'WoodDeckSF',
 'HalfBath',
 'ScreenPorch',
 'LotArea',
 'LandSlope',
 'YrMoSold',
 'LowQualFinSF',
 'PoolArea',
 'BsmtUnfSF',
 'FireplaceQu',
 'Fireplaces',
 'MiscVal',
 'LotFrontage',
 'YearBuilt',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageQual',
 'TotalBsmtSF',
 'MiscFeature',
 'BsmtQual',
 'Condition2',
 'Fence',
 'GrLivArea',
 'RoofMatl',
 'GarageType',
 'FullBath',
 'EnclosedPorch',
 'BsmtFullBath',
 'FirstFlrSF',
 'GarageYrBlt',
 'Electrical',
 'MasVnrType',
 'GarageArea',
 'BedroomAbvGr',
 'BsmtFinType2',
 'YearRemodAdd',
 'PoolQC']

# features are scaled before the model, so the absolute value of each
# coefficient is used as its importance
# Pipeline[-1] is the public way to reach the final step; `_final_estimator`
# is a private scikit-learn attribute and may change without notice.
enet_estimator = best_model_fi['enet'][-1]
fi['enet'] = pd.DataFrame(
    {'importance': np.abs(enet_estimator.coef_)},
    index=enet_estimator.feature_names_in_,
).sort_values(by='importance', ascending=False)

# plot only the 100 largest coefficients, re-sorted ascending for the bar chart
fi_enet = fi['enet'].head(100)
_ = fi_enet.sort_values(by='importance').plot(kind='barh', legend=False, figsize=(8, 16))

print('features dropped by the pipeline optimization:')
# transformers[0] is the (name, transformer, columns) tuple of the
# 'drop_features_final' step; element 2 is its list of dropped columns.
(
    best_model_fi['enet']
    .named_steps['preprocessing']
    .named_steps['drop_features_final']
    .transformers[0][2]
)

features dropped by the pipeline optimization:

['ThreeSeasonPorch',
 'BldgType',
 'Alley',
 'GarageCond',
 'GarageFinish',
 'BsmtFinSF2',
 'OpenPorchSF',
 'BsmtFinSF1',
 'Utilities',
 'Street',
 'MasVnrArea',
 'WoodDeckSF',
 'LandSlope',
 'LotArea',
 'YrMoSold',
 'PoolArea',
 'BsmtUnfSF',
 'FireplaceQu',
 'MiscVal',
 'GarageQual',
 'LotFrontage',
 'TotalBsmtSF',
 'MiscFeature',
 'Fence',
 'GrLivArea',
 'GarageType',
 'EnclosedPorch',
 'MSSubClass',
 'FirstFlrSF',
 'MasVnrType',
 'PoolQC']

# if True, this will restart optimization from scratch
# and will delete all optimization history
OPTUNA_VOTING_REGRESSOR_RESET = False

if OPTUNA_VOTING_REGRESSOR_RESET:
    try:
        optuna.delete_study(
            study_name='vr',
            storage=OPTUNA_URL,
        )
    except KeyError:
        # Optuna raises KeyError when no study named 'vr' exists yet; there is
        # nothing to delete in that case. Any other failure (e.g. unreachable
        # storage) is left to propagate instead of being silently swallowed.
        pass

# Open the persistent 'vr' study in the shared storage, creating it only if
# it is not there yet; its objective value is minimized.
study_vr = optuna.create_study(
    storage=OPTUNA_URL,
    study_name='vr',
    load_if_exists=True,
    direction='minimize',
)

[I 2023-08-14 13:20:32,869] Using an existing study with name 'vr' instead of creating a new one.

def objective_vr(trial, best_model):
    """
    Optuna objective for the voting regressor weights.

    For every entry in best_model (dict: model name -> estimator), one weight
    in [0.0, 1.0] is suggested under the parameter name 'weight_<name>'.
    A VotingRegressor is built from all models with those weights and the
    value returned by test_performance() is handed back to the study.
    """
    model_names = list(best_model.keys())

    # one suggested weight per model, drawn in dict order
    weight = {
        name: trial.suggest_float('weight_' + name, 0.0, 1.0, log=False)
        for name in model_names
    }

    members = [('pipe_' + name, best_model[name]) for name in model_names]
    member_weights = [weight[name] for name in model_names]
    vr = VotingRegressor(estimators=members, weights=member_weights, n_jobs=1)

    # the ensemble itself runs single-threaded; parallelism is given to the CV helper
    return test_performance(pipe=vr, X=X, y=y, n_splits=num_folds, n_jobs=cpu_count)


# drop ElasticNet since it gets zeroed out anyway
best_model_short = {k: v for k, v in best_model.items() if k != 'enet'}
# study_vr.optimize(lambda trial: objective_vr(trial, best_model_short), timeout=36000, catch=(ValueError,), show_progress_bar=False)

# best trial recorded in the study and its suggested weights
best_trial_vr = study_vr.best_trial
best_params_vr = best_trial_vr.params
print(f'best trial: {best_trial_vr.number}, best score: {best_trial_vr.value}')

best_params_vr_df = pd.DataFrame(
    {
        # parameter names are 'weight_<model>': split on the first '_' only,
        # so a model name that itself contains '_' is not truncated
        'model': [p.split('_', 1)[1] for p in best_params_vr],
        'weight': list(best_params_vr.values()),
    }
).sort_values(by='weight')

# horizontal bar chart of the weights, each bar labelled with its value
ax = best_params_vr_df.plot(
    kind='barh',
    y='weight',
    x='model',
    xlim=(0.0, 1.0),
    xlabel='weight',
    ylabel='model',
    legend=False,
    title='Voting regressor model weights',
    figsize=(6, 4),
)
ax.bar_label(container=ax.containers[0], labels=[f'{w:.3f}' for w in best_params_vr_df['weight']])
plt.show()

best trial: 211, best score: 0.10442150749133243

# default backend is way too slow here, use matplotlib instead
# optuna.visualization.matplotlib.plot_contour(study_vr)

# render the three study plots as static PNG images: (plot function, figure width)
for vr_plot_fn, vr_plot_width in (
    (optuna.visualization.plot_param_importances, 600),
    (optuna.visualization.plot_optimization_history, 1200),
    (optuna.visualization.plot_slice, 1200),
):
    vr_plot_fn(study_vr).update_layout(width=vr_plot_width, height=400).show('png')

# rebuild the voting regressor using the weights of the best trial;
# estimators and weights are both listed in the key order of best_model_short
vr_members = [('pipe_' + name, model) for name, model in best_model_short.items()]
vr_member_weights = [best_params_vr['weight_' + name] for name in best_model_short]
vr = VotingRegressor(estimators=vr_members, weights=vr_member_weights, n_jobs=1)

# score the optimized ensemble with the notebook's common test helper
test_all_models('VotingRegressor_optimized', {'VotingRegressor': vr}, X, y, n_splits=num_folds, n_jobs=cpu_count)

VotingRegressor_optimized
VotingRegressor 0.10442150749133243

# display the weights passed to the voting regressor
# (same order as the keys of best_model_short)
vr.weights

[0.6132624643750012,
 0.27392747363203707,
 0.4128720952346723,
 0.9222712355638678]

# hand the optimized voting regressor to the submission helper (defined elsewhere in the notebook);
# presumably fits on X / y and writes predictions for X_test under the given name -- TODO confirm
make_submission_all({'VotingRegressor': vr}, X, y, X_test, 'VotingRegressor_optimized')

# meta-model for the stack: standardize the inputs, then fit a default Ridge
sr_fe_pipe_ridge = Pipeline(
    steps=[
        ('standard_scaler', StandardScaler()),
        ('ridge', Ridge(random_state=0)),
    ]
)

# base models: every best model except ElasticNet
sr_ridge_members = [('pipe_' + name, best_model[name]) for name in best_model.keys() if name != 'enet']

sr_ridge = StackingRegressor(
    estimators=sr_ridge_members,
    final_estimator=sr_fe_pipe_ridge,
    cv=KFold(n_splits=num_folds, shuffle=False),
    verbose=0,
    n_jobs=1,
)

# TEST_PIPELINES = True
test_all_models('StackingRegressor_Ridge_full_models', {'StackingRegressor_Ridge_full_models': sr_ridge}, X, y, n_splits=num_folds, n_jobs=cpu_count)
# TEST_PIPELINES = False

StackingRegressor_Ridge_full_models
StackingRegressor_Ridge_full_models 0.10570325701640888

# if True, this will restart optimization from scratch
# and will delete all optimization history
OPTUNA_SR_RIDGE_RESET = False

if OPTUNA_SR_RIDGE_RESET:
    try:
        optuna.delete_study(
            study_name='sr_ridge',
            storage=OPTUNA_URL,
        )
    except KeyError:
        # Optuna raises KeyError when no study with this name exists;
        # in that case there is nothing to delete. Any other failure
        # (storage unreachable, interrupt, ...) is allowed to surface.
        pass

# load the existing 'sr_ridge' study from storage, or create it if missing
study_sr_ridge = optuna.create_study(
    direction='minimize',
    study_name='sr_ridge',
    storage=OPTUNA_URL,
    load_if_exists=True,
)

[I 2023-08-14 13:20:55,936] Using an existing study with name 'sr_ridge' instead of creating a new one.

def two_or_more(element_list):
    """
    From a given iterable of strings, return all combinations of 2 or more
    elements, each combination joined by '_'.

    Combinations are listed by increasing size (pairs first, then triples,
    ...) and, within one size, in the order produced by
    itertools.combinations. Fewer than two elements yields an empty list.

    Any iterable is accepted (list, dict keys view, generator, ...): it is
    materialized once so its length can be taken.

    Note: callers in this notebook recover the members of a combination with
    .split('_'), so element names should not contain '_' themselves.
    """
    elements = tuple(element_list)
    return [
        '_'.join(combo)
        for size in range(2, len(elements) + 1)
        for combo in combinations(elements, size)
    ]


def objective_sr_ridge(trial, best_model):
    """
    Optuna objective for the stacking regressor with a Ridge meta-model.

    Suggested parameters:
      - 'ridge_alpha': float in [0.1, 100.0], the Ridge alpha
      - 'base_models': one of the '_'-joined combinations of two or more
        keys of best_model (dict: model name -> estimator)

    The selected base models feed a StackingRegressor whose final estimator
    is StandardScaler + Ridge. Returns the value of test_performance().
    """
    ridge_alpha = trial.suggest_float('ridge_alpha', 0.1, 100.0)
    base_models = trial.suggest_categorical('base_models', two_or_more(best_model.keys()))

    # the categorical value encodes the chosen subset as 'name1_name2_...'
    selected = base_models.split('_')
    estimators = [('pipe_' + name, model) for name, model in best_model.items() if name in selected]

    meta_model = Pipeline(
        steps=[
            ('standard_scaler', StandardScaler()),
            ('ridge', Ridge(alpha=ridge_alpha, random_state=0)),
        ]
    )

    stack = StackingRegressor(
        estimators=estimators,
        final_estimator=meta_model,
        cv=KFold(n_splits=num_folds, shuffle=False),
        verbose=0,
        n_jobs=1,
    )

    return test_performance(pipe=stack, X=X, y=y, n_splits=num_folds, n_jobs=cpu_count)


# study_sr_ridge.optimize(lambda trial: objective_sr_ridge(trial, best_model_short), timeout=10*3600, catch=(ValueError,), show_progress_bar=False)

# render the three study plots as static PNG images: (plot function, figure width)
for sr_ridge_plot_fn, sr_ridge_plot_width in (
    (optuna.visualization.plot_param_importances, 600),
    (optuna.visualization.plot_optimization_history, 1200),
    (optuna.visualization.plot_slice, 1200),
):
    sr_ridge_plot_fn(study_sr_ridge).update_layout(width=sr_ridge_plot_width, height=400).show('png')

# meta-model for the stack: ElasticNet only, no scaling step
sr_fe_pipe_enet = Pipeline(
    steps=[
        # ('standard_scaler', StandardScaler()),
        ('enet', ElasticNet(random_state=0, max_iter=10000)),
    ]
)

# base models: everything in best_model_short
sr_enet_members = [('pipe_' + name, model) for name, model in best_model_short.items()]

sr_enet = StackingRegressor(
    estimators=sr_enet_members,
    final_estimator=sr_fe_pipe_enet,
    cv=KFold(n_splits=num_folds, shuffle=False),
    verbose=0,
    n_jobs=1,
)

# TEST_PIPELINES = True
test_all_models('StackingRegressor_ElasticNet_full_models', {'StackingRegressor_ElasticNet_full_models': sr_enet}, X, y, n_splits=num_folds, n_jobs=cpu_count)
# TEST_PIPELINES = False

StackingRegressor_ElasticNet_full_models
StackingRegressor_ElasticNet_full_models 0.10580027270420111

# if True, this will restart optimization from scratch
# and will delete all optimization history
OPTUNA_SR_ENET_RESET = False

if OPTUNA_SR_ENET_RESET:
    try:
        optuna.delete_study(
            study_name='sr_enet',
            storage=OPTUNA_URL,
        )
    except KeyError:
        # Optuna raises KeyError when no study with this name exists;
        # in that case there is nothing to delete. Any other failure
        # (storage unreachable, interrupt, ...) is allowed to surface.
        pass

# load the existing 'sr_enet' study from storage, or create it if missing
study_sr_enet = optuna.create_study(
    direction='minimize',
    study_name='sr_enet',
    storage=OPTUNA_URL,
    load_if_exists=True,
)

[I 2023-08-14 13:20:59,043] Using an existing study with name 'sr_enet' instead of creating a new one.

def objective_sr_enet(trial, best_model):
    """
    Optuna objective for the stacking regressor with an ElasticNet meta-model.

    Suggested parameters:
      - 'enet_alpha': float in [0.1, 10.0], log scale
      - 'enet_l1_ratio': float in [0.001, 0.999], log scale
      - 'enet_selection': 'cyclic' or 'random'
      - 'base_models': one of the '_'-joined combinations of two or more
        keys of best_model (dict: model name -> estimator)

    The selected base models feed a StackingRegressor whose final estimator
    is a one-step ElasticNet pipeline. Returns the value of test_performance().
    """
    # dict entries are evaluated top to bottom, so the suggest order is fixed
    enet_params = {
        'alpha': trial.suggest_float('enet_alpha', 0.1, 10.0, log=True),
        'l1_ratio': trial.suggest_float('enet_l1_ratio', 0.001, 0.999, log=True),
        'selection': trial.suggest_categorical('enet_selection', ['cyclic', 'random']),
    }
    base_models = trial.suggest_categorical('base_models', two_or_more(best_model.keys()))

    # the categorical value encodes the chosen subset as 'name1_name2_...'
    selected = base_models.split('_')
    estimators = [('pipe_' + name, model) for name, model in best_model.items() if name in selected]

    # no StandardScaler step in this meta-model pipeline
    meta_model = Pipeline(
        steps=[
            ('enet', ElasticNet(**enet_params, random_state=0, max_iter=10000)),
        ]
    )

    stack = StackingRegressor(
        estimators=estimators,
        final_estimator=meta_model,
        cv=KFold(n_splits=num_folds, shuffle=False),
        verbose=0,
        n_jobs=1,
    )

    return test_performance(pipe=stack, X=X, y=y, n_splits=num_folds, n_jobs=cpu_count)


# study_sr_enet.optimize(lambda trial: objective_sr_enet(trial, best_model_short), timeout=20*3600, catch=(ValueError,), show_progress_bar=False)

# render the three study plots as static PNG images: (plot function, width, height)
for sr_enet_plot_fn, sr_enet_plot_width, sr_enet_plot_height in (
    (optuna.visualization.plot_param_importances, 600, 400),
    (optuna.visualization.plot_optimization_history, 1200, 400),
    (optuna.visualization.plot_slice, 1200, 500),
):
    sr_enet_plot_fn(study_sr_enet).update_layout(width=sr_enet_plot_width, height=sr_enet_plot_height).show('png')

	y_proba	p_raw	y_score	y_bool	y_bool_spe	y_score_spe
1	1.000000	1.000000	91.480418	False	False	2.293311
2	1.000000	0.943230	149.185965	False	False	1.226769
3	1.000000	1.000000	91.372478	False	False	2.642532
4	1.000000	0.481874	178.190867	False	False	1.844994
5	1.000000	0.999613	121.341461	False	False	5.054902
...	...	...	...	...	...	...
1456	1.000000	1.000000	85.433656	False	False	1.938461
1457	1.000000	0.971802	143.797092	False	False	2.148554
1458	0.378591	0.040172	212.324863	False	False	3.279412
1459	0.050034	0.003667	232.714853	False	False	3.658729
1460	1.000000	0.505228	177.087154	False	False	1.801041

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC23	PC24	PC25	PC26	PC27	PC28	PC29	PC30	PC31	PC32
Id
1	1.319667	0.253511	-1.465137	-2.241872	1.018470	0.213784	0.505476	0.341499	0.420613	0.009350	...	0.217328	0.058187	0.380878	-0.109228	-0.393812	-0.198528	0.366336	-0.184331	3.784541e-15	2.249875e-15
2	0.188263	-1.030096	0.785191	0.151605	-0.994604	-0.618935	-3.462510	0.983373	1.060380	-1.312354	...	-0.608682	0.755184	0.219853	0.316759	0.253835	0.094876	0.459792	0.154973	-5.758691e-15	3.227463e-15
3	1.443037	0.076919	-1.275494	-1.766320	0.305316	-0.080309	0.542641	0.121386	-0.147398	0.376114	...	-0.200565	-0.284208	0.809865	0.113971	0.209419	0.511120	-0.383688	-0.250367	3.971903e-15	4.206107e-16
4	-0.584346	1.147321	1.042259	-0.005675	0.089426	0.988350	1.172386	2.328988	-1.936959	-0.594625	...	-0.581900	-1.317109	0.254077	-1.486220	0.069142	0.474494	0.065772	1.071727	-2.024218e-16	7.222443e-16
5	4.128104	0.908141	-0.455301	-1.768404	0.590250	-0.430892	0.070445	0.263687	-0.694756	0.183212	...	-0.174506	-0.849708	0.328150	-0.054969	0.045859	-0.088122	0.066637	-0.100611	5.491133e-15	-1.196395e-15
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1456	0.797221	1.280392	-2.180601	-0.082128	-0.593274	0.285918	0.190878	-0.011470	0.024647	-0.041398	...	-0.231493	0.132148	0.115910	0.083520	0.062321	0.033482	0.109365	0.022206	5.819208e-17	5.929316e-18
1457	2.567203	-1.531531	2.028088	0.618137	0.581316	-0.294900	-0.926247	-1.253234	-1.233935	0.414069	...	-0.045444	-0.005702	0.030275	0.109629	-0.498817	0.435507	-0.423289	0.208758	-3.582002e-17	1.297926e-16
1458	0.655551	3.205122	1.058579	0.248362	0.365413	-0.704300	0.198081	-1.807445	1.775119	0.844847	...	1.367273	-0.621316	0.269872	-0.022714	0.703932	-0.431905	-0.294173	-0.127209	-6.440945e-17	1.480647e-16
1459	-2.643897	-1.800869	2.032289	-0.815118	0.039010	1.584402	-2.469130	-3.924796	-2.671213	0.716054	...	0.904487	-0.033233	0.129937	0.431736	0.196866	-0.472690	-0.006877	-0.015853	-2.851290e-16	2.402431e-16
1460	-0.593611	-1.420755	1.593847	-1.856167	0.438588	1.068380	-2.164236	-1.388932	-1.301690	-0.382335	...	-0.697762	0.707884	-0.295428	0.527040	-0.212089	0.091772	0.249222	-0.011596	2.973514e-16	4.056012e-17

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC23	PC24	PC25	PC26	PC27	PC28	PC29	PC30	PC31	PC32
LotFrontage	0.175396	0.015549	0.234096	0.125092	-0.035916	0.048305	-0.026918	0.111289	-0.000084	0.206924	...	0.027354	0.061363	0.152329	-0.010805	0.016390	0.033472	0.034995	0.031895	2.139601e-16	-0.000000e+00
LotArea	0.120766	-0.020536	0.277302	0.006986	-0.085585	-0.020800	-0.234320	-0.047378	-0.098655	0.094364	...	-0.047629	-0.014952	-0.064483	-0.004832	-0.012269	-0.032900	0.004166	-0.003411	-1.868961e-16	1.238875e-16
YearBuilt	0.238688	-0.220281	-0.322435	-0.068910	0.010597	0.017620	-0.061004	-0.045108	0.079290	-0.033148	...	-0.405266	0.094429	0.056713	0.115782	-0.139026	-0.590229	-0.349727	-0.163359	9.070800e-17	5.616481e-19
YearRemodAdd	0.214818	-0.116506	-0.303616	-0.048179	0.028095	0.183624	-0.075246	-0.095923	0.050684	0.102703	...	0.553918	-0.362972	-0.192046	0.283654	0.126263	-0.022384	-0.013933	-0.062427	3.942365e-17	-1.143878e-16
MasVnrArea	0.205032	-0.029369	0.005524	-0.034691	-0.059342	-0.309263	0.069873	0.202594	-0.038944	-0.112917	...	0.032250	-0.229090	0.002753	0.048609	-0.017173	0.031894	0.036478	0.007316	8.126963e-17	1.175461e-16
BsmtFinSF1	0.151324	-0.306658	0.280720	-0.260624	0.176625	-0.048756	0.078328	0.192017	0.153119	-0.042750	...	0.019412	0.269801	-0.210996	0.027086	0.302003	-0.092594	0.121158	0.021168	-4.227070e-01	-3.948353e-01
BsmtFinSF2	-0.009678	-0.066052	0.189413	-0.067531	-0.147332	0.157129	-0.266769	-0.541824	-0.163633	-0.009866	...	0.051089	0.044371	-0.025850	-0.016498	0.117495	-0.027432	0.037475	0.003215	-1.495091e-01	-1.396510e-01
BsmtUnfSF	0.116066	0.139441	-0.186305	0.556068	-0.163917	0.019239	0.069847	-0.039937	-0.088491	0.016098	...	-0.036425	-0.116690	0.329031	-0.021760	0.098308	0.007136	0.042552	-0.017328	-4.095178e-01	-3.825157e-01
TotalBsmtSF	0.270666	-0.202658	0.173852	0.264287	-0.035648	0.026467	0.053688	-0.039833	0.009890	-0.031859	...	0.002281	0.179282	0.102536	0.000176	0.456198	-0.099164	0.182600	0.005736	4.065876e-01	3.797788e-01
FirstFlrSF	0.272621	-0.141130	0.244889	0.299338	0.039731	-0.028597	0.006350	-0.049693	0.021530	-0.036829	...	0.115334	0.125375	-0.253915	0.018158	-0.458906	0.204821	-0.326232	0.083866	-3.355416e-01	3.592277e-01
SecondFlrSF	0.144140	0.412906	-0.074553	-0.332335	-0.008298	0.012531	0.021312	0.059656	-0.062983	0.045369	...	0.188315	0.123020	0.286835	-0.252369	0.302596	-0.117267	-0.238655	0.053511	-3.788880e-01	4.056340e-01
LowQualFinSF	-0.009318	0.125937	0.093924	0.047431	0.000893	0.497381	0.054010	-0.080902	0.186547	0.171304	...	-0.093452	0.002246	0.018088	0.130370	0.026084	-0.054306	-0.027118	-0.012667	-4.220275e-02	4.518188e-02
GrLivArea	0.319441	0.250837	0.126919	-0.051471	0.022419	0.035394	0.027374	0.005513	-0.019221	0.026445	...	0.232640	0.194639	0.053153	-0.184227	-0.083823	0.048243	-0.440769	0.104979	4.560945e-01	-4.882905e-01
BsmtFullBath	0.081233	-0.308730	0.228113	-0.285946	0.281885	0.092903	0.161498	-0.049549	-0.040126	0.031306	...	-0.043224	-0.385558	0.595864	-0.011747	-0.224428	0.128615	-0.011134	-0.003644	-4.374788e-17	-6.754434e-17
BsmtHalfBath	-0.010647	0.001948	0.066094	-0.035145	-0.296231	-0.176484	-0.604015	0.122184	0.272773	-0.151048	...	-0.032930	-0.063345	0.226208	-0.049019	-0.082710	0.033270	0.004314	-0.004935	-1.013543e-16	1.877955e-17
FullBath	0.274066	0.136919	-0.143221	0.069030	0.172549	0.041591	-0.080095	-0.082728	0.069473	-0.022328	...	-0.098264	0.391097	0.288806	0.358368	-0.086391	0.276190	0.279361	-0.083811	-2.051501e-16	-1.974361e-17
HalfBath	0.132798	0.203903	-0.133067	-0.409761	-0.170762	-0.069314	0.082419	0.029887	-0.063992	0.061417	...	-0.118682	0.182653	-0.091701	0.303943	-0.154806	0.256110	0.166789	-0.039035	-4.933403e-17	6.200365e-17
BedroomAbvGr	0.129884	0.376553	0.110859	0.002212	0.132179	-0.030081	-0.127826	-0.058401	0.068580	-0.038753	...	-0.337702	-0.378301	-0.239264	0.230007	0.210296	0.157207	-0.164982	0.031064	-2.380340e-17	-1.846278e-16
KitchenAbvGr	-0.008576	0.167241	0.083322	0.133024	0.549849	-0.216271	-0.014165	-0.117711	0.220468	-0.203373	...	-0.076757	-0.077470	-0.004820	0.053966	0.178772	-0.032701	-0.083217	-0.012212	4.002366e-18	3.983065e-17
TotRmsAbvGrd	0.260598	0.331886	0.102039	-0.000018	0.150587	-0.024831	-0.021272	-0.050227	0.029095	-0.026745	...	0.121819	-0.113427	-0.129357	-0.289135	-0.358958	-0.425172	0.541646	-0.143363	9.175399e-17	2.588104e-16
Fireplaces	0.193598	0.012384	0.207242	-0.080322	-0.286884	-0.128955	0.073185	-0.005063	-0.131658	0.028286	...	-0.208502	-0.283843	-0.030396	-0.037165	0.146709	0.038872	0.032692	-0.041767	-3.643599e-19	-9.414273e-17
GarageYrBlt	0.227802	-0.158469	-0.362209	-0.037866	0.098281	0.164369	-0.045966	-0.070027	0.079845	0.003609	...	-0.305071	-0.033041	-0.172256	-0.609411	0.052969	0.365310	0.109434	0.193348	-5.611820e-18	-8.971969e-18
GarageCars	0.295087	-0.080953	-0.118361	0.021699	-0.037722	-0.119902	0.034421	0.100998	-0.119669	-0.052375	...	0.016620	-0.082859	0.015072	0.202016	-0.005332	-0.151473	0.102229	0.661807	-6.342547e-17	2.086008e-17
GarageArea	0.292684	-0.110294	-0.057612	0.036194	-0.026793	-0.080763	0.059170	0.120509	-0.107293	-0.038001	...	0.043234	-0.047534	-0.055868	-0.068108	0.093615	0.174647	-0.071765	-0.661734	-1.250166e-16	-7.870196e-17
WoodDeckSF	0.143203	-0.080347	0.026808	-0.130274	0.032124	0.124216	-0.415463	-0.097533	-0.245668	-0.098987	...	-0.071801	0.023371	-0.011065	0.041786	0.017460	-0.022240	0.009055	-0.008855	-2.874402e-17	-7.055744e-17
OpenPorchSF	0.157123	0.032851	-0.030642	-0.051540	-0.168419	0.222530	0.247311	-0.104758	0.109436	0.051553	...	-0.121842	-0.096081	-0.063059	0.035880	-0.019722	-0.046911	-0.001614	0.019555	1.489467e-16	-1.279181e-17
EnclosedPorch	-0.078153	0.133888	0.189849	0.088117	0.052846	0.177938	0.093753	0.274212	-0.384546	0.045739	...	-0.279374	0.016991	-0.081325	0.050293	-0.039060	-0.091441	-0.018020	-0.014813	-1.067559e-16	-1.048654e-17
ThreeSeasonPorch	0.017815	-0.029300	-0.010249	0.051825	-0.015986	-0.146480	-0.156811	0.251542	0.294107	0.740957	...	-0.052798	-0.028223	-0.013881	-0.014587	0.017693	-0.020268	0.005028	-0.003691	-1.962016e-16	-8.539753e-17
ScreenPorch	0.033865	0.018995	0.129476	-0.056144	-0.396016	-0.228505	0.374622	-0.356169	0.286082	-0.060506	...	-0.062729	0.034665	0.009268	-0.012743	-0.021391	-0.030371	-0.005561	-0.008727	-1.336017e-16	9.678733e-17
PoolArea	0.056248	0.015671	0.173336	-0.043408	-0.070314	0.437068	-0.006034	0.160995	0.251939	-0.034551	...	-0.017078	-0.096091	-0.017533	0.013090	-0.035157	0.002135	0.050281	0.008329	-2.305244e-17	1.146277e-16
MiscVal	-0.009783	0.027873	0.041178	-0.014929	0.071630	-0.057189	-0.007184	-0.163109	0.434906	-0.049990	...	-0.015745	-0.009245	0.031066	0.004311	-0.016885	0.021342	-0.011060	0.017330	6.958583e-17	1.348093e-17
YrMoSold	-0.009412	-0.032160	-0.000523	-0.034115	0.204185	-0.257055	0.005309	-0.417092	-0.228796	0.489550	...	-0.058924	0.021664	-0.020420	-0.009636	-0.007322	-0.019581	0.001051	0.008029	-3.241059e-17	-5.822417e-17

	PC1	PC2	PC3	PC4
TotalBsmtSF	0.442480	0.682135	0.171340	0.556368
BsmtFinSF2	0.171720	-0.182214	0.946285	-0.204586
BsmtFinSF1	0.772885	0.035364	-0.258499	-0.578426
BsmtUnfSF	-0.421156	0.707276	0.091464	-0.560378

	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	LotConfig	...	PoolArea	PoolQC	Fence	MiscFeature	MiscVal	MoSold	YrSold	SaleType	SaleCondition	SalePrice
Id
1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	2	2008	WD	Normal	208500.0
2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	FR2	...	0	NaN	NaN	NaN	0	5	2007	WD	Normal	181500.0
3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	9	2008	WD	Normal	223500.0
4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	Corner	...	0	NaN	NaN	NaN	0	2	2006	WD	Abnorml	140000.0
5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	FR2	...	0	NaN	NaN	NaN	0	12	2008	WD	Normal	250000.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2915	160	RM	21.0	1936	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	6	2006	WD	Normal	NaN
2916	160	RM	21.0	1894	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	4	2006	WD	Abnorml	NaN
2917	20	RL	160.0	20000	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	9	2006	WD	Abnorml	NaN
2918	85	RL	62.0	10441	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0	NaN	MnPrv	Shed	700	7	2006	WD	Normal	NaN
2919	60	RL	74.0	9627	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0	NaN	NaN	NaN	0	11	2006	WD	Normal	NaN

	0
PoolQC	2909
MiscFeature	2814
Alley	2721
Fence	2348
MasVnrType	1766
SalePrice	1459
FireplaceQu	1420
LotFrontage	486
GarageFinish	159
GarageQual	159
GarageCond	159
GarageYrBlt	159
GarageType	157
BsmtCond	82
BsmtExposure	82
BsmtQual	81
BsmtFinType2	80
BsmtFinType1	79
MasVnrArea	23
MSZoning	4
BsmtHalfBath	2
Utilities	2
BsmtFullBath	2
Functional	2
Exterior2nd	1
Exterior1st	1
GarageArea	1
GarageCars	1
SaleType	1
KitchenQual	1
BsmtFinSF1	1
Electrical	1
BsmtFinSF2	1
BsmtUnfSF	1
TotalBsmtSF	1
TotRmsAbvGrd	0
Fireplaces	0
SaleCondition	0
PavedDrive	0
MoSold	0

	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	LotConfig	...	ScreenPorch	PoolArea	PoolQC	Fence	MiscFeature	MiscVal	SaleType	SaleCondition	SalePrice	YrMoSold
Id
1	60	RL	65.0	8450.0	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	208500.0	1201824000000000000
2	20	RL	80.0	9600.0	Pave	NaN	Reg	Lvl	AllPub	FR2	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	181500.0	1177977600000000000
3	60	RL	68.0	11250.0	Pave	NaN	IR1	Lvl	AllPub	Inside	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	223500.0	1220227200000000000
4	70	RL	60.0	9550.0	Pave	NaN	IR1	Lvl	AllPub	Corner	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Abnorml	140000.0	1138752000000000000
5	60	RL	84.0	14260.0	Pave	NaN	IR1	Lvl	AllPub	FR2	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	250000.0	1228089600000000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1456	60	RL	62.0	7917.0	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	175000.0	1185926400000000000
1457	20	RL	85.0	13175.0	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0.0	0.0	NaN	MnPrv	NaN	0.0	WD	Normal	210000.0	1264982400000000000
1458	70	RL	66.0	9042.0	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0.0	0.0	NaN	GdPrv	Shed	2500.0	WD	Normal	266500.0	1272672000000000000
1459	20	RL	68.0	9717.0	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	142125.0	1270080000000000000
1460	20	RL	75.0	9937.0	Pave	NaN	Reg	Lvl	AllPub	Inside	...	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	147500.0	1212278400000000000

	PCA_outlier_score	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	ThreeSeasonPorch	ScreenPorch	PoolArea	PoolQC	Fence	MiscFeature	MiscVal	SaleType	SaleCondition	YrMoSold
Id
1	0.013569	60	RL	65.0	8450.0	Pave	NaN	Reg	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1201824000000000000
2	0.620547	20	RL	80.0	9600.0	Pave	NaN	Reg	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1177977600000000000
3	0.013017	60	RL	68.0	11250.0	Pave	NaN	IR1	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1220227200000000000
4	0.803558	70	RL	60.0	9550.0	Pave	NaN	IR1	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Abnorml	1138752000000000000
5	0.305013	60	RL	84.0	14260.0	Pave	NaN	IR1	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1228089600000000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1456	0.000540	60	RL	62.0	7917.0	Pave	NaN	Reg	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1185926400000000000
1457	0.567560	20	RL	85.0	13175.0	Pave	NaN	Reg	Lvl	AllPub	...	0.0	0.0	0.0	NaN	MnPrv	NaN	0.0	WD	Normal	1264982400000000000
1458	0.894904	70	RL	66.0	9042.0	Pave	NaN	Reg	Lvl	AllPub	...	0.0	0.0	0.0	NaN	GdPrv	Shed	2500.0	WD	Normal	1272672000000000000
1459	0.927156	20	RL	68.0	9717.0	Pave	NaN	Reg	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1270080000000000000
1460	0.798901	20	RL	75.0	9937.0	Pave	NaN	Reg	Lvl	AllPub	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1212278400000000000

	OverallQual	Neighborhood	GrLivArea	YearBuilt	TotalBsmtSF	GarageArea	GarageCars	BsmtQual	KitchenQual	ExterQual	...	Condition1	MiscFeature	ScreenPorch	Functional	LowQualFinSF	Condition2	BsmtFinSF2	RoofMatl	Utilities	ThreeSeasonPorch
Id
1	7	CollgCr	1710.0	2003.0	856.0	548.0	2.0	Gd	Gd	Gd	...	Norm	NaN	0.0	Typ	0.0	Norm	0.0	CompShg	AllPub	0.0
2	6	Veenker	1262.0	1976.0	1262.0	460.0	2.0	Gd	TA	TA	...	Feedr	NaN	0.0	Typ	0.0	Norm	0.0	CompShg	AllPub	0.0
3	7	CollgCr	1786.0	2001.0	920.0	608.0	2.0	Gd	Gd	Gd	...	Norm	NaN	0.0	Typ	0.0	Norm	0.0	CompShg	AllPub	0.0
4	7	Crawfor	1717.0	1915.0	756.0	642.0	3.0	TA	Gd	TA	...	Norm	NaN	0.0	Typ	0.0	Norm	0.0	CompShg	AllPub	0.0
5	8	NoRidge	2198.0	2000.0	1145.0	836.0	3.0	Gd	Gd	Gd	...	Norm	NaN	0.0	Typ	0.0	Norm	0.0	CompShg	AllPub	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1456	6	Gilbert	1647.0	1999.0	953.0	460.0	2.0	Gd	TA	TA	...	Norm	NaN	0.0	Typ	0.0	Norm	0.0	CompShg	AllPub	0.0
1457	6	NWAmes	2073.0	1978.0	1542.0	500.0	2.0	Gd	TA	TA	...	Norm	NaN	0.0	Min1	0.0	Norm	163.0	CompShg	AllPub	0.0
1458	7	Crawfor	2340.0	1941.0	1152.0	252.0	1.0	TA	Gd	Ex	...	Norm	Shed	0.0	Typ	0.0	Norm	0.0	CompShg	AllPub	0.0
1459	5	Names	1078.0	1950.0	1078.0	240.0	1.0	TA	Gd	TA	...	Norm	NaN	0.0	Typ	0.0	Norm	1029.0	CompShg	AllPub	0.0
1460	5	Edwards	1256.0	1965.0	1256.0	276.0	1.0	TA	TA	Gd	...	Norm	NaN	0.0	Typ	0.0	Norm	290.0	CompShg	AllPub	0.0

	TargetEncoded_Neighborhood	Neighborhood	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	...	ThreeSeasonPorch	ScreenPorch	PoolArea	PoolQC	Fence	MiscFeature	MiscVal	SaleType	SaleCondition	YrMoSold
Id
1	197965.734807	CollgCr	60	RL	65.0	8450.0	Pave	NaN	Reg	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1201824000000000000
2	197643.209810	Veenker	20	RL	80.0	9600.0	Pave	NaN	Reg	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1177977600000000000
3	197965.734807	CollgCr	60	RL	68.0	11250.0	Pave	NaN	IR1	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1220227200000000000
4	209344.287867	Crawfor	70	RL	60.0	9550.0	Pave	NaN	IR1	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Abnorml	1138752000000000000
5	318453.591177	NoRidge	60	RL	84.0	14260.0	Pave	NaN	IR1	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1228089600000000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1456	192821.904993	Gilbert	60	RL	62.0	7917.0	Pave	NaN	Reg	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1185926400000000000
1457	189009.693995	NWAmes	20	RL	85.0	13175.0	Pave	NaN	Reg	Lvl	...	0.0	0.0	0.0	NaN	MnPrv	NaN	0.0	WD	Normal	1264982400000000000
1458	209344.287867	Crawfor	70	RL	66.0	9042.0	Pave	NaN	Reg	Lvl	...	0.0	0.0	0.0	NaN	GdPrv	Shed	2500.0	WD	Normal	1272672000000000000
1459	145847.080044	Names	20	RL	68.0	9717.0	Pave	NaN	Reg	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1270080000000000000
1460	128237.373454	Edwards	20	RL	75.0	9937.0	Pave	NaN	Reg	Lvl	...	0.0	0.0	0.0	NaN	NaN	NaN	0.0	WD	Normal	1212278400000000000

1. Intro¶

2. Data preprocessing¶

2.1. Clean data¶

2.2. Encode the categories¶

2.3. Encode time¶

2.4. Load data¶

3. Examine the dataset¶

3.1. Feature distributions¶

3.1.1. Numeric features¶

3.1.2. Ordinal features¶

3.1.3. Nominal features¶

3.2. NaN values¶

3.3. Pairwise correlation for features¶

3.4. Mutual information scores between features¶

3.5. Correlation with hierarchy, only numeric features¶

3.6. Target distribution¶

4. Start pipeline¶

4.1. Build pipeline functions and classes¶

4.2. Build performance testing functions¶

5. Baseline score with pipeline¶

6. Ensemble regression baseline¶

6.1. Voting regressor¶

6.2. Stacking regressors¶

7. Log-transform skewed numeric features¶

8. Quantile-transform numeric features¶

9. PCA for outliers¶

9.1. Check for outliers in PCA space¶

9.2. Check performance with PCA outliers scored¶

10. Mutual information¶

10.1. Check feature/target MI scores¶

10.2. Drop features with MI score below some threshold¶

11. Mathematical transforms¶

11.1. Two or more features, combined mathematically¶

11.2. Single feature transforms: area to linear¶

12. Coalesce complex feature categories into fewer / simpler categories¶

13. Interactions¶

13.1. Interaction between GrLivArea and BldgType¶

13.2. Qual / Cond interactions¶

13.3. Categorical / numeric interactions¶

14. Group one feature by another feature¶

15. k-Means clustering for area features¶

15.1. Visualize area feature clusters¶

15.2. Create cluster labels feature¶

15.3. Check cluster labels against the target¶

15.4. Create cluster distance features¶

16. PCA for numeric features¶

16.1. Visualize PCA¶

16.2. Replace all numeric features with some PCA components¶

17. PCA-based clustering for numeric features¶

17.1. Visualize clustering in PCA space¶

17.2. Cluster PCA components, use cluster labels as a feature¶

17.3. Cluster PCA components, use cluster distances as features¶

18. PCA-inspired feature creation¶

18.1. Basement square-footage features¶

19. Target Encoding¶

20. Drop columns¶

20.1. Drop columns for excessive NaN¶

21. Impute NaN¶

22. Build model pipelines with all steps included¶

22.1. Pipeline helper functions¶

22.2. Create / load Optuna studies¶

22.3. Run loop¶

22.4. Check loop results¶

22.4.1. Best parameters¶

22.4.2. Optuna plots¶

22.4.3. Slice plots¶

22.4.3.1. One model, choose five parameters¶

22.4.3.2. One parameter, all models¶

22.4.4. All model-specific parameters¶

22.4.4.1. xgboost¶

22.4.4.2. lgbm¶

22.4.4.3. catboost¶

22.4.4.4. ridge¶

22.4.4.5. enet¶

22.5. Rebuild best models¶

22.6. Best models performance test¶

22.7. Feature importance¶

22.7.1. xgboost¶

22.7.2. lgbm¶

22.7.3. catboost¶

1. Intro ¶

2. Data preprocessing ¶

2.1. Clean data ¶

2.2. Encode the categories ¶

2.3. Encode time ¶

2.4. Load data ¶

3. Examine the dataset ¶

3.1. Feature distributions ¶

3.1.1. Numeric features ¶

3.1.2. Ordinal features ¶

3.1.3. Nominal features ¶

3.2. NaN values ¶

3.3. Pairwise correlation for features ¶

3.4. Mutual information scores between features ¶

3.5. Correlation with hierarchy, only numeric features ¶

3.6. Target distribution ¶

4. Start pipeline ¶

4.1. Build pipeline functions and classes ¶

4.2. Build performance testing functions ¶

5. Baseline score with pipeline ¶

6. Ensemble regression baseline ¶

6.1. Voting regressor ¶

6.2. Stacking regressors ¶

7. Log-transform skewed numeric features ¶

8. Quantile-transform numeric features ¶

9. PCA for outliers ¶

9.1. Check for outliers in PCA space ¶

9.2. Check performance with PCA outliers scored ¶

10. Mutual information ¶

10.1. Check feature/target MI scores ¶

10.2. Drop features with MI score below some threshold ¶

11. Mathematical transforms ¶

11.1. Two or more features, combined mathematically ¶

11.2. Single feature transforms: area to linear ¶

12. Coalesce complex feature categories into fewer / simpler categories ¶

13. Interactions ¶

13.1. Interaction between GrLivArea and BldgType ¶

13.2. Qual / Cond interactions ¶

13.3. Categorical / numeric interactions ¶

14. Group one feature by another feature ¶

15. k-Means clustering for area features ¶

15.1. Visualize area feature clusters ¶

15.2. Create cluster labels feature ¶

15.3. Check cluster labels against the target ¶

15.4. Create cluster distance features ¶

16. PCA for numeric features ¶

16.1. Visualize PCA ¶

16.2. Replace all numeric features with some PCA components ¶

17. PCA-based clustering for numeric features ¶

17.1. Visualize clustering in PCA space ¶

17.2. Cluster PCA components, use cluster labels as a feature ¶

17.3. Cluster PCA components, use cluster distances as features ¶

18. PCA-inspired feature creation ¶

18.1. Basement square-footage features ¶

19. Target Encoding ¶

20. Drop columns ¶

20.1. Drop columns for excessive NaN ¶

21. Impute NaN ¶

22. Build model pipelines with all steps included ¶

22.1. Pipeline helper functions ¶

22.2. Create / load Optuna studies ¶

22.3. Run loop ¶

22.4. Check loop results ¶

22.4.1. Best parameters ¶

22.4.2. Optuna plots ¶

22.4.3. Slice plots ¶

22.4.3.1. One model, choose five parameters ¶

22.4.3.2. One parameter, all models ¶

22.4.4. All model-specific parameters ¶

22.4.4.1. xgboost ¶

22.4.4.2. lgbm ¶

22.4.4.3. catboost ¶

22.4.4.4. ridge ¶

22.4.4.5. enet ¶

22.5. Rebuild best models ¶

22.6. Best models performance test ¶

22.7. Feature importance ¶

22.7.1. xgboost ¶

22.7.2. lgbm ¶

22.7.3. catboost ¶