Mii_Kislitsa_Egor_Pibd_33/lec4_reg.ipynb at Lab5

Загрузка данных¶

In [30]:

import pandas as pd

from sklearn.preprocessing import OneHotEncoder
import numpy as np  # type: ignore

from sklearn import set_config

set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("data/kc_house_data.csv", index_col=False)

encoder = OneHotEncoder(sparse_output=False, drop="first")

encoded_values = encoder.fit_transform(df[["date","price","yr_built", "zipcode"]])

encoded_columns = encoder.get_feature_names_out(["date","price", "yr_built", "zipcode"])

encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)

df = pd.concat([df, encoded_values_df], axis=1)

df = df.drop(
    [
        "yr_built",
        "date",
        "lat",
        "sqft_living15",
        "sqft_lot15",
        "zipcode",
        "sqft_basement",
        "sqft_above",
        "sqft_living",
    ],
    axis=1,
)

df

Out[30]:

	id	price	bedrooms	bathrooms	sqft_lot	floors	waterfront	view	condition	grade	...	zipcode_98146	zipcode_98148	zipcode_98155	zipcode_98166	zipcode_98168	zipcode_98177	zipcode_98178	zipcode_98188	zipcode_98198	zipcode_98199
0	7129300520	221900.0	3	1.00	5650	1.0	0	0	3	7	...	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
1	6414100192	538000.0	3	2.25	7242	2.0	0	0	3	7	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	5631500400	180000.0	2	1.00	10000	1.0	0	0	3	6	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	2487200875	604000.0	4	3.00	5000	1.0	0	0	5	7	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	1954400510	510000.0	3	2.00	8080	1.0	0	0	3	8	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21608	263000018	360000.0	3	2.50	1131	3.0	0	0	3	8	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21609	6600060120	400000.0	4	2.50	5813	2.0	0	0	3	8	...	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21610	1523300141	402101.0	2	0.75	1350	2.0	0	0	3	7	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21611	291310100	400000.0	3	2.50	2388	2.0	0	0	3	8	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21612	1523300157	325000.0	2	0.75	1076	2.0	0	0	3	7	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

21613 rows × 4594 columns

Формирование выборок¶

In [31]:

from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    stratify_colname="floors",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

X_train = X_train.drop(["floors"], axis=1)
X_test = X_test.drop(["floors"], axis=1)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)

'X_train'

	id	price	bedrooms	bathrooms	sqft_lot	waterfront	view	condition	grade	yr_renovated	...	zipcode_98146	zipcode_98148	zipcode_98155	zipcode_98166	zipcode_98168	zipcode_98177	zipcode_98178	zipcode_98188	zipcode_98198	zipcode_98199
20358	9265880170	550000.0	4	2.50	5954	0	0	3	8	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
10171	98030400	790000.0	4	3.50	6098	0	0	3	10	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21214	3343903611	615000.0	5	3.25	7069	0	0	3	9	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
8181	1139000215	416000.0	2	1.75	7560	0	0	4	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
15657	7893805650	475000.0	5	2.00	10200	0	0	3	6	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
14527	3888100133	360000.0	3	1.00	10988	0	0	3	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
13392	7137800085	185000.0	3	1.75	9085	0	0	3	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
15693	2402100675	645000.0	3	3.75	6000	0	0	5	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
19716	3943600070	400000.0	3	2.50	4788	0	0	3	8	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
7612	6679000720	296000.0	3	2.50	5845	0	0	3	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

17290 rows × 4593 columns

'y_train'

	floors
20358	2.0
10171	2.0
21214	2.0
8181	1.5
15657	1.0
...	...
14527	1.0
13392	1.0
15693	2.0
19716	2.0
7612	2.0

17290 rows × 1 columns

'X_test'

	id	price	bedrooms	bathrooms	sqft_lot	waterfront	view	condition	grade	yr_renovated	...	zipcode_98146	zipcode_98148	zipcode_98155	zipcode_98166	zipcode_98168	zipcode_98177	zipcode_98178	zipcode_98188	zipcode_98198	zipcode_98199
12435	1217000340	340000.0	3	1.00	8100	0	0	4	7	0	...	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
19037	984200690	299000.0	5	2.50	9360	0	0	4	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
6609	7971300020	800000.0	5	2.00	10960	0	0	4	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
11869	1498303905	615000.0	4	1.50	3240	0	0	4	8	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
13414	4402700230	352500.0	3	1.50	7680	0	0	3	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
12113	3861500340	279900.0	3	1.75	6620	0	0	3	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5776	1870400470	637800.0	4	1.75	4750	0	0	4	7	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
8671	4022900571	385000.0	5	2.00	11750	0	0	3	7	0	...	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21551	1561750040	1375000.0	5	4.50	13405	0	0	3	11	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
6634	1853200190	612000.0	4	2.50	5974	0	0	3	8	0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

4323 rows × 4593 columns

'y_test'

	floors
12435	1.0
19037	1.0
6609	1.0
11869	1.5
13414	1.0
...	...
12113	1.0
5776	1.5
8671	1.0
21551	2.0
6634	2.0

4323 rows × 1 columns

Определение перечня алгоритмов решения задачи аппроксимации (регрессии)¶

In [32]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
    "mlp": {
        "model": neural_network.MLPRegressor(
            activation="tanh",
            hidden_layer_sizes=(3,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

Определение функции для стандартизации значений в столбце "Температура" для MLP¶

In [34]:

from pandas import DataFrame
from sklearn import preprocessing


stndart_scaler = preprocessing.StandardScaler()


def std_q(df: DataFrame) -> DataFrame:
    df["price"] = np.array(
        stndart_scaler.fit_transform(df["price"].to_numpy().reshape(-1, 1))
    ).reshape(df["price"].shape)
    df["bedrooms"] = np.array(
        stndart_scaler.fit_transform(df["bedrooms"].to_numpy().reshape(-1, 1))
    ).reshape(df["bedrooms"].shape)
    return df

Обучение и оценка моделей с помощью различных алгоритмов¶

In [35]:

from sklearn.decomposition import PCA
import math
from pandas import DataFrame
from sklearn import metrics

# Adding PCA to reduce dimensionality
pca = PCA(n_components=100)  # Adjust based on memory constraints

# Transform X_train and X_test
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

for model_name in models.keys():
    print(f"Model: {model_name}")

    X_train_model = X_train_reduced
    X_test_model = X_test_reduced

    if model_name == "mlp":
        X_train_model = std_q(X_train)
        X_test_model = std_q(X_test)

    fitted_model = models[model_name]["model"].fit(
        X_train_model, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train_model)
    y_test_pred = fitted_model.predict(X_test_model)
    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)

Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
Model: mlp

Вывод результатов оценки¶

In [36]:

reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])

Out[36]:

	RMSE_train	RMSE_test	RMAE_test	R2_test
random_forest	0.345951	0.377087	0.535564	0.513150
linear	0.395435	0.394567	0.556776	0.466967
decision_tree	0.370941	0.419609	0.546781	0.397161
knn	0.407775	0.480928	0.599105	0.208099
mlp	0.542558	0.545098	0.700044	-0.017329
linear_interact	0.689103	0.688884	0.681739	-0.624815
linear_poly	4.966961	5.119087	1.798987	-88.721546
ridge	76554.119536	77087.654118	261.058309	-20346113161.492393

Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок¶

Получение лучшей модели

In [37]:

best_model = str(reg_metrics.sort_values(by="RMSE_test").iloc[0].name)

display(best_model)

'random_forest'

Вывод для обучающей выборки

In [ ]:

pd.concat(
    [
        X_train,
        y_train,
        pd.Series(
            models[best_model]["train_preds"],
            index=y_train.index,
            name="FloorPred",
        ),
    ],
    axis=1,
).head(5)

Out[ ]:

	id	price	bedrooms	bathrooms	sqft_lot	condition	grade	...	zipcode_98198	floors	DensityPred
20358	9265880170	0.026934	0.674642	2.50	5954	3	8	...	0.0	2.0	1.763424
10171	98030400	0.684474	0.674642	3.50	6098	3	10	...	0.0	2.0	1.951525
21214	3343903611	0.205017	1.742826	3.25	7069	3	9	...	0.0	2.0	1.802658
8181	1139000215	-0.340193	-1.461725	1.75	7560	4	7	...	0.0	1.5	1.183226
15657	7893805650	-0.178548	1.742826	2.00	10200	3	6	...	1.0	1.0	1.201648

5 rows × 4595 columns

Вывод для тестовой выборки