72 KiB
Raw Permalink Blame History

Загрузка данных

In [30]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
import numpy as np  # type: ignore

from sklearn import set_config

set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("data/kc_house_data.csv", index_col=False)

encoder = OneHotEncoder(sparse_output=False, drop="first")

encoded_values = encoder.fit_transform(df[["date","price","yr_built", "zipcode"]])

encoded_columns = encoder.get_feature_names_out(["date","price", "yr_built", "zipcode"])

encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)

df = pd.concat([df, encoded_values_df], axis=1)

df = df.drop(
    [
        "yr_built",
        "date",
        "lat",
        "sqft_living15",
        "sqft_lot15",
        "zipcode",
        "sqft_basement",
        "sqft_above",
        "sqft_living",
    ],
    axis=1,
)

df
Out[30]:
id price bedrooms bathrooms sqft_lot floors waterfront view condition grade ... zipcode_98146 zipcode_98148 zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199
0 7129300520 221900.0 3 1.00 5650 1.0 0 0 3 7 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
1 6414100192 538000.0 3 2.25 7242 2.0 0 0 3 7 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 5631500400 180000.0 2 1.00 10000 1.0 0 0 3 6 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 2487200875 604000.0 4 3.00 5000 1.0 0 0 5 7 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1954400510 510000.0 3 2.00 8080 1.0 0 0 3 8 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21608 263000018 360000.0 3 2.50 1131 3.0 0 0 3 8 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21609 6600060120 400000.0 4 2.50 5813 2.0 0 0 3 8 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21610 1523300141 402101.0 2 0.75 1350 2.0 0 0 3 7 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21611 291310100 400000.0 3 2.50 2388 2.0 0 0 3 8 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21612 1523300157 325000.0 2 0.75 1076 2.0 0 0 3 7 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

21613 rows × 4594 columns

Формирование выборок

In [31]:
from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    stratify_colname="floors",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

X_train = X_train.drop(["floors"], axis=1)
X_test = X_test.drop(["floors"], axis=1)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
id price bedrooms bathrooms sqft_lot waterfront view condition grade yr_renovated ... zipcode_98146 zipcode_98148 zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199
20358 9265880170 550000.0 4 2.50 5954 0 0 3 8 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
10171 98030400 790000.0 4 3.50 6098 0 0 3 10 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21214 3343903611 615000.0 5 3.25 7069 0 0 3 9 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8181 1139000215 416000.0 2 1.75 7560 0 0 4 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
15657 7893805650 475000.0 5 2.00 10200 0 0 3 6 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14527 3888100133 360000.0 3 1.00 10988 0 0 3 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13392 7137800085 185000.0 3 1.75 9085 0 0 3 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
15693 2402100675 645000.0 3 3.75 6000 0 0 5 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
19716 3943600070 400000.0 3 2.50 4788 0 0 3 8 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7612 6679000720 296000.0 3 2.50 5845 0 0 3 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

17290 rows × 4593 columns

'y_train'
floors
20358 2.0
10171 2.0
21214 2.0
8181 1.5
15657 1.0
... ...
14527 1.0
13392 1.0
15693 2.0
19716 2.0
7612 2.0

17290 rows × 1 columns

'X_test'
id price bedrooms bathrooms sqft_lot waterfront view condition grade yr_renovated ... zipcode_98146 zipcode_98148 zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199
12435 1217000340 340000.0 3 1.00 8100 0 0 4 7 0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
19037 984200690 299000.0 5 2.50 9360 0 0 4 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6609 7971300020 800000.0 5 2.00 10960 0 0 4 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
11869 1498303905 615000.0 4 1.50 3240 0 0 4 8 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13414 4402700230 352500.0 3 1.50 7680 0 0 3 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12113 3861500340 279900.0 3 1.75 6620 0 0 3 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5776 1870400470 637800.0 4 1.75 4750 0 0 4 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8671 4022900571 385000.0 5 2.00 11750 0 0 3 7 0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21551 1561750040 1375000.0 5 4.50 13405 0 0 3 11 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6634 1853200190 612000.0 4 2.50 5974 0 0 3 8 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

4323 rows × 4593 columns

'y_test'
floors
12435 1.0
19037 1.0
6609 1.0
11869 1.5
13414 1.0
... ...
12113 1.0
5776 1.5
8671 1.0
21551 2.0
6634 2.0

4323 rows × 1 columns

Определение перечня алгоритмов решения задачи аппроксимации (регрессии)

In [32]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
    "mlp": {
        "model": neural_network.MLPRegressor(
            activation="tanh",
            hidden_layer_sizes=(3,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

Определение функции для стандартизации значений в столбце "Температура" для MLP

In [34]:
from pandas import DataFrame
from sklearn import preprocessing


stndart_scaler = preprocessing.StandardScaler()


def std_q(df: DataFrame) -> DataFrame:
    df["price"] = np.array(
        stndart_scaler.fit_transform(df["price"].to_numpy().reshape(-1, 1))
    ).reshape(df["price"].shape)
    df["bedrooms"] = np.array(
        stndart_scaler.fit_transform(df["bedrooms"].to_numpy().reshape(-1, 1))
    ).reshape(df["bedrooms"].shape)
    return df

Обучение и оценка моделей с помощью различных алгоритмов

In [35]:
from sklearn.decomposition import PCA
import math
from pandas import DataFrame
from sklearn import metrics

# Adding PCA to reduce dimensionality
pca = PCA(n_components=100)  # Adjust based on memory constraints

# Transform X_train and X_test
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

for model_name in models.keys():
    print(f"Model: {model_name}")

    X_train_model = X_train_reduced
    X_test_model = X_test_reduced

    if model_name == "mlp":
        X_train_model = std_q(X_train)
        X_test_model = std_q(X_test)

    fitted_model = models[model_name]["model"].fit(
        X_train_model, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train_model)
    y_test_pred = fitted_model.predict(X_test_model)
    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)
Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
Model: mlp

Вывод результатов оценки

In [36]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[36]:
  RMSE_train RMSE_test RMAE_test R2_test
random_forest 0.345951 0.377087 0.535564 0.513150
linear 0.395435 0.394567 0.556776 0.466967
decision_tree 0.370941 0.419609 0.546781 0.397161
knn 0.407775 0.480928 0.599105 0.208099
mlp 0.542558 0.545098 0.700044 -0.017329
linear_interact 0.689103 0.688884 0.681739 -0.624815
linear_poly 4.966961 5.119087 1.798987 -88.721546
ridge 76554.119536 77087.654118 261.058309 -20346113161.492393

Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок

Получение лучшей модели

In [37]:
best_model = str(reg_metrics.sort_values(by="RMSE_test").iloc[0].name)

display(best_model)
'random_forest'

Вывод для обучающей выборки

In [ ]:
pd.concat(
    [
        X_train,
        y_train,
        pd.Series(
            models[best_model]["train_preds"],
            index=y_train.index,
            name="FloorPred",
        ),
    ],
    axis=1,
).head(5)
Out[ ]:
id price bedrooms bathrooms sqft_lot waterfront view condition grade yr_renovated ... zipcode_98155 zipcode_98166 zipcode_98168 zipcode_98177 zipcode_98178 zipcode_98188 zipcode_98198 zipcode_98199 floors DensityPred
20358 9265880170 0.026934 0.674642 2.50 5954 0 0 3 8 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.763424
10171 98030400 0.684474 0.674642 3.50 6098 0 0 3 10 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.951525
21214 3343903611 0.205017 1.742826 3.25 7069 0 0 3 9 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.802658
8181 1139000215 -0.340193 -1.461725 1.75 7560 0 0 4 7 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.5 1.183226
15657 7893805650 -0.178548 1.742826 2.00 10200 0 0 3 6 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.201648

5 rows × 4595 columns

Вывод для тестовой выборки

In [ ]:
pd.concat(
    [
        X_test,
        y_test,
        pd.Series(
            models[best_model]["preds"],
            index=y_test.index,
            name="FloorsPred",
        ),
    ],
    axis=1,
).head(5)
Out[ ]:
T Al2O3 TiO2 Density DensityPred
0 30 0.00 0.0 1.05696 1.057040
1 55 0.00 0.0 1.04158 1.041341
2 25 0.05 0.0 1.08438 1.084063
3 30 0.05 0.0 1.08112 1.080764
4 35 0.05 0.0 1.07781 1.077444