72 KiB
72 KiB
Загрузка данных¶
In [30]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np  # type: ignore
from sklearn import set_config

# Make every sklearn transformer return a pandas DataFrame (with feature
# names) instead of a bare numpy array.
set_config(transform_output="pandas")

random_state = 9

# Load the King County house sales dataset.
df = pd.read_csv("data/kc_house_data.csv", index_col=False)

# One-hot encode the selected columns; drop="first" removes one level per
# feature to avoid perfect collinearity.
# NOTE(review): "price" is a continuous value (and appears to be the
# regression target downstream) — one-hot encoding it creates one column per
# distinct price and is almost certainly unintended; confirm before relying
# on these features.
encoder = OneHotEncoder(sparse_output=False, drop="first")
# With transform_output="pandas", fit_transform already returns a DataFrame
# carrying the encoded feature names, so the manual get_feature_names_out()
# + pd.DataFrame(...) round-trip of the previous version is unnecessary.
encoded_values_df = encoder.fit_transform(df[["date", "price", "yr_built", "zipcode"]])
df = pd.concat([df, encoded_values_df], axis=1)

# Drop the raw columns that were either encoded above or excluded from the
# feature set.
df = df.drop(
    [
        "yr_built",
        "date",
        "lat",
        "sqft_living15",
        "sqft_lot15",
        "zipcode",
        "sqft_basement",
        "sqft_above",
        "sqft_living",
    ],
    axis=1,
)
df
Out[30]:
Формирование выборок¶
In [31]:
from utils import split_stratified_into_train_val_test

# Stratified 80/20 train/test split on "floors"; frac_val=0 makes the
# validation split empty, so X_val/y_val are unused below.
# NOTE(review): stratifying on "floors" suggests "floors" is the prediction
# target here — confirm against the helper's contract in utils.py.
X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    stratify_colname="floors",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)
# Remove the stratification/target column from the feature matrices.
X_train = X_train.drop(["floors"], axis=1)
X_test = X_test.drop(["floors"], axis=1)
display("X_train", X_train)
display("y_train", y_train)
display("X_test", X_test)
display("y_test", y_test)
Определение перечня алгоритмов решения задачи аппроксимации (регрессии)¶
In [32]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

# Candidate regressors, keyed by a short name. Fitted estimators, predictions
# and metrics are attached to each entry by the training loop below.
models = {}
models["linear"] = {"model": linear_model.LinearRegression(n_jobs=-1)}
models["linear_poly"] = {
    "model": make_pipeline(
        PolynomialFeatures(degree=2),
        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    )
}
models["linear_interact"] = {
    "model": make_pipeline(
        PolynomialFeatures(interaction_only=True),
        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    )
}
models["ridge"] = {"model": linear_model.RidgeCV()}
models["decision_tree"] = {
    "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
}
models["knn"] = {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)}
models["random_forest"] = {
    "model": ensemble.RandomForestRegressor(
        max_depth=7, random_state=random_state, n_jobs=-1
    )
}
models["mlp"] = {
    "model": neural_network.MLPRegressor(
        activation="tanh",
        hidden_layer_sizes=(3,),
        max_iter=500,
        early_stopping=True,
        random_state=random_state,
    )
}
Определение функции для стандартизации значений в столбцах "price" и "bedrooms" для MLP¶
In [34]:
from pandas import DataFrame
from sklearn import preprocessing

# Kept for backward compatibility with any cell that references it directly.
stndart_scaler = preprocessing.StandardScaler()


def std_q(df: DataFrame) -> DataFrame:
    """Standardize the "price" and "bedrooms" columns (zero mean, unit variance).

    Returns a new DataFrame; the input frame is left unmodified. (The previous
    version mutated its argument in place, which silently standardized
    X_train/X_test for every subsequent cell.)

    NOTE(review): the scaler is re-fit on whatever frame is passed in, so
    calling this on the test set fits on test statistics (data leakage) —
    consider fitting on the training set only and reusing the fitted scaler.
    """
    out = df.copy()
    # StandardScaler standardizes each column independently, so both columns
    # can be transformed in a single call — no per-column reshape round-trip.
    out[["price", "bedrooms"]] = preprocessing.StandardScaler().fit_transform(
        out[["price", "bedrooms"]]
    )
    return out
Обучение и оценка моделей с помощью различных алгоритмов¶
In [35]:
from sklearn.decomposition import PCA
import math
from pandas import DataFrame
from sklearn import metrics

# Reduce dimensionality of the one-hot-heavy feature matrix before fitting.
# NOTE(review): n_components=100 is a memory-driven choice, not tuned.
pca = PCA(n_components=100)
# Fit PCA on the training features only; the test set is merely projected.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

for model_name in models.keys():
    print(f"Model: {model_name}")
    X_train_model = X_train_reduced
    X_test_model = X_test_reduced
    if model_name == "mlp":
        # The MLP trains on standardized (non-PCA) features. Pass copies so
        # std_q cannot mutate X_train/X_test in place — the original frames
        # are displayed in their raw scale by later cells.
        # NOTE(review): std_q re-fits its scaler on the test frame (leakage
        # of test statistics); confirm whether that is acceptable here.
        X_train_model = std_q(X_train.copy())
        X_test_model = std_q(X_test.copy())
    fitted_model = models[model_name]["model"].fit(
        X_train_model, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train_model)
    y_test_pred = fitted_model.predict(X_test_model)
    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    # "RMAE": square root of the mean absolute error — a nonstandard metric,
    # kept as-is for comparability with the existing results table.
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)
Вывод результатов оценки¶
In [36]:
# Collect per-model metrics into a comparison table, sorted so the best
# (lowest) test RMSE comes first, with gradient shading for quick scanning.
metric_cols = ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
reg_metrics = pd.DataFrame.from_dict(models, "index")[metric_cols]
styled = reg_metrics.sort_values(by="RMSE_test").style
styled = styled.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
)
styled.background_gradient(
    cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"]
)
Out[36]:
Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок¶
Получение лучшей модели
In [37]:
# The best model is the first row of the table ordered by test RMSE.
ranked = reg_metrics.sort_values(by="RMSE_test")
best_model = str(ranked.index[0])
display(best_model)
Вывод для обучающей выборки
In [ ]:
# Training features, true target, and the best model's training predictions
# side by side (first five rows).
train_pred_series = pd.Series(
    models[best_model]["train_preds"],
    index=y_train.index,
    name="FloorPred",
)
pd.concat([X_train, y_train, train_pred_series], axis=1).head(5)
Out[ ]:
Вывод для тестовой выборки
In [ ]:
# Test features, true target, and the best model's test predictions side by
# side (first five rows).
test_pred_series = pd.Series(
    models[best_model]["preds"],
    index=y_test.index,
    name="FloorsPred",
)
pd.concat([X_test, y_test, test_pred_series], axis=1).head(5)
Out[ ]: