148 KiB
148 KiB
In [17]:
import pandas as pd

# Load the King County house-sales dataset; drop the original "id" index
# in favour of a clean 0..n-1 RangeIndex so later axis=1 concats align.
house = pd.read_csv("data/kc_house_data.csv", index_col="id").reset_index(drop=True)
house
Out[17]:
In [18]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Dense one-hot encoding, dropping the first category of each feature to
# avoid collinearity.
# NOTE(review): "price" is continuous with many unique values, so one-hot
# encoding it yields a very wide matrix — confirm this is intentional.
onehot_features = ["yr_built", "price"]
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded_values = encoder.fit_transform(house[onehot_features])
encoded_columns = encoder.get_feature_names_out(onehot_features)
encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)
encoded_values_df
Out[18]:
In [5]:
# Attach the one-hot columns to the house frame; both share the same
# RangeIndex, so an index join is equivalent to pd.concat(axis=1) here.
house = house.join(encoded_values_df)
house
Out[5]:
In [6]:
# Binning configuration shared by the examples below: three age buckets.
num_bins = 3
labels = ["old", "middle", "new"]
In [19]:
# Equal-width binning of construction year (median-imputed) into num_bins
# bins; np.histogram returns the counts and the bin edges.
yr_built_filled = house["yr_built"].fillna(house["yr_built"].median())
hist1, bins1 = np.histogram(yr_built_filled, bins=num_bins)
bins1, hist1
Out[19]:
In [20]:
# Label each year with its equal-width interval from np.histogram's edges.
# include_lowest=True keeps pd.cut consistent with np.histogram: without it
# the left edge of the first bin (which equals the minimum year, since the
# edges were derived from the data) is excluded and maps to NaN.
pd.concat(
    [house["yr_built"], pd.cut(house["yr_built"], list(bins1), include_lowest=True)],
    axis=1,
).head(20)
Out[20]:
In [21]:
# Same interval assignment, but with human-readable labels instead of
# interval objects. include_lowest=True fixes the left-edge exclusion:
# otherwise the minimum year (the first histogram edge) would become NaN.
pd.concat(
    [
        house["yr_built"],
        pd.cut(house["yr_built"], list(bins1), labels=labels, include_lowest=True),
    ],
    axis=1,
).head(20)
Out[21]:
In [11]:
# Four equal-width bins over [1899, 2015] built by hand: np.digitize maps
# each (median-imputed) year to a 1-based bin index, np.bincount counts
# how many years landed in each bin.
bins2 = np.linspace(1899, 2015, 5)
yr_imputed = house["yr_built"].fillna(house["yr_built"].median())
tmp_bins2 = np.digitize(yr_imputed, bins2)
hist2 = np.bincount(tmp_bins2 - 1)
bins2, hist2
Out[11]:
In [22]:
# Assign each year to one of the hand-made equal-width bins.
# include_lowest=True makes the first interval closed on the left, so a
# year exactly equal to the lowest edge (1899) would not be dropped to NaN.
pd.concat(
    [house["yr_built"], pd.cut(house["yr_built"], list(bins2), include_lowest=True)],
    axis=1,
).head(20)
Out[22]:
In [23]:
# Custom (unequal-width) bin edges; np.histogram just counts within them.
custom_edges = [1899, 1957, 2001, 2015]
hist3, bins3 = np.histogram(
    house["yr_built"].fillna(house["yr_built"].median()), bins=custom_edges
)
bins3, hist3
Out[23]:
In [24]:
# Assign each year to one of the custom intervals. include_lowest=True
# closes the first interval on the left so a year equal to the lowest
# edge (1899) would not silently become NaN.
pd.concat(
    [house["yr_built"], pd.cut(house["yr_built"], list(bins3), include_lowest=True)],
    axis=1,
).head(20)
Out[24]:
In [25]:
# Same custom intervals, now with readable labels. include_lowest=True
# protects the lowest edge (1899) from being excluded into NaN.
pd.concat(
    [
        house["yr_built"],
        pd.cut(house["yr_built"], list(bins3), labels=labels, include_lowest=True),
    ],
    axis=1,
).head(20)
Out[25]:
In [26]:
# Quantile binning: qcut builds three equally-populated bins;
# labels=False returns the integer bin index instead of an interval.
quantile_bin = pd.qcut(house["yr_built"], q=3, labels=False)
pd.concat([house["yr_built"], quantile_bin], axis=1).head(20)
Out[26]:
In [27]:
# Same quantile bins, now mapped to the human-readable age labels.
quantile_bin_labeled = pd.qcut(house["yr_built"], q=3, labels=labels)
pd.concat([house["yr_built"], quantile_bin_labeled], axis=1).head(20)
Out[27]:
Пример конструирования признаков на основе существующих
In [28]:
# Drop columns not needed for this example, then rows with any NaNs.
house_cleaned = house.drop(["waterfront", "view", "condition"], axis=1, errors="ignore")
house_cleaned = house_cleaned.dropna()
# "Price_category": split houses into three equally-populated price tiers.
house_cleaned["Price_category"] = pd.qcut(
    house_cleaned["price"], q=3, labels=["Low", "Medium", "High"]
)
# "Renovated_flag": 1 if the house was ever renovated, else 0.
# Vectorized comparison replaces the original row-wise apply(lambda ...) —
# identical result, much faster on a frame of this size.
house_cleaned["Renovated_flag"] = (house_cleaned["yr_renovated"] > 0).astype(int)
# "Zipcode_area": first three digits of the ZIP code as a coarse region key.
# .astype(str).str[:3] replaces apply(lambda x: str(x)[:3]) — same output.
house_cleaned["Zipcode_area"] = house_cleaned["zipcode"].astype(str).str[:3]
house_cleaned
Out[28]:
In [30]:
import featuretools as ft
from woodwork.logical_types import Categorical, Datetime

# Load the raw data (the duplicated import lines of the original cell were
# removed; each module is imported once).
df = pd.read_csv("data/kc_house_data.csv")
# Ensure a unique per-row identifier.
# NOTE(review): this overwrites the dataset's own "id" column with a plain
# row number; if the original sale id matters, keep it under another name.
df["id"] = range(len(df))

# EntitySet with a single dataframe describing the houses, with explicit
# logical types for the date and the categorical columns.
es = ft.EntitySet(id="house_sales")
es = es.add_dataframe(
    dataframe_name="houses",
    dataframe=df,
    index="id",  # unique row identifier
    logical_types={
        "date": Datetime,
        "zipcode": Categorical,
        "condition": Categorical,
        "grade": Categorical,
        "view": Categorical,
        "waterfront": Categorical,
    },
)

# Deep Feature Synthesis. The discarded ft.primitives.list_primitives()
# call was removed (its return value was never used). Aggregation
# primitives are listed for completeness; with a single dataframe the
# transform primitives (date parts) do the actual work.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="houses",
    agg_primitives=["mean", "count", "mode", "sum"],
    trans_primitives=["year", "month", "weekday", "hour"],
    max_depth=2,  # maximum stacking depth for generated features
)
# End the cell with the expression so the notebook renders a rich table
# instead of a plain-text print().
feature_matrix.head()
In [31]:
# Box plot of the ordinal "condition" score to eyeball its spread and outliers.
house.boxplot(column="condition")
Out[31]:
In [33]:
# Work on a copy so the original frame keeps its raw values.
house_norm = house.copy()
# Clamp "condition" into [2, 5] to tame the low-end outliers.
house_norm["ConditionClip"] = house["condition"].clip(lower=2, upper=5)
# Show the rows whose raw condition fell below the clip floor.
house_norm[house_norm["condition"] < 2][["date", "price", "ConditionClip"]]
Out[33]:
In [34]:
from scipy.stats.mstats import winsorize

# 95th percentile for reference before winsorizing.
print(house_norm["condition"].quantile(q=0.95))
# Winsorize: pull the lowest 1% / highest 5% of values in to the cut points
# (mean-imputing NaNs first so winsorize sees a complete array).
condition_imputed = house_norm["condition"].fillna(house_norm["condition"].mean())
house_norm["ConditionWinsorize"] = winsorize(
    condition_imputed, (0.01, 0.05), inplace=False
)
house_norm[house_norm["condition"] < 2][["date", "condition", "ConditionWinsorize"]]
Out[34]:
In [35]:
from sklearn import preprocessing


def scale_series(series, scaler):
    """Fit `scaler` on a single pandas Series and return the scaled 1-D array.

    Replaces the four copy-pasted fit_transform/reshape blocks of the
    original cell; the reshape(-1, 1)/reshape(shape) round-trip is the
    column-vector form sklearn scalers require.
    """
    return scaler.fit_transform(series.to_numpy().reshape(-1, 1)).reshape(series.shape)


min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))

# Min-max scale the raw, clipped and winsorized condition columns to [0, 1],
# plus a [-1, 1] variant of the winsorized column.
house_norm["ConditionNorm"] = scale_series(house_norm["condition"], min_max_scaler)
house_norm["ConditionClipNorm"] = scale_series(house_norm["ConditionClip"], min_max_scaler)
house_norm["ConditionWinsorizeNorm"] = scale_series(
    house_norm["ConditionWinsorize"], min_max_scaler
)
house_norm["ConditionWinsorizeNorm2"] = scale_series(
    house_norm["ConditionWinsorize"], min_max_scaler_2
)
house_norm[
    [
        "price",
        "condition",
        "ConditionNorm",
        "ConditionClipNorm",
        "ConditionWinsorizeNorm",
        "ConditionWinsorizeNorm2",
    ]
].head(20)
Out[35]:
In [36]:
from sklearn import preprocessing

# Name kept from the original cell (including the typo) in case later
# cells reference it.
stndart_scaler = preprocessing.StandardScaler()

# Z-score standardize the raw, clipped and winsorized condition columns.
# A single loop replaces the three copy-pasted fit_transform blocks of the
# original; each column is fit independently, exactly as before.
for src_col, dst_col in [
    ("condition", "ConditionStand"),
    ("ConditionClip", "ConditionClipStand"),
    ("ConditionWinsorize", "ConditionWinsorizeStand"),
]:
    house_norm[dst_col] = stndart_scaler.fit_transform(
        house_norm[src_col].to_numpy().reshape(-1, 1)
    ).reshape(house_norm[src_col].shape)

house_norm[
    [
        "price",
        "condition",
        "ConditionStand",
        "ConditionClipStand",
        "ConditionWinsorizeStand",
    ]
].head(20)
Out[36]: