AIM-PIbd-31-Yakovlev-M-G/lab_4/lab_4.ipynb


In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import set_config

df = pd.read_csv("data/house_data.csv", sep=",", nrows=10000)
df = df.dropna()  # dropna() returns a copy, so reassign to actually remove rows with missing values
df
Out[112]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 322059264 20140926T000000 279000.0 2 1.00 1020 47044 1.0 0 0 ... 7 1020 0 1904 1958 98042 47.4206 -122.155 1930 12139
9996 5557500270 20150209T000000 262000.0 3 1.50 1700 9579 1.0 0 0 ... 7 1100 600 1962 0 98023 47.3209 -122.338 1700 9628
9997 9164100125 20140807T000000 533000.0 4 1.00 1550 4750 1.5 0 0 ... 7 1550 0 1919 0 98117 47.6824 -122.389 1320 4750
9998 7370600045 20150402T000000 640000.0 3 1.75 1680 8100 1.0 0 2 ... 8 1680 0 1950 0 98177 47.7212 -122.364 1880 7750
9999 8594400060 20140609T000000 285000.0 3 2.25 1680 35127 2.0 0 0 ... 7 1680 0 1987 0 98092 47.3025 -122.067 1820 35166

10000 rows × 21 columns
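
Before dropping rows, it is worth confirming whether the data actually contains missing values; a minimal sketch (the variable name missing_counts is illustrative):

# Count missing values per column; dropna() above only has an effect
# if any of these counts are non-zero.
missing_counts = df.isna().sum()
print(missing_counts[missing_counts > 0])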

Remove outliers in the price column and add a price-category column

In [113]:
q1 = df['price'].quantile(0.25)  # First quartile (Q1)
q3 = df['price'].quantile(0.75)  # Third quartile (Q3)
iqr = q3 - q1  # Interquartile range (IQR)

# Outlier boundaries
lower_bound = q1 - 1.5 * iqr  # Lower bound
upper_bound = q3 + 1.5 * iqr  # Upper bound

# Clip outliers: values below the lower bound are replaced with the lower bound,
# values above the upper bound with the upper bound
df['price'] = df['price'].clip(lower=lower_bound, upper=upper_bound)

# Add a column with price categories
df['price_category'] = pd.cut(
    df['price'],
    bins=[75000, 338750, 602750, 866750, 1130750],
    labels=['low', 'middle', 'high', 'very_high'],
    include_lowest=True,
)
df.tail(20)
Out[113]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
9980 6840700036 20140728T000000 497000.0 2 1.00 770 3325 1.0 0 0 ... 770 0 1918 0 98122 47.6102 -122.299 960 4800 middle
9981 1824069083 20150429T000000 835000.0 3 1.00 3060 30166 1.0 0 0 ... 3060 0 1959 0 98027 47.5656 -122.093 1880 19602 high
9982 1836980240 20141015T000000 730000.0 4 2.75 2920 4500 2.0 0 0 ... 2920 0 1999 0 98006 47.5646 -122.124 2920 4505 high
9983 3528900160 20141001T000000 655000.0 3 1.00 1370 5250 1.0 0 0 ... 1070 300 1939 0 98109 47.6421 -122.348 2410 4200 high
9984 1442800060 20141120T000000 205000.0 3 2.50 1870 3118 2.0 0 0 ... 1870 0 1993 0 98038 47.3739 -122.056 1580 3601 low
9985 8722100030 20150407T000000 632750.0 4 2.00 1800 4800 1.5 0 0 ... 1800 0 1918 0 98112 47.6388 -122.302 1950 4800 high
9986 1723049624 20140512T000000 330000.0 5 3.00 2100 7715 1.0 0 0 ... 1250 850 2013 0 98168 47.4866 -122.319 2100 7959 low
9987 4040400200 20141007T000000 527500.0 5 2.25 2530 8250 2.0 0 0 ... 2530 0 1961 0 98007 47.6117 -122.134 2020 8250 middle
9988 8691391090 20140508T000000 716500.0 4 2.50 3290 6465 2.0 0 0 ... 3290 0 2002 0 98075 47.5981 -121.976 3100 5929 high
9989 7853302190 20141217T000000 388500.0 4 2.50 1890 5395 2.0 0 0 ... 1890 0 2006 0 98065 47.5415 -121.883 2060 5395 middle
9990 3260000700 20140904T000000 530000.0 3 1.75 1680 7770 1.0 0 0 ... 1680 0 1967 0 98005 47.6028 -122.167 1880 7770 middle
9991 5126300510 20150108T000000 419000.0 3 2.50 2170 4517 2.0 0 0 ... 2170 0 2002 0 98059 47.4819 -122.140 2610 4770 middle
9992 7199330370 20150309T000000 385000.0 3 1.75 1200 7360 1.0 0 0 ... 1200 0 1978 0 98052 47.6979 -122.130 1200 7500 middle
9993 1854900240 20140528T000000 655000.0 4 2.50 2990 5669 2.0 0 0 ... 2990 0 2003 0 98074 47.6119 -122.011 3110 5058 high
9994 6738700335 20140701T000000 1127312.5 4 2.75 3770 10900 2.0 0 2 ... 3070 700 1924 0 98144 47.5849 -122.290 3000 5000 very_high
9995 322059264 20140926T000000 279000.0 2 1.00 1020 47044 1.0 0 0 ... 1020 0 1904 1958 98042 47.4206 -122.155 1930 12139 low
9996 5557500270 20150209T000000 262000.0 3 1.50 1700 9579 1.0 0 0 ... 1100 600 1962 0 98023 47.3209 -122.338 1700 9628 low
9997 9164100125 20140807T000000 533000.0 4 1.00 1550 4750 1.5 0 0 ... 1550 0 1919 0 98117 47.6824 -122.389 1320 4750 middle
9998 7370600045 20150402T000000 640000.0 3 1.75 1680 8100 1.0 0 2 ... 1680 0 1950 0 98177 47.7212 -122.364 1880 7750 high
9999 8594400060 20140609T000000 285000.0 3 2.25 1680 35127 2.0 0 0 ... 1680 0 1987 0 98092 47.3025 -122.067 1820 35166 low

20 rows × 22 columns
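
A quick sanity check of the binning is to look at how many houses fall into each price category; a small sketch using the column created above:

# Distribution of the derived price categories; NaN here would mean prices
# outside the [75000, 1130750] bin range.
print(df["price_category"].value_counts(dropna=False))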

Business goals

  1. Predict the price class of a property (classification)
  2. Assess the condition of a property (regression)

Determining the achievable level of model quality for the first task

Splitting the dataset into training and test sets (80/20) for the classification task (target feature: price_category)
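
For reference, the same 80/20 stratified split can be produced directly with train_test_split; the helper defined below generalizes this to an optional validation set. A minimal sketch (train_df and test_df are illustrative names):

from sklearn.model_selection import train_test_split

# Hypothetical one-liner equivalent of the 80/20 split used below.
train_df, test_df = train_test_split(
    df, test_size=0.20, stratify=df["price_category"], random_state=42
)
print(len(train_df), len(test_df))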

In [114]:
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.
    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="price_category", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
9843 3260000340 20140622T000000 732600.0 4 2.50 2130 7300 1.0 0 0 ... 1230 900 1963 0 98005 47.6050 -122.167 2130 7560 high
9623 9828702055 20140508T000000 358000.0 2 1.50 960 1808 2.0 0 0 ... 960 0 1993 0 98122 47.6183 -122.298 1290 1668 middle
3095 3438500625 20140519T000000 210000.0 3 1.00 1080 21043 1.0 0 0 ... 1080 0 1942 0 98106 47.5515 -122.357 1380 7620 low
411 2422029094 20140716T000000 517534.0 2 1.00 833 143947 1.0 0 0 ... 833 0 2006 0 98070 47.3889 -122.482 1380 143947 middle
3060 7462900015 20150108T000000 387000.0 3 2.25 1760 45133 2.0 0 0 ... 1760 0 1984 0 98065 47.5124 -121.866 1910 51773 middle
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1750 2787720140 20150407T000000 416000.0 3 2.50 1790 11542 1.0 0 0 ... 1190 600 1969 0 98059 47.5124 -122.160 1790 9131 middle
2354 6192400400 20140728T000000 775000.0 4 2.50 3090 7112 2.0 0 0 ... 3090 0 2001 0 98052 47.7050 -122.118 3050 6000 high
857 2296500036 20150310T000000 450000.0 4 2.75 2980 13260 1.0 0 0 ... 1800 1180 1979 0 98056 47.5152 -122.197 1920 10731 middle
6181 2787310130 20141212T000000 289950.0 4 1.75 2090 7416 1.0 0 0 ... 1050 1040 1970 0 98031 47.4107 -122.179 1710 7527 low
3141 8567300110 20140604T000000 485000.0 3 2.50 2340 59058 1.0 0 0 ... 2340 0 1985 0 98038 47.4052 -122.028 2700 37263 middle

8000 rows × 22 columns

'y_train'
price_category
9843 high
9623 middle
3095 low
411 middle
3060 middle
... ...
1750 middle
2354 high
857 middle
6181 low
3141 middle

8000 rows × 1 columns

'X_test'
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
5341 6632900574 20150225T000000 595000.0 5 3.00 2980 10064 1.0 0 0 ... 1680 1300 1940 0 98155 47.7372 -122.316 1590 7800 middle
4384 2423029245 20140617T000000 550000.0 3 1.75 2240 78225 2.0 0 0 ... 2240 0 1976 0 98070 47.4638 -122.484 2030 202554 middle
5795 2473370050 20140604T000000 327500.0 4 1.75 1650 7800 1.0 0 0 ... 1650 0 1968 0 98058 47.4507 -122.139 1750 10400 low
4956 9528104985 20141104T000000 611000.0 2 1.00 1270 5100 1.0 0 0 ... 1100 170 1900 0 98115 47.6771 -122.328 1670 3900 high
7723 3972900025 20150313T000000 499000.0 6 1.75 2400 7500 1.5 0 0 ... 1400 1000 1975 0 98155 47.7661 -122.313 1980 7500 middle
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8517 3876600120 20150422T000000 265000.0 3 1.50 1780 10196 1.0 0 0 ... 1270 510 1967 0 98001 47.3375 -122.291 1320 7875 low
6914 6821600005 20150403T000000 710000.0 4 1.75 2120 5400 1.0 0 0 ... 1060 1060 1941 0 98199 47.6501 -122.395 2052 6000 high
4499 2767603931 20140818T000000 469000.0 3 3.25 1370 1194 3.0 0 0 ... 1370 0 2004 0 98107 47.6718 -122.388 1800 2678 middle
8651 8802400411 20140619T000000 249000.0 3 1.00 1050 8498 1.0 0 0 ... 1050 0 1959 0 98031 47.4043 -122.202 1050 8498 low
4234 5452800735 20140722T000000 780000.0 4 2.50 2270 13449 1.0 0 0 ... 1310 960 1975 0 98040 47.5416 -122.232 2810 13475 high

2000 rows × 22 columns

'y_test'
price_category
5341 middle
4384 middle
5795 low
4956 high
7723 middle
... ...
8517 low
6914 high
4499 middle
8651 low
4234 high

2000 rows × 1 columns
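
Because the split is stratified on price_category, the class proportions in the training and test parts should be almost identical; a small check using the frames produced above:

# Compare class shares between the training and test splits.
print(y_train["price_category"].value_counts(normalize=True))
print(y_test["price_category"].value_counts(normalize=True))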

Building the pipeline

preprocessing_num -- pipeline for numeric features: imputation of missing values and standardization

preprocessing_cat -- pipeline for categorical features: imputation of missing values and one-hot encoding

features_preprocessing -- transformer for feature preprocessing

features_engineering -- transformer for feature engineering (currently disabled)

drop_columns -- transformer for dropping columns

pipeline_end -- the main pipeline for data preprocessing and feature engineering

In [ ]:
import numpy as np
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor  # example regression model for the second task

# Return DataFrames from transformers so that later ColumnTransformer steps
# can keep selecting columns by name inside the pipeline.
set_config(transform_output="pandas")

class HousesFeatures(BaseEstimator, TransformerMixin):
    """Maps price_category to an ordinal code (currently not used in the pipeline)."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        def get_price_type(category) -> int:
            if pd.isna(category):
                return 0  # unknown category
            if category == 'low':
                return 1
            elif category == 'middle':
                return 2
            elif category == 'high':
                return 3
            elif category == 'very_high':
                return 4

        # Replace the categorical column with its ordinal code in place
        X["price_category"] = [get_price_type(category) for category in X["price_category"]]
        return X

    def get_feature_names_out(self, features_in):
        # transform() rewrites price_category in place, so no new columns appear
        return features_in

# Columns to drop and columns to preprocess
columns_to_drop = ["date", "view", "waterfront"]
num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop
    and df[column].dtype != "object"
    and df[column].dtype != "category"
]
cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop
    and (df[column].dtype == "object" or df[column].dtype == "category")
]

# Preprocessing for numeric columns: impute with the median, then standardize
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

# Preprocessing for categorical columns: impute a constant, then one-hot encode
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_num", preprocessing_num, num_columns),
        ("preprocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="passthrough",
)

# features_engineering = ColumnTransformer(
#     verbose_feature_names_out=False,
#     transformers=[
#         ("add_features", HousesFeatures(), ["price_category"]),
#     ],
#     remainder="passthrough",
# )

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# Not included in pipeline_end: price_category is already one-hot encoded by
# preprocessing_cat inside features_preprocessing.
features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_cat", preprocessing_cat, ["price_category"]),
    ],
    remainder="passthrough",
)

# Drop the unused columns first, then preprocess what is left; the disabled
# features_engineering step and the redundant features_postprocessing step
# are left out so that the pipeline runs end to end.
pipeline_end = Pipeline(
    [
        ("drop_columns", drop_columns),
        ("features_preprocessing", features_preprocessing),
    ]
)

preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)
preprocessed_df
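
Once the preprocessing pipeline has been fitted on the training split, the test split should only be passed through transform, so that imputation and scaling statistics come from the training data alone; a minimal sketch, assuming the pipeline_end defined above (preprocessed_test_df is an illustrative name):

# Reuse the pipeline fitted on X_train; transform (not fit_transform) keeps
# the test data out of the imputation/scaling statistics.
preprocessed_test_df = pd.DataFrame(
    pipeline_end.transform(X_test),
    columns=pipeline_end.get_feature_names_out(),
)
preprocessed_test_df.head()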