AIM-PIbd-31-Yakovlev-M-G/lab_4/lab_4.ipynb


In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import set_config

df = pd.read_csv("data/house_data.csv", sep=",", nrows=10000)
df = df.dropna()  # dropna() returns a copy, so reassign to actually remove rows with missing values
df
Out[112]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 322059264 20140926T000000 279000.0 2 1.00 1020 47044 1.0 0 0 ... 7 1020 0 1904 1958 98042 47.4206 -122.155 1930 12139
9996 5557500270 20150209T000000 262000.0 3 1.50 1700 9579 1.0 0 0 ... 7 1100 600 1962 0 98023 47.3209 -122.338 1700 9628
9997 9164100125 20140807T000000 533000.0 4 1.00 1550 4750 1.5 0 0 ... 7 1550 0 1919 0 98117 47.6824 -122.389 1320 4750
9998 7370600045 20150402T000000 640000.0 3 1.75 1680 8100 1.0 0 2 ... 8 1680 0 1950 0 98177 47.7212 -122.364 1880 7750
9999 8594400060 20140609T000000 285000.0 3 2.25 1680 35127 2.0 0 0 ... 7 1680 0 1987 0 98092 47.3025 -122.067 1820 35166

10000 rows × 21 columns
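
Before dropping rows, it is worth confirming whether the data actually contains missing values; a minimal sketch (the variable name missing_counts is illustrative):

# Count missing values per column; dropna() above only has an effect
# if any of these counts are non-zero.
missing_counts = df.isna().sum()
print(missing_counts[missing_counts > 0])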

Remove outliers in the price column and add a price-category column

In [113]:
q1 = df['price'].quantile(0.25)  # First quartile (Q1)
q3 = df['price'].quantile(0.75)  # Third quartile (Q3)
iqr = q3 - q1  # Interquartile range (IQR)

# Outlier boundaries
lower_bound = q1 - 1.5 * iqr  # Lower bound
upper_bound = q3 + 1.5 * iqr  # Upper bound

# Clip outliers: values below the lower bound are replaced with the lower bound,
# values above the upper bound with the upper bound
df['price'] = df['price'].clip(lower=lower_bound, upper=upper_bound)

# Add a column with price categories
df['price_category'] = pd.cut(
    df['price'],
    bins=[75000, 338750, 602750, 866750, 1130750],
    labels=['low', 'middle', 'high', 'very_high'],
    include_lowest=True,
)
df.tail(20)
Out[113]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
9980 6840700036 20140728T000000 497000.0 2 1.00 770 3325 1.0 0 0 ... 770 0 1918 0 98122 47.6102 -122.299 960 4800 middle
9981 1824069083 20150429T000000 835000.0 3 1.00 3060 30166 1.0 0 0 ... 3060 0 1959 0 98027 47.5656 -122.093 1880 19602 high
9982 1836980240 20141015T000000 730000.0 4 2.75 2920 4500 2.0 0 0 ... 2920 0 1999 0 98006 47.5646 -122.124 2920 4505 high
9983 3528900160 20141001T000000 655000.0 3 1.00 1370 5250 1.0 0 0 ... 1070 300 1939 0 98109 47.6421 -122.348 2410 4200 high
9984 1442800060 20141120T000000 205000.0 3 2.50 1870 3118 2.0 0 0 ... 1870 0 1993 0 98038 47.3739 -122.056 1580 3601 low
9985 8722100030 20150407T000000 632750.0 4 2.00 1800 4800 1.5 0 0 ... 1800 0 1918 0 98112 47.6388 -122.302 1950 4800 high
9986 1723049624 20140512T000000 330000.0 5 3.00 2100 7715 1.0 0 0 ... 1250 850 2013 0 98168 47.4866 -122.319 2100 7959 low
9987 4040400200 20141007T000000 527500.0 5 2.25 2530 8250 2.0 0 0 ... 2530 0 1961 0 98007 47.6117 -122.134 2020 8250 middle
9988 8691391090 20140508T000000 716500.0 4 2.50 3290 6465 2.0 0 0 ... 3290 0 2002 0 98075 47.5981 -121.976 3100 5929 high
9989 7853302190 20141217T000000 388500.0 4 2.50 1890 5395 2.0 0 0 ... 1890 0 2006 0 98065 47.5415 -121.883 2060 5395 middle
9990 3260000700 20140904T000000 530000.0 3 1.75 1680 7770 1.0 0 0 ... 1680 0 1967 0 98005 47.6028 -122.167 1880 7770 middle
9991 5126300510 20150108T000000 419000.0 3 2.50 2170 4517 2.0 0 0 ... 2170 0 2002 0 98059 47.4819 -122.140 2610 4770 middle
9992 7199330370 20150309T000000 385000.0 3 1.75 1200 7360 1.0 0 0 ... 1200 0 1978 0 98052 47.6979 -122.130 1200 7500 middle
9993 1854900240 20140528T000000 655000.0 4 2.50 2990 5669 2.0 0 0 ... 2990 0 2003 0 98074 47.6119 -122.011 3110 5058 high
9994 6738700335 20140701T000000 1127312.5 4 2.75 3770 10900 2.0 0 2 ... 3070 700 1924 0 98144 47.5849 -122.290 3000 5000 very_high
9995 322059264 20140926T000000 279000.0 2 1.00 1020 47044 1.0 0 0 ... 1020 0 1904 1958 98042 47.4206 -122.155 1930 12139 low
9996 5557500270 20150209T000000 262000.0 3 1.50 1700 9579 1.0 0 0 ... 1100 600 1962 0 98023 47.3209 -122.338 1700 9628 low
9997 9164100125 20140807T000000 533000.0 4 1.00 1550 4750 1.5 0 0 ... 1550 0 1919 0 98117 47.6824 -122.389 1320 4750 middle
9998 7370600045 20150402T000000 640000.0 3 1.75 1680 8100 1.0 0 2 ... 1680 0 1950 0 98177 47.7212 -122.364 1880 7750 high
9999 8594400060 20140609T000000 285000.0 3 2.25 1680 35127 2.0 0 0 ... 1680 0 1987 0 98092 47.3025 -122.067 1820 35166 low

20 rows × 22 columns
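
A quick sanity check of the binning is to look at how many houses fall into each price category; a small sketch using the column created above:

# Distribution of the derived price categories; NaN here would mean prices
# outside the [75000, 1130750] bin range.
print(df["price_category"].value_counts(dropna=False))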

Business goals

  1. Predict the price class of a property (classification)
  2. Assess the condition of a property (regression)

Determining the achievable level of model quality for the first task

Splitting the dataset into training and test sets (80/20) for the classification task (target feature: price_category)
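
For reference, the same 80/20 stratified split can be produced directly with train_test_split; the helper defined below generalizes this to an optional validation set. A minimal sketch (train_df and test_df are illustrative names):

from sklearn.model_selection import train_test_split

# Hypothetical one-liner equivalent of the 80/20 split used below.
train_df, test_df = train_test_split(
    df, test_size=0.20, stratify=df["price_category"], random_state=42
)
print(len(train_df), len(test_df))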

In [114]:
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.
    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="price_category", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
9843 3260000340 20140622T000000 732600.0 4 2.50 2130 7300 1.0 0 0 ... 1230 900 1963 0 98005 47.6050 -122.167 2130 7560 high
9623 9828702055 20140508T000000 358000.0 2 1.50 960 1808 2.0 0 0 ... 960 0 1993 0 98122 47.6183 -122.298 1290 1668 middle
3095 3438500625 20140519T000000 210000.0 3 1.00 1080 21043 1.0 0 0 ... 1080 0 1942 0 98106 47.5515 -122.357 1380 7620 low
411 2422029094 20140716T000000 517534.0 2 1.00 833 143947 1.0 0 0 ... 833 0 2006 0 98070 47.3889 -122.482 1380 143947 middle
3060 7462900015 20150108T000000 387000.0 3 2.25 1760 45133 2.0 0 0 ... 1760 0 1984 0 98065 47.5124 -121.866 1910 51773 middle
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1750 2787720140 20150407T000000 416000.0 3 2.50 1790 11542 1.0 0 0 ... 1190 600 1969 0 98059 47.5124 -122.160 1790 9131 middle
2354 6192400400 20140728T000000 775000.0 4 2.50 3090 7112 2.0 0 0 ... 3090 0 2001 0 98052 47.7050 -122.118 3050 6000 high
857 2296500036 20150310T000000 450000.0 4 2.75 2980 13260 1.0 0 0 ... 1800 1180 1979 0 98056 47.5152 -122.197 1920 10731 middle
6181 2787310130 20141212T000000 289950.0 4 1.75 2090 7416 1.0 0 0 ... 1050 1040 1970 0 98031 47.4107 -122.179 1710 7527 low
3141 8567300110 20140604T000000 485000.0 3 2.50 2340 59058 1.0 0 0 ... 2340 0 1985 0 98038 47.4052 -122.028 2700 37263 middle

8000 rows × 22 columns

'y_train'
price_category
9843 high
9623 middle
3095 low
411 middle
3060 middle
... ...
1750 middle
2354 high
857 middle
6181 low
3141 middle

8000 rows × 1 columns

'X_test'
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
5341 6632900574 20150225T000000 595000.0 5 3.00 2980 10064 1.0 0 0 ... 1680 1300 1940 0 98155 47.7372 -122.316 1590 7800 middle
4384 2423029245 20140617T000000 550000.0 3 1.75 2240 78225 2.0 0 0 ... 2240 0 1976 0 98070 47.4638 -122.484 2030 202554 middle
5795 2473370050 20140604T000000 327500.0 4 1.75 1650 7800 1.0 0 0 ... 1650 0 1968 0 98058 47.4507 -122.139 1750 10400 low
4956 9528104985 20141104T000000 611000.0 2 1.00 1270 5100 1.0 0 0 ... 1100 170 1900 0 98115 47.6771 -122.328 1670 3900 high
7723 3972900025 20150313T000000 499000.0 6 1.75 2400 7500 1.5 0 0 ... 1400 1000 1975 0 98155 47.7661 -122.313 1980 7500 middle
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8517 3876600120 20150422T000000 265000.0 3 1.50 1780 10196 1.0 0 0 ... 1270 510 1967 0 98001 47.3375 -122.291 1320 7875 low
6914 6821600005 20150403T000000 710000.0 4 1.75 2120 5400 1.0 0 0 ... 1060 1060 1941 0 98199 47.6501 -122.395 2052 6000 high
4499 2767603931 20140818T000000 469000.0 3 3.25 1370 1194 3.0 0 0 ... 1370 0 2004 0 98107 47.6718 -122.388 1800 2678 middle
8651 8802400411 20140619T000000 249000.0 3 1.00 1050 8498 1.0 0 0 ... 1050 0 1959 0 98031 47.4043 -122.202 1050 8498 low
4234 5452800735 20140722T000000 780000.0 4 2.50 2270 13449 1.0 0 0 ... 1310 960 1975 0 98040 47.5416 -122.232 2810 13475 high

2000 rows × 22 columns

'y_test'
price_category
5341 middle
4384 middle
5795 low
4956 high
7723 middle
... ...
8517 low
6914 high
4499 middle
8651 low
4234 high

2000 rows × 1 columns
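
Because the split is stratified on price_category, the class proportions in the training and test parts should be almost identical; a small check using the frames produced above:

# Compare class shares between the training and test splits.
print(y_train["price_category"].value_counts(normalize=True))
print(y_test["price_category"].value_counts(normalize=True))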

Building the pipeline

preprocessing_num -- pipeline for numeric features: imputation of missing values and standardization

preprocessing_cat -- pipeline for categorical features: imputation of missing values and one-hot encoding

features_preprocessing -- transformer for feature preprocessing

features_engineering -- transformer for feature engineering (currently disabled)

drop_columns -- transformer for dropping columns

pipeline_end -- the main pipeline for data preprocessing and feature engineering

In [ ]:
import numpy as np
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor  # example regression model for the second task

# Return DataFrames from transformers so that later ColumnTransformer steps
# can keep selecting columns by name inside the pipeline.
set_config(transform_output="pandas")

class HousesFeatures(BaseEstimator, TransformerMixin):
    """Maps price_category to an ordinal code (currently not used in the pipeline)."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        def get_price_type(category) -> int:
            if pd.isna(category):
                return 0  # unknown category
            if category == 'low':
                return 1
            elif category == 'middle':
                return 2
            elif category == 'high':
                return 3
            elif category == 'very_high':
                return 4

        # Replace the categorical column with its ordinal code in place
        X["price_category"] = [get_price_type(category) for category in X["price_category"]]
        return X

    def get_feature_names_out(self, features_in):
        # transform() rewrites price_category in place, so no new columns appear
        return features_in

# Columns to drop and columns to preprocess
columns_to_drop = ["date", "view", "waterfront"]
num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop
    and df[column].dtype != "object"
    and df[column].dtype != "category"
]
cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop
    and (df[column].dtype == "object" or df[column].dtype == "category")
]

# Preprocessing for numeric columns: impute with the median, then standardize
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

# Preprocessing for categorical columns: impute a constant, then one-hot encode
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_num", preprocessing_num, num_columns),
        ("preprocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="passthrough",
)

# features_engineering = ColumnTransformer(
#     verbose_feature_names_out=False,
#     transformers=[
#         ("add_features", HousesFeatures(), ["price_category"]),
#     ],
#     remainder="passthrough",
# )

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# Not included in pipeline_end: price_category is already one-hot encoded by
# preprocessing_cat inside features_preprocessing.
features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_cat", preprocessing_cat, ["price_category"]),
    ],
    remainder="passthrough",
)

# Drop the unused columns first, then preprocess what is left; the disabled
# features_engineering step and the redundant features_postprocessing step
# are left out so that the pipeline runs end to end.
pipeline_end = Pipeline(
    [
        ("drop_columns", drop_columns),
        ("features_preprocessing", features_preprocessing),
    ]
)

preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)
preprocessed_df
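
Once the preprocessing pipeline has been fitted on the training split, the test split should only be passed through transform, so that imputation and scaling statistics come from the training data alone; a minimal sketch, assuming the pipeline_end defined above (preprocessed_test_df is an illustrative name):

# Reuse the pipeline fitted on X_train; transform (not fit_transform) keeps
# the test data out of the imputation/scaling statistics.
preprocessed_test_df = pd.DataFrame(
    pipeline_end.transform(X_test),
    columns=pipeline_end.get_feature_names_out(),
)
preprocessed_test_df.head()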