From 48af6da74da9eb454b6d1483dd9c487358a1d80f Mon Sep 17 00:00:00 2001 From: "nikbel2004@outlook.com" Date: Thu, 14 Nov 2024 01:37:01 +0400 Subject: [PATCH] laboratory_4 --- laboratory_4/lab4.ipynb | 5712 +++++++++++++++++++++++++++++++++ laboratory_4/requirements.txt | 40 + 2 files changed, 5752 insertions(+) create mode 100644 laboratory_4/lab4.ipynb create mode 100644 laboratory_4/requirements.txt diff --git a/laboratory_4/lab4.ipynb b/laboratory_4/lab4.ipynb new file mode 100644 index 0000000..5584a37 --- /dev/null +++ b/laboratory_4/lab4.ipynb @@ -0,0 +1,5712 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Начинаем работу... \n", + "\n", + "Датасет: Продажи домов в округе Кинг " + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',\n", + " 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',\n", + " 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',\n", + " 'lat', 'long', 'sqft_living15', 'sqft_lot15'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn import set_config\n", + "\n", + "# Установим параметры для вывода\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "random_state = 42\n", + "\n", + "# Подключим датафрейм и выгрузим данные\n", + "df = pd.read_csv(\".//static//csv//kc_house_data.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052020141013T000000221900.031.00118056501.000...711800195509817847.5112-122.25713405650
1641410019220141209T000000538000.032.25257072422.000...72170400195119919812547.7210-122.31916907639
2563150040020150225T000000180000.021.00770100001.000...67700193309802847.7379-122.23327208062
3248720087520141209T000000604000.043.00196050001.000...71050910196509813647.5208-122.39313605000
4195440051020150218T000000510000.032.00168080801.000...816800198709807447.6168-122.04518007503
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", + "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", + "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", + "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", + "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n", + "0 5650 1.0 0 0 ... 7 1180 0 \n", + "1 7242 2.0 0 0 ... 7 2170 400 \n", + "2 10000 1.0 0 0 ... 6 770 0 \n", + "3 5000 1.0 0 0 ... 7 1050 910 \n", + "4 8080 1.0 0 0 ... 8 1680 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "0 1955 0 98178 47.5112 -122.257 1340 \n", + "1 1951 1991 98125 47.7210 -122.319 1690 \n", + "2 1933 0 98028 47.7379 -122.233 2720 \n", + "3 1965 0 98136 47.5208 -122.393 1360 \n", + "4 1987 0 98074 47.6168 -122.045 1800 \n", + "\n", + " sqft_lot15 \n", + "0 5650 \n", + "1 7639 \n", + "2 8062 \n", + "3 5000 \n", + "4 7503 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
count2.161300e+042.161300e+0421613.00000021613.00000021613.0000002.161300e+0421613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.00000021613.000000
mean4.580302e+095.400881e+053.3708422.1147572079.8997361.510697e+041.4943090.0075420.2343033.4094307.6568731788.390691291.5090451971.00513684.40225898077.93980547.560053-122.2138961986.55249212768.455652
std2.876566e+093.671272e+050.9300620.770163918.4408974.142051e+040.5399890.0865170.7663180.6507431.175459828.090978442.57504329.373411401.67924053.5050260.1385640.140828685.39130427304.179631
min1.000102e+067.500000e+040.0000000.000000290.0000005.200000e+021.0000000.0000000.0000001.0000001.000000290.0000000.0000001900.0000000.00000098001.00000047.155900-122.519000399.000000651.000000
25%2.123049e+093.219500e+053.0000001.7500001427.0000005.040000e+031.0000000.0000000.0000003.0000007.0000001190.0000000.0000001951.0000000.00000098033.00000047.471000-122.3280001490.0000005100.000000
50%3.904930e+094.500000e+053.0000002.2500001910.0000007.618000e+031.5000000.0000000.0000003.0000007.0000001560.0000000.0000001975.0000000.00000098065.00000047.571800-122.2300001840.0000007620.000000
75%7.308900e+096.450000e+054.0000002.5000002550.0000001.068800e+042.0000000.0000000.0000004.0000008.0000002210.000000560.0000001997.0000000.00000098118.00000047.678000-122.1250002360.00000010083.000000
max9.900000e+097.700000e+0633.0000008.00000013540.0000001.651359e+063.5000001.0000004.0000005.00000013.0000009410.0000004820.0000002015.0000002015.00000098199.00000047.777600-121.3150006210.000000871200.000000
\n", + "
" + ], + "text/plain": [ + " id price bedrooms bathrooms sqft_living \\\n", + "count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 \n", + "mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 \n", + "std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 \n", + "min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 \n", + "25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 \n", + "50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 \n", + "75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 \n", + "max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 \n", + "\n", + " sqft_lot floors waterfront view condition \\\n", + "count 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 \n", + "mean 1.510697e+04 1.494309 0.007542 0.234303 3.409430 \n", + "std 4.142051e+04 0.539989 0.086517 0.766318 0.650743 \n", + "min 5.200000e+02 1.000000 0.000000 0.000000 1.000000 \n", + "25% 5.040000e+03 1.000000 0.000000 0.000000 3.000000 \n", + "50% 7.618000e+03 1.500000 0.000000 0.000000 3.000000 \n", + "75% 1.068800e+04 2.000000 0.000000 0.000000 4.000000 \n", + "max 1.651359e+06 3.500000 1.000000 4.000000 5.000000 \n", + "\n", + " grade sqft_above sqft_basement yr_built yr_renovated \\\n", + "count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n", + "mean 7.656873 1788.390691 291.509045 1971.005136 84.402258 \n", + "std 1.175459 828.090978 442.575043 29.373411 401.679240 \n", + "min 1.000000 290.000000 0.000000 1900.000000 0.000000 \n", + "25% 7.000000 1190.000000 0.000000 1951.000000 0.000000 \n", + "50% 7.000000 1560.000000 0.000000 1975.000000 0.000000 \n", + "75% 8.000000 2210.000000 560.000000 1997.000000 0.000000 \n", + "max 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 \n", + "\n", + " zipcode lat long sqft_living15 sqft_lot15 \n", + "count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n", + "mean 98077.939805 47.560053 -122.213896 1986.552492 
12768.455652 \n", + "std 53.505026 0.138564 0.140828 685.391304 27304.179631 \n", + "min 98001.000000 47.155900 -122.519000 399.000000 651.000000 \n", + "25% 98033.000000 47.471000 -122.328000 1490.000000 5100.000000 \n", + "50% 98065.000000 47.571800 -122.230000 1840.000000 7620.000000 \n", + "75% 98118.000000 47.678000 -122.125000 2360.000000 10083.000000 \n", + "max 98199.000000 47.777600 -121.315000 6210.000000 871200.000000 " + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0\n", + "date 0\n", + "price 0\n", + "bedrooms 0\n", + "bathrooms 0\n", + "sqft_living 0\n", + "sqft_lot 0\n", + "floors 0\n", + "waterfront 0\n", + "view 0\n", + "condition 0\n", + "grade 0\n", + "sqft_above 0\n", + "sqft_basement 0\n", + "yr_built 0\n", + "yr_renovated 0\n", + "zipcode 0\n", + "lat 0\n", + "long 0\n", + "sqft_living15 0\n", + "sqft_lot15 0\n", + "dtype: int64\n", + "id False\n", + "date False\n", + "price False\n", + "bedrooms False\n", + "bathrooms False\n", + "sqft_living False\n", + "sqft_lot False\n", + "floors False\n", + "waterfront False\n", + "view False\n", + "condition False\n", + "grade False\n", + "sqft_above False\n", + "sqft_basement False\n", + "yr_built False\n", + "yr_renovated False\n", + "zipcode False\n", + "lat False\n", + "long False\n", + "sqft_living15 False\n", + "sqft_lot15 False\n", + "dtype: bool\n" + ] + } + ], + "source": [ + "# Процент пропущенных значений признаков\n", + "for i in df.columns:\n", + " null_rate = df[i].isnull().sum() / len(df) * 100\n", + " if null_rate > 0:\n", + " print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n", + "\n", + "print(df.isnull().sum())\n", + "\n", + "print(df.isnull().any())" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "date object\n", + "price float64\n", + "bedrooms int64\n", + "bathrooms float64\n", + "sqft_living int64\n", + "sqft_lot int64\n", + "floors float64\n", + "waterfront int64\n", + "view int64\n", + "condition int64\n", + "grade int64\n", + "sqft_above int64\n", + "sqft_basement int64\n", + "yr_built int64\n", + "yr_renovated int64\n", + "zipcode int64\n", + "lat float64\n", + "long float64\n", + "sqft_living15 int64\n", + "sqft_lot15 int64\n", + "dtype: object" + ] + }, + "execution_count": 148, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Проверка типов столбцов\n", + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Выбор бизнес-целей \n", + "Для датасета недвижимости предлагаются две бизнес-цели:\n", + "\n", + "*Задача регрессии* – предсказание цены дома (price). Это может помочь риэлторам и аналитикам определить справедливую рыночную стоимость недвижимости. \n", + "\n", + "*Задача классификации* – определение вероятности того, что цена дома будет выше/ниже медианы рынка. Классифицировать дома по ценовым категориям (например, низкая, средняя, высокая цена). Это может помочь определить, какие дома популярны у покупателей.\n", + "\n", + "## Определение достижимого уровня качества модели \n", + "Для регрессии и классификации мы выберем метрики: \n", + "\n", + "Для регрессии будем использовать метрики MAE (средняя абсолютная ошибка) и R^2 (коэффициент детерминации), стремясь к MAE ниже 10% от средней цены. А классификация будет ориентироваться на метрики accuracy и F1-score при целевом значении accuracy около 80%.\n", + "\n", + "## Ориентир для каждой задачи\n", + "Для регрессии ориентиром будет медианная цена (price.median()), так как это стабильное значение. 
Для классификации ориентируемся на среднюю вероятность предсказания класса выше медианы.\n", + "\n", + "## Анализ алгоритмов машинного обучения \n", + "Рассмотрим для задачи регрессии:\n", + "\n", + "*Линейная регрессия:* подходит для простых линейных зависимостей. \n", + "*Дерево решений:* учитывает нелинейные зависимости, может учесть сложные закономерности. \n", + "*Случайный лес:* ансамблевый метод, обобщающий данные и эффективно обрабатывающий выбросы. \n", + "\n", + "Для задачи классификации: \n", + "\n", + "*Логистическая регрессия:* простая модель, подходящая для бинарной классификации. \n", + "*Метод опорных векторов (SVM):* работает хорошо на данных с четкими разделениями. \n", + "*Градиентный бустинг:* подходит для сложных и высокоразмерных данных, обеспечивает высокую точность. \n", + "\n", + "## Выбор моделей \n", + "Выбираем по три модели для каждой задачи:\n", + "\n", + "*Регрессия:* Линейная регрессия, Дерево решений, Случайный лес. \n", + "*Классификация:* Логистическая регрессия, Метод опорных векторов (SVM), Градиентный бустинг. \n", + "\n", + "\n", + "## Построение конвейера и визуализации \n", + "Теперь напишем код для загрузки данных, анализа и подготовки моделей с визуализацией результатов.\n", + "\n", + "\n", + "# Начнём с задачи классификации\n", + "\n", + "Целевой признак --> above_median_price\n", + "\n", + "Формируем выборки. Разделяем набор данных на обучающую и тестовые выборки (80/20) для задачи классификации" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15above_median_priceprice_category
20962127800021020150311T000000110000.021.0082845241.000...0196820079800147.2655-122.244828540200
12284219330039020140923T000000624000.043.252810112501.000...1130198009805247.6920-122.09921101125011
7343428990000520141230T0000001535000.043.25285041002.003...1030190820039812247.6147-122.2852130420012
1424731600014520150325T000000235000.041.00136071321.500...0194109816847.5054-122.3011280717500
1667062940048020140619T000000775000.042.753010159922.000...0199609807547.5895-121.99433301233312
..................................................................
88133270027020140519T000000215000.022.25161020402.000...0197909805647.5180-122.1941950202500
15031712930307020140820T000000735000.042.75304024152.014...0196609811847.5188-122.2562620243312
5234243200013020150414T000000675000.031.75166095491.000...0195609803347.6503-122.1982090954911
1998077410047520140627T000000415000.032.752600646261.500...0200909801447.7185-121.40517406462601
3671884740011520140723T000000590000.032.0024202086521.500...0200509801047.3666-121.978318021213711
\n", + "

17290 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms \\\n", + "20962 1278000210 20150311T000000 110000.0 2 1.00 \n", + "12284 2193300390 20140923T000000 624000.0 4 3.25 \n", + "7343 4289900005 20141230T000000 1535000.0 4 3.25 \n", + "14247 316000145 20150325T000000 235000.0 4 1.00 \n", + "16670 629400480 20140619T000000 775000.0 4 2.75 \n", + "... ... ... ... ... ... \n", + "88 1332700270 20140519T000000 215000.0 2 2.25 \n", + "15031 7129303070 20140820T000000 735000.0 4 2.75 \n", + "5234 2432000130 20150414T000000 675000.0 3 1.75 \n", + "19980 774100475 20140627T000000 415000.0 3 2.75 \n", + "3671 8847400115 20140723T000000 590000.0 3 2.00 \n", + "\n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "20962 828 4524 1.0 0 0 ... 0 \n", + "12284 2810 11250 1.0 0 0 ... 1130 \n", + "7343 2850 4100 2.0 0 3 ... 1030 \n", + "14247 1360 7132 1.5 0 0 ... 0 \n", + "16670 3010 15992 2.0 0 0 ... 0 \n", + "... ... ... ... ... ... ... ... \n", + "88 1610 2040 2.0 0 0 ... 0 \n", + "15031 3040 2415 2.0 1 4 ... 0 \n", + "5234 1660 9549 1.0 0 0 ... 0 \n", + "19980 2600 64626 1.5 0 0 ... 0 \n", + "3671 2420 208652 1.5 0 0 ... 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "20962 1968 2007 98001 47.2655 -122.244 828 \n", + "12284 1980 0 98052 47.6920 -122.099 2110 \n", + "7343 1908 2003 98122 47.6147 -122.285 2130 \n", + "14247 1941 0 98168 47.5054 -122.301 1280 \n", + "16670 1996 0 98075 47.5895 -121.994 3330 \n", + "... ... ... ... ... ... ... \n", + "88 1979 0 98056 47.5180 -122.194 1950 \n", + "15031 1966 0 98118 47.5188 -122.256 2620 \n", + "5234 1956 0 98033 47.6503 -122.198 2090 \n", + "19980 2009 0 98014 47.7185 -121.405 1740 \n", + "3671 2005 0 98010 47.3666 -121.978 3180 \n", + "\n", + " sqft_lot15 above_median_price price_category \n", + "20962 5402 0 0 \n", + "12284 11250 1 1 \n", + "7343 4200 1 2 \n", + "14247 7175 0 0 \n", + "16670 12333 1 2 \n", + "... ... ... ... 
\n", + "88 2025 0 0 \n", + "15031 2433 1 2 \n", + "5234 9549 1 1 \n", + "19980 64626 0 1 \n", + "3671 212137 1 1 \n", + "\n", + "[17290 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
above_median_price
209620
122841
73431
142470
166701
......
880
150311
52341
199800
36711
\n", + "

17290 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " above_median_price\n", + "20962 0\n", + "12284 1\n", + "7343 1\n", + "14247 0\n", + "16670 1\n", + "... ...\n", + "88 0\n", + "15031 1\n", + "5234 1\n", + "19980 0\n", + "3671 1\n", + "\n", + "[17290 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15above_median_priceprice_category
11592202870100020140529T000000635200.041.75164042401.000...720192109811747.6766-122.3681300424011
8984940650053020140912T000000249000.022.00109013572.000...0199009802847.7526-122.2441078131800
8280809700033020140721T000000359950.032.75254086042.000...0199109809247.3209-122.1852260743801
792808102037020140709T0000001355000.043.503550110001.002...1290199909800647.5506-122.13441001001212
10371751850758020150502T000000581000.021.00117040801.000...0190909811747.6784-122.3861560458611
..................................................................
16733721265095020140708T000000336000.042.50253081692.000...0199309800347.2634-122.3122220801301
13151436520062020150312T000000394000.031.00145079301.000...300192309812647.5212-122.3711040774001
11667408330435520150318T000000675000.041.75153036151.500...0191309810347.6529-122.3341650420011
3683289110082020140825T000000213500.031.00122060001.000...0196809800247.3245-122.2091420600000
1205995200064020141027T000000715000.031.50167050602.002...0192509812647.5671-122.3791670511812
\n", + "

4323 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms \\\n", + "11592 2028701000 20140529T000000 635200.0 4 1.75 \n", + "8984 9406500530 20140912T000000 249000.0 2 2.00 \n", + "8280 8097000330 20140721T000000 359950.0 3 2.75 \n", + "792 8081020370 20140709T000000 1355000.0 4 3.50 \n", + "10371 7518507580 20150502T000000 581000.0 2 1.00 \n", + "... ... ... ... ... ... \n", + "16733 7212650950 20140708T000000 336000.0 4 2.50 \n", + "13151 4365200620 20150312T000000 394000.0 3 1.00 \n", + "11667 4083304355 20150318T000000 675000.0 4 1.75 \n", + "3683 2891100820 20140825T000000 213500.0 3 1.00 \n", + "12059 952000640 20141027T000000 715000.0 3 1.50 \n", + "\n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "11592 1640 4240 1.0 0 0 ... 720 \n", + "8984 1090 1357 2.0 0 0 ... 0 \n", + "8280 2540 8604 2.0 0 0 ... 0 \n", + "792 3550 11000 1.0 0 2 ... 1290 \n", + "10371 1170 4080 1.0 0 0 ... 0 \n", + "... ... ... ... ... ... ... ... \n", + "16733 2530 8169 2.0 0 0 ... 0 \n", + "13151 1450 7930 1.0 0 0 ... 300 \n", + "11667 1530 3615 1.5 0 0 ... 0 \n", + "3683 1220 6000 1.0 0 0 ... 0 \n", + "12059 1670 5060 2.0 0 2 ... 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "11592 1921 0 98117 47.6766 -122.368 1300 \n", + "8984 1990 0 98028 47.7526 -122.244 1078 \n", + "8280 1991 0 98092 47.3209 -122.185 2260 \n", + "792 1999 0 98006 47.5506 -122.134 4100 \n", + "10371 1909 0 98117 47.6784 -122.386 1560 \n", + "... ... ... ... ... ... ... \n", + "16733 1993 0 98003 47.2634 -122.312 2220 \n", + "13151 1923 0 98126 47.5212 -122.371 1040 \n", + "11667 1913 0 98103 47.6529 -122.334 1650 \n", + "3683 1968 0 98002 47.3245 -122.209 1420 \n", + "12059 1925 0 98126 47.5671 -122.379 1670 \n", + "\n", + " sqft_lot15 above_median_price price_category \n", + "11592 4240 1 1 \n", + "8984 1318 0 0 \n", + "8280 7438 0 1 \n", + "792 10012 1 2 \n", + "10371 4586 1 1 \n", + "... ... ... ... 
\n", + "16733 8013 0 1 \n", + "13151 7740 0 1 \n", + "11667 4200 1 1 \n", + "3683 6000 0 0 \n", + "12059 5118 1 2 \n", + "\n", + "[4323 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
above_median_price
115921
89840
82800
7921
103711
......
167330
131510
116671
36830
120591
\n", + "

4323 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " above_median_price\n", + "11592 1\n", + "8984 0\n", + "8280 0\n", + "792 1\n", + "10371 1\n", + "... ...\n", + "16733 0\n", + "13151 0\n", + "11667 1\n", + "3683 0\n", + "12059 1\n", + "\n", + "[4323 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id int64\n", + "date object\n", + "price float64\n", + "bedrooms int64\n", + "bathrooms float64\n", + "sqft_living int64\n", + "sqft_lot int64\n", + "floors float64\n", + "waterfront int64\n", + "view int64\n", + "condition int64\n", + "grade int64\n", + "sqft_above int64\n", + "sqft_basement int64\n", + "yr_built int64\n", + "yr_renovated int64\n", + "zipcode int64\n", + "lat float64\n", + "long float64\n", + "sqft_living15 int64\n", + "sqft_lot15 int64\n", + "above_median_price int64\n", + "price_category category\n", + "dtype: object\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """Split a dataframe into stratified train / validation / test parts.

    Args:
        df_input: Dataframe to split; every column (including the
            stratification column) is kept in the returned feature frames.
        stratify_colname: Name of the column whose class proportions are
            preserved in every part.
        frac_train: Fraction of rows assigned to the training part.
        frac_val: Fraction of rows assigned to the validation part. A value
            ``<= 0`` disables the validation part.
        frac_test: Fraction of rows assigned to the test part.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The ``y_*``
        items are single-column dataframes holding ``stratify_colname``.
        When ``frac_val <= 0`` the two validation items are empty dataframes.

    Raises:
        ValueError: If the fractions do not sum to 1.0 (within floating-point
            tolerance) or ``stratify_colname`` is not a column of
            ``df_input``.
    """
    import math

    # Exact `!= 1.0` rejects legitimate inputs such as 0.7 + 0.2 + 0.1, which
    # evaluates to 0.9999999999999999 in binary floating point.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0, abs_tol=1e-9):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[[stratify_colname]]  # Dataframe of just the stratification column.

    # First cut: training part versus everything else.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        # No validation part requested: the remainder is the test part.
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Second cut: divide the remainder between validation and test, keeping
    # their requested ratio relative to each other.
    relative_frac_test = frac_test / (frac_val + frac_test)

    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test
class HouseFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends a living-area / lot-area ratio column."""

    def __init__(self):
        # There are no hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``Living_area_to_Lot_ratio`` added.

        The new column is ``sqft_living / sqft_lot`` computed row by row from
        whatever values those two columns hold in ``X``.
        """
        enriched = X.copy()
        ratio = enriched["sqft_living"] / enriched["sqft_lot"]
        enriched["Living_area_to_Lot_ratio"] = ratio
        return enriched

    def get_feature_names_out(self, features_in):
        """Return the incoming feature names followed by the new column name."""
        added_names = ["Living_area_to_Lot_ratio"]
        return np.append(features_in, added_names, axis=0)
# Numeric pipeline: fill missing values with the median, then standardise.
preprocessing_num_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: fill missing values with the most frequent value, then
# one-hot encode. Dense output is required because the notebook enables
# set_config(transform_output="pandas"), which does not support sparse data.
preprocessing_cat_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Columns removed before modelling:
#   * "id" / "date"        - identifiers, mirroring the notebook's own
#                            X = df.drop(columns=['id', 'date', 'price', ...]).
#   * "above_median_price" - the classification target itself.
#   * "price" / "price_category" - the target is derived directly from price,
#                            so keeping them leaks the answer to every model.
columns_to_drop = ["id", "date", "price", "price_category", "above_median_price"]
# The target must not be listed here: scaling it and passing it on as a
# feature is what produced perfect scores.
numeric_columns = ["sqft_living", "sqft_lot"]
cat_columns = []

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num_class, numeric_columns),
        ("prepocessing_cat", preprocessing_cat_class, cat_columns),
    ],
    remainder="passthrough"
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# NOTE(review): this transformer is not wired into pipeline_end in the code
# visible here - confirm whether it is still needed.
features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ('preprocessing_cat', preprocessing_cat_class, ["price_category"]),
    ],
    remainder="passthrough",
)

# NOTE(review): HouseFeatures runs after scaling, so its ratio is computed
# from standardised (possibly negative or near-zero) values - confirm that
# this ordering is intended.
pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("custom_features", HouseFeatures()),
        ("drop_columns", drop_columns),
    ]
)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sqft_livingsqft_lotabove_median_priceidpricebedroomsbathroomsfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categoryLiving_area_to_Lot_ratio
20962-1.360742-0.262132-0.9946931278000210110000.021.001.000...0196820079800147.2655-122.244828540205.191063
122840.794390-0.0941211.0053352193300390624000.043.251.000...1130198009805247.6920-122.0992110112501-8.440052
73430.837884-0.2727231.00533542899000051535000.043.252.003...1030190820039812247.6147-122.285213042002-3.072292
14247-0.782270-0.196986-0.994693316000145235000.041.001.500...0194109816847.5054-122.3011280717503.971201
166701.0118600.0243301.005335629400480775000.042.752.000...0199609807547.5895-121.994333012333241.589045
..................................................................
88-0.510432-0.324180-0.9946931332700270215000.022.252.000...0197909805647.5180-122.1941950202501.574534
150311.044481-0.3148131.0053357129303070735000.042.752.014...0196609811847.5188-122.256262024332-3.317784
5234-0.456065-0.1366111.0053352432000130675000.031.751.000...0195609803347.6503-122.1982090954913.338418
199800.5660461.239169-0.994693774100475415000.032.751.500...0200909801447.7185-121.40517406462610.456795
36710.3703234.8368251.0053358847400115590000.032.001.500...0200509801047.3666-121.978318021213710.076563
\n", + "

17290 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " sqft_living sqft_lot above_median_price id price \\\n", + "20962 -1.360742 -0.262132 -0.994693 1278000210 110000.0 \n", + "12284 0.794390 -0.094121 1.005335 2193300390 624000.0 \n", + "7343 0.837884 -0.272723 1.005335 4289900005 1535000.0 \n", + "14247 -0.782270 -0.196986 -0.994693 316000145 235000.0 \n", + "16670 1.011860 0.024330 1.005335 629400480 775000.0 \n", + "... ... ... ... ... ... \n", + "88 -0.510432 -0.324180 -0.994693 1332700270 215000.0 \n", + "15031 1.044481 -0.314813 1.005335 7129303070 735000.0 \n", + "5234 -0.456065 -0.136611 1.005335 2432000130 675000.0 \n", + "19980 0.566046 1.239169 -0.994693 774100475 415000.0 \n", + "3671 0.370323 4.836825 1.005335 8847400115 590000.0 \n", + "\n", + " bedrooms bathrooms floors waterfront view ... sqft_basement \\\n", + "20962 2 1.00 1.0 0 0 ... 0 \n", + "12284 4 3.25 1.0 0 0 ... 1130 \n", + "7343 4 3.25 2.0 0 3 ... 1030 \n", + "14247 4 1.00 1.5 0 0 ... 0 \n", + "16670 4 2.75 2.0 0 0 ... 0 \n", + "... ... ... ... ... ... ... ... \n", + "88 2 2.25 2.0 0 0 ... 0 \n", + "15031 4 2.75 2.0 1 4 ... 0 \n", + "5234 3 1.75 1.0 0 0 ... 0 \n", + "19980 3 2.75 1.5 0 0 ... 0 \n", + "3671 3 2.00 1.5 0 0 ... 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "20962 1968 2007 98001 47.2655 -122.244 828 \n", + "12284 1980 0 98052 47.6920 -122.099 2110 \n", + "7343 1908 2003 98122 47.6147 -122.285 2130 \n", + "14247 1941 0 98168 47.5054 -122.301 1280 \n", + "16670 1996 0 98075 47.5895 -121.994 3330 \n", + "... ... ... ... ... ... ... 
# %% Candidate classifiers
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree, svm

# Registry of models to compare. Each value is a dict that starts with the
# unfitted estimator under "model"; the training loop below adds the fitted
# pipeline, predictions and metrics to the same dict.
class_models = {
    "logistic": {"model": linear_model.LogisticRegression(max_iter=150)},
    # "ridge" used to appear twice in this literal, so a RidgeClassifierCV
    # entry was silently overwritten by this one. Only the effective entry is
    # kept: an L2-penalised logistic regression, which also provides the
    # predict_proba the evaluation loop relies on.
    "ridge": {
        "model": linear_model.LogisticRegression(
            max_iter=150, solver='lbfgs', penalty="l2", class_weight="balanced"
        )
    },
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(
            max_depth=5, min_samples_split=10, random_state=random_state
        )
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        # Seeded like the other randomised models so reruns are comparable.
        "model": ensemble.GradientBoostingClassifier(
            n_estimators=210, random_state=random_state
        )
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=5, class_weight="balanced", random_state=random_state
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=200,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

# %% Fit every model on the training split and score it on the test split
import numpy as np
from sklearn import metrics

for model_name, entry in class_models.items():
    print(f"Model: {model_name}")

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", entry["model"])])
    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())

    y_train_predict = model_pipeline.predict(X_train)
    # Test labels come from thresholding the positive-class probability at 0.5.
    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]
    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)

    entry["pipeline"] = model_pipeline
    entry["probs"] = y_test_probs
    entry["preds"] = y_test_predict

    entry.update(
        {
            "Precision_train": metrics.precision_score(
                y_train, y_train_predict, zero_division=1
            ),
            "Precision_test": metrics.precision_score(
                y_test, y_test_predict, zero_division=1
            ),
            "Recall_train": metrics.recall_score(y_train, y_train_predict),
            "Recall_test": metrics.recall_score(y_test, y_test_predict),
            "Accuracy_train": metrics.accuracy_score(y_train, y_train_predict),
            "Accuracy_test": metrics.accuracy_score(y_test, y_test_predict),
            "ROC_AUC_test": metrics.roc_auc_score(y_test, y_test_probs),
            "F1_train": metrics.f1_score(y_train, y_train_predict),
            "F1_test": metrics.f1_score(y_test, y_test_predict),
            "MCC_test": metrics.matthews_corrcoef(y_test, y_test_predict),
            "Cohen_kappa_test": metrics.cohen_kappa_score(y_test, y_test_predict),
            "Confusion_matrix": metrics.confusion_matrix(y_test, y_test_predict),
        }
    )
"outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Draw one confusion matrix per model in a two-column grid.
n_cols = 2
# Round the row count up: truncating len / 2 leaves an odd number of models
# one axis short (IndexError) and gives zero rows for a single model.
n_rows = (len(class_models) + n_cols - 1) // n_cols

# squeeze=False keeps `ax` two-dimensional even when there is only one row.
_, ax = plt.subplots(
    n_rows, n_cols, figsize=(12, 10), sharex=False, sharey=False, squeeze=False
)
for axis, (key, entry) in zip(ax.flat, class_models.items()):
    disp = ConfusionMatrixDisplay(
        confusion_matrix=entry["Confusion_matrix"], display_labels=["Less", "More"]
    ).plot(ax=axis)
    disp.ax_.set_title(key)

# Hide the spare axis that appears when the number of models is odd.
for axis in ax.flat[len(class_models):]:
    axis.set_visible(False)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
Хотя это также является положительным результатом, мы можем заметить, что он местами ниже, чем для класса \"Less\", а местами и выше.\n", + "\n", + "Точность, полнота, верность (аккуратность), F-мера" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
logistic1.0000001.0000000.9997671.0000000.9998841.0000000.9998841.000000
ridge1.0000001.0000000.9996511.0000000.9998261.0000000.9998261.000000
decision_tree1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
random_forest1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
naive_bayes1.0000001.0000000.7867190.7939530.8939270.8975250.8806300.885144
knn0.8724860.8274730.8577740.8209300.8669170.8258150.8650680.824189
mlp0.6875000.6153850.0025580.0037210.5033550.5033540.0050980.007397
# Summary table: precision, recall, accuracy and F1 on train and test for
# every model, sorted by test accuracy and colour-coded per metric group.
score_columns = [
    "Precision_train",
    "Precision_test",
    "Recall_train",
    "Recall_test",
    "Accuracy_train",
    "Accuracy_test",
    "F1_train",
    "F1_test",
]
class_metrics = pd.DataFrame.from_dict(class_models, "index")[score_columns]

ranked = class_metrics.sort_values(by="Accuracy_test", ascending=False)
styled = ranked.style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
)
styled = styled.background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
styled
\n", + "\n", + "ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
logistic1.0000001.0000001.0000001.0000001.000000
ridge1.0000001.0000001.0000001.0000001.000000
decision_tree1.0000001.0000001.0000001.0000001.000000
gradient_boosting1.0000001.0000001.0000001.0000001.000000
random_forest1.0000001.0000001.0000001.0000001.000000
naive_bayes0.8975250.8851440.9995660.7948200.812098
knn0.8258150.8241890.9108230.6516060.651627
mlp0.5033540.0073970.4970710.0014270.012966
# %% Summary table: accuracy, F1, ROC AUC, Cohen's kappa and MCC on the test
# split, sorted by ROC AUC.
quality_columns = [
    "Accuracy_test",
    "F1_test",
    "ROC_AUC_test",
    "Cohen_kappa_test",
    "MCC_test",
]
class_metrics = pd.DataFrame.from_dict(class_models, "index")[quality_columns]

ranked = class_metrics.sort_values(by="ROC_AUC_test", ascending=False)
styled = ranked.style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
)
styled = styled.background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
styled

# %% The model whose row comes first when sorting by test MCC, descending.
by_mcc = class_metrics.sort_values(by="MCC_test", ascending=False)
best_model = str(by_mcc.index[0])

display(best_model)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idPredicteddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15above_median_priceprice_category
\n", + "

0 rows × 24 columns

\n", + "
# Preprocessed view of the test split (reused when a single example is inspected).
preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

# Predictions of the best model on the test split.
y_pred = class_models[best_model]["preds"]

# Index labels of the rows where the prediction disagrees with the true label.
is_wrong = y_test["above_median_price"] != y_pred
error_index = y_test[is_wrong].index.tolist()
display(f"Error items count: {len(error_index)}")

# Original (unprocessed) rows for the misclassified items, with the predicted
# label inserted as the second column.
predicted_by_row = pd.Series(y_pred, index=y_test.index)
error_predicted = predicted_by_row.loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15above_median_priceprice_category
6863112400005020140729T000000461000.041.0126085051.500...0195109817747.7181-122.3711480810011
\n", + "

1 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "6863 1124000050 20140729T000000 461000.0 4 1.0 1260 \n", + "\n", + " sqft_lot floors waterfront view ... sqft_basement yr_built yr_renovated \\\n", + "6863 8505 1.5 0 0 ... 0 1951 0 \n", + "\n", + " zipcode lat long sqft_living15 sqft_lot15 above_median_price \\\n", + "6863 98177 47.7181 -122.371 1480 8100 1 \n", + "\n", + " price_category \n", + "6863 1 \n", + "\n", + "[1 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sqft_livingsqft_lotabove_median_priceidpricebedroomsbathroomsfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_categoryLiving_area_to_Lot_ratio
6863-0.891006-0.1626891.0053351.124000e+09461000.04.01.01.50.00.0...0.01951.00.098177.047.7181-122.3711480.08100.01.05.476729
\n", + "

1 rows × 23 columns

\n", + "
# Inspect the best model's prediction for one row of the test split.
model = class_models[best_model]["pipeline"]

example_id = 6863
# Turn the selected row (a Series) back into a one-row dataframe.
test = X_test.loc[example_id, :].to_frame().T
test_preprocessed = preprocessed_df.loc[example_id, :].to_frame().T
display(test)
display(test_preprocessed)

result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")
# %% Hyperparameter search over the random-forest pipeline
from sklearn.model_selection import GridSearchCV

optimized_model_type = "random_forest"

random_forest_model = class_models[optimized_model_type]["pipeline"]

# Keys are prefixed with "model__" because the classifier is the pipeline
# step named "model".
param_grid = {
    "model__n_estimators": [10, 50, 100],
    "model__max_features": ["sqrt", "log2"],
    "model__max_depth": [5, 7, 10],
    "model__criterion": ["gini", "entropy"],
}

gs_optomizer = GridSearchCV(
    estimator=random_forest_model, param_grid=param_grid, n_jobs=-1
)
gs_optomizer.fit(X_train, y_train.values.ravel())
gs_optomizer.best_params_

# %% Retrain with the hyperparameters the search selected
# Take the values from the search result instead of retyping them: the
# hand-copied version used max_features="log2" although the search reported
# "sqrt". Strip the "model__" step prefix to get plain estimator arguments.
best_model_params = {
    name.split("__", 1)[1]: value
    for name, value in gs_optomizer.best_params_.items()
}

optimized_model = ensemble.RandomForestClassifier(
    random_state=random_state,
    # Same class weighting as the estimator the grid search evaluated.
    class_weight="balanced",
    **best_model_params,
)

result = {}

result["pipeline"] = Pipeline(
    [("pipeline", pipeline_end), ("model", optimized_model)]
).fit(X_train, y_train.values.ravel())
result["train_preds"] = result["pipeline"].predict(X_train)
result["probs"] = result["pipeline"].predict_proba(X_test)[:, 1]
# Test labels come from thresholding the positive-class probability at 0.5.
result["preds"] = np.where(result["probs"] > 0.5, 1, 0)

train_preds = result["train_preds"]
test_preds = result["preds"]

result["Precision_train"] = metrics.precision_score(y_train, train_preds)
result["Precision_test"] = metrics.precision_score(y_test, test_preds)
result["Recall_train"] = metrics.recall_score(y_train, train_preds)
result["Recall_test"] = metrics.recall_score(y_test, test_preds)
result["Accuracy_train"] = metrics.accuracy_score(y_train, train_preds)
result["Accuracy_test"] = metrics.accuracy_score(y_test, test_preds)
result["ROC_AUC_test"] = metrics.roc_auc_score(y_test, result["probs"])
result["F1_train"] = metrics.f1_score(y_train, train_preds)
result["F1_test"] = metrics.f1_score(y_test, test_preds)
result["MCC_test"] = metrics.matthews_corrcoef(y_test, test_preds)
result["Cohen_kappa_test"] = metrics.cohen_kappa_score(y_test, test_preds)
result["Confusion_matrix"] = metrics.confusion_matrix(y_test, test_preds)

# %% Side-by-side record of the old and the retrained model
# Columns follow the keys of `result`; each row is aligned to them by name.
optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Precision_trainPrecision_testRecall_trainRecall_testAccuracy_trainAccuracy_testF1_trainF1_test
Name        
Old1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
New1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Как для обучающей (Precision_train), так и для тестовой (Precision_test) выборки обе модели достигли идеальных значений 1.000000. Это указывает на то, что модели очень точно классифицируют положительные образцы, не пропуская их." + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 Accuracy_testF1_testROC_AUC_testCohen_kappa_testMCC_test
Name     
Old1.0000001.0000001.0000001.0000001.000000
New1.0000001.0000001.0000001.0000001.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оба варианта модели продемонстрировали безупречную точность классификации, достигнув значения 1.000000. Это свидетельствует о том, что модели точно классифицировали все тестовые примеры, не допустив никаких ошибок в предсказаниях." + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n", + ")\n", + "\n", + "for index in range(0, len(optimized_metrics)):\n", + " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n", + " ).plot(ax=ax.flat[index])\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В желтом квадрате мы видим значение 2173, что обозначает количество правильно классифицированных объектов, отнесенных к классу \"Less\". Это свидетельствует о том, что модель успешно идентифицирует объекты этого класса, минимизируя количество ложных положительных срабатываний.\n", + "\n", + "В правом нижнем жёлтом квадрате значение 2150 указывает на количество правильно классифицированных объектов, отнесенных к классу \"More\". Это также является показателем высокой точности модели в определении объектов данного класса." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Задача регресии: предсказание цены дома (price).\n", + "\n", + "Описание: Оценить, какая будет цена дома (price) на основе исторических данных о характеристиках домов, таких как площадь. Целевая переменная: Цена дома (price). 
(среднее значение)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Среднее значение поля: 2079.8997362698374\n", + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n", + "1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n", + "2 5631500400 20150225T000000 180000.0 2 1.00 770 \n", + "3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n", + "4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n", + "\n", + " sqft_lot floors waterfront view ... yr_built yr_renovated zipcode \\\n", + "0 5650 1.0 0 0 ... 1955 0 98178 \n", + "1 7242 2.0 0 0 ... 1951 1991 98125 \n", + "2 10000 1.0 0 0 ... 1933 0 98028 \n", + "3 5000 1.0 0 0 ... 1965 0 98136 \n", + "4 8080 1.0 0 0 ... 1987 0 98074 \n", + "\n", + " lat long sqft_living15 sqft_lot15 above_median_price \\\n", + "0 47.5112 -122.257 1340 5650 0 \n", + "1 47.7210 -122.319 1690 7639 1 \n", + "2 47.7379 -122.233 2720 8062 0 \n", + "3 47.5208 -122.393 1360 5000 1 \n", + "4 47.6168 -122.045 1800 7503 1 \n", + "\n", + " price_category average_price \n", + "0 0 0 \n", + "1 1 1 \n", + "2 0 0 \n", + "3 1 0 \n", + "4 1 0 \n", + "\n", + "[5 rows x 24 columns]\n", + "Статистическое описание DataFrame:\n", + " id price bedrooms bathrooms sqft_living \\\n", + "count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 \n", + "mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 \n", + "std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 \n", + "min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 \n", + "25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 \n", + "50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 \n", + "75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 \n", + "max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 \n", + "\n", + " sqft_lot floors waterfront view condition \\\n", + "count 
2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 \n", + "mean 1.510697e+04 1.494309 0.007542 0.234303 3.409430 \n", + "std 4.142051e+04 0.539989 0.086517 0.766318 0.650743 \n", + "min 5.200000e+02 1.000000 0.000000 0.000000 1.000000 \n", + "25% 5.040000e+03 1.000000 0.000000 0.000000 3.000000 \n", + "50% 7.618000e+03 1.500000 0.000000 0.000000 3.000000 \n", + "75% 1.068800e+04 2.000000 0.000000 0.000000 4.000000 \n", + "max 1.651359e+06 3.500000 1.000000 4.000000 5.000000 \n", + "\n", + " ... sqft_basement yr_built yr_renovated zipcode \\\n", + "count ... 21613.000000 21613.000000 21613.000000 21613.000000 \n", + "mean ... 291.509045 1971.005136 84.402258 98077.939805 \n", + "std ... 442.575043 29.373411 401.679240 53.505026 \n", + "min ... 0.000000 1900.000000 0.000000 98001.000000 \n", + "25% ... 0.000000 1951.000000 0.000000 98033.000000 \n", + "50% ... 0.000000 1975.000000 0.000000 98065.000000 \n", + "75% ... 560.000000 1997.000000 0.000000 98118.000000 \n", + "max ... 
4820.000000 2015.000000 2015.000000 98199.000000 \n", + "\n", + " lat long sqft_living15 sqft_lot15 \\\n", + "count 21613.000000 21613.000000 21613.000000 21613.000000 \n", + "mean 47.560053 -122.213896 1986.552492 12768.455652 \n", + "std 0.138564 0.140828 685.391304 27304.179631 \n", + "min 47.155900 -122.519000 399.000000 651.000000 \n", + "25% 47.471000 -122.328000 1490.000000 5100.000000 \n", + "50% 47.571800 -122.230000 1840.000000 7620.000000 \n", + "75% 47.678000 -122.125000 2360.000000 10083.000000 \n", + "max 47.777600 -121.315000 6210.000000 871200.000000 \n", + "\n", + " above_median_price average_price \n", + "count 21613.000000 21613.00000 \n", + "mean 0.497340 0.42752 \n", + "std 0.500004 0.49473 \n", + "min 0.000000 0.00000 \n", + "25% 0.000000 0.00000 \n", + "50% 0.000000 0.00000 \n", + "75% 1.000000 1.00000 \n", + "max 1.000000 1.00000 \n", + "\n", + "[8 rows x 22 columns]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn import set_config\n", + "\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "# Опция для настройки генерации случайных чисел (если это нужно для других частей кода)\n", + "random_state = 42\n", + "\n", + "# Вычисление среднего значения поля \"sqft_living\" (жилая площадь)\n", + "average_price = df['sqft_living'].mean()\n", + "print(f\"Среднее значение поля: {average_price}\")\n", + "\n", + "# Создание новой колонки, указывающей, превышает ли жилая площадь (sqft_living) среднее значение\n", + "df['average_price'] = (df['sqft_living'] > average_price).astype(int)\n", + "\n", + "# Удаление строк с пропущенными значениями\n", + "df.dropna(inplace=True)\n", + "\n", + "# Вывод DataFrame с новой колонкой\n", + "print(df.head())\n", + "\n", + "# Примерный анализ данных\n", + "print(\"Статистическое описание DataFrame:\")\n", + "print(df.describe())" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {},
+ "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15above_median_priceprice_category
6325546791019020140527T000000325000.031.751780130951.000...0198309804247.3670-122.15227501309501
13473933180058020150310T000000257000.021.00100037001.000...200192909811847.5520-122.2901270500000
17614240700040520150226T000000228500.031.00108074861.500...90194209814647.4838-122.3351170780000
16970546670029020150108T000000288000.032.25209075001.000...810197709803147.3951-122.1721800735000
20868302605936120150417T000000479000.022.50174114392.000...295200709803447.7043-122.20920901045411
..................................................................
11964527220004520141113T000000378000.031.50100069141.000...0194709812547.7144-122.3191000694701
21575957850079020141111T000000399950.032.50308750022.000...0201409802347.2974-122.3492927518301
5390720235048020140930T000000575000.032.50212047802.000...0200409805347.6810-122.0321690265011
860172304903320140620T000000245000.010.75380150001.000...0196309816847.4810-122.32311701500000
15795614765028020150325T000000315000.042.50313059992.000...0200609804247.3837-122.0993020599701
\n", + "

17290 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms \\\n", + "6325 5467910190 20140527T000000 325000.0 3 1.75 \n", + "13473 9331800580 20150310T000000 257000.0 2 1.00 \n", + "17614 2407000405 20150226T000000 228500.0 3 1.00 \n", + "16970 5466700290 20150108T000000 288000.0 3 2.25 \n", + "20868 3026059361 20150417T000000 479000.0 2 2.50 \n", + "... ... ... ... ... ... \n", + "11964 5272200045 20141113T000000 378000.0 3 1.50 \n", + "21575 9578500790 20141111T000000 399950.0 3 2.50 \n", + "5390 7202350480 20140930T000000 575000.0 3 2.50 \n", + "860 1723049033 20140620T000000 245000.0 1 0.75 \n", + "15795 6147650280 20150325T000000 315000.0 4 2.50 \n", + "\n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "6325 1780 13095 1.0 0 0 ... 0 \n", + "13473 1000 3700 1.0 0 0 ... 200 \n", + "17614 1080 7486 1.5 0 0 ... 90 \n", + "16970 2090 7500 1.0 0 0 ... 810 \n", + "20868 1741 1439 2.0 0 0 ... 295 \n", + "... ... ... ... ... ... ... ... \n", + "11964 1000 6914 1.0 0 0 ... 0 \n", + "21575 3087 5002 2.0 0 0 ... 0 \n", + "5390 2120 4780 2.0 0 0 ... 0 \n", + "860 380 15000 1.0 0 0 ... 0 \n", + "15795 3130 5999 2.0 0 0 ... 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "6325 1983 0 98042 47.3670 -122.152 2750 \n", + "13473 1929 0 98118 47.5520 -122.290 1270 \n", + "17614 1942 0 98146 47.4838 -122.335 1170 \n", + "16970 1977 0 98031 47.3951 -122.172 1800 \n", + "20868 2007 0 98034 47.7043 -122.209 2090 \n", + "... ... ... ... ... ... ... \n", + "11964 1947 0 98125 47.7144 -122.319 1000 \n", + "21575 2014 0 98023 47.2974 -122.349 2927 \n", + "5390 2004 0 98053 47.6810 -122.032 1690 \n", + "860 1963 0 98168 47.4810 -122.323 1170 \n", + "15795 2006 0 98042 47.3837 -122.099 3020 \n", + "\n", + " sqft_lot15 above_median_price price_category \n", + "6325 13095 0 1 \n", + "13473 5000 0 0 \n", + "17614 7800 0 0 \n", + "16970 7350 0 0 \n", + "20868 10454 1 1 \n", + "... ... ... ... 
\n", + "11964 6947 0 1 \n", + "21575 5183 0 1 \n", + "5390 2650 1 1 \n", + "860 15000 0 0 \n", + "15795 5997 0 1 \n", + "\n", + "[17290 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
average_price
63250
134730
176140
169701
208680
......
119640
215751
53901
8600
157951
\n", + "

17290 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " average_price\n", + "6325 0\n", + "13473 0\n", + "17614 0\n", + "16970 1\n", + "20868 0\n", + "... ...\n", + "11964 0\n", + "21575 1\n", + "5390 1\n", + "860 0\n", + "15795 1\n", + "\n", + "[17290 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15above_median_priceprice_category
735259182031020141006T000000365000.042.25207088932.000...0198609805847.4388-122.1622390770001
2830797420082020140821T000000865000.053.00290067301.000...1070197709811547.6784-122.2852370628312
4106770145011020140815T0000001038000.042.503770108932.002...0199709800647.5646-122.1293710968512
16218952230001020150331T0000001490000.033.504560146082.002...0199009803447.6995-122.22840501422612
19964951086114020140714T000000711000.032.50255053762.000...0200409805247.6647-122.0832250405012
..................................................................
13674616390033320141110T000000338000.031.75125077101.000...0194709815547.7623-122.3171340771001
20377352896002020140708T000000673000.032.75283034962.000...0201209802947.5606-122.0112160350111
8805168700022020141016T000000285000.042.50243444002.000...0200709800147.2874-122.2832434440000
10168414140003020141201T000000605000.041.752250101081.000...0196709800847.5922-122.1182050975011
2522182250016020141212T000000356500.042.502570114732.000...0200809800347.2809-122.2962430599701
\n", + "

4323 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms \\\n", + "735 2591820310 20141006T000000 365000.0 4 2.25 \n", + "2830 7974200820 20140821T000000 865000.0 5 3.00 \n", + "4106 7701450110 20140815T000000 1038000.0 4 2.50 \n", + "16218 9522300010 20150331T000000 1490000.0 3 3.50 \n", + "19964 9510861140 20140714T000000 711000.0 3 2.50 \n", + "... ... ... ... ... ... \n", + "13674 6163900333 20141110T000000 338000.0 3 1.75 \n", + "20377 3528960020 20140708T000000 673000.0 3 2.75 \n", + "8805 1687000220 20141016T000000 285000.0 4 2.50 \n", + "10168 4141400030 20141201T000000 605000.0 4 1.75 \n", + "2522 1822500160 20141212T000000 356500.0 4 2.50 \n", + "\n", + " sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n", + "735 2070 8893 2.0 0 0 ... 0 \n", + "2830 2900 6730 1.0 0 0 ... 1070 \n", + "4106 3770 10893 2.0 0 2 ... 0 \n", + "16218 4560 14608 2.0 0 2 ... 0 \n", + "19964 2550 5376 2.0 0 0 ... 0 \n", + "... ... ... ... ... ... ... ... \n", + "13674 1250 7710 1.0 0 0 ... 0 \n", + "20377 2830 3496 2.0 0 0 ... 0 \n", + "8805 2434 4400 2.0 0 0 ... 0 \n", + "10168 2250 10108 1.0 0 0 ... 0 \n", + "2522 2570 11473 2.0 0 0 ... 0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "735 1986 0 98058 47.4388 -122.162 2390 \n", + "2830 1977 0 98115 47.6784 -122.285 2370 \n", + "4106 1997 0 98006 47.5646 -122.129 3710 \n", + "16218 1990 0 98034 47.6995 -122.228 4050 \n", + "19964 2004 0 98052 47.6647 -122.083 2250 \n", + "... ... ... ... ... ... ... \n", + "13674 1947 0 98155 47.7623 -122.317 1340 \n", + "20377 2012 0 98029 47.5606 -122.011 2160 \n", + "8805 2007 0 98001 47.2874 -122.283 2434 \n", + "10168 1967 0 98008 47.5922 -122.118 2050 \n", + "2522 2008 0 98003 47.2809 -122.296 2430 \n", + "\n", + " sqft_lot15 above_median_price price_category \n", + "735 7700 0 1 \n", + "2830 6283 1 2 \n", + "4106 9685 1 2 \n", + "16218 14226 1 2 \n", + "19964 4050 1 2 \n", + "... ... ... ... 
\n", + "13674 7710 0 1 \n", + "20377 3501 1 1 \n", + "8805 4400 0 0 \n", + "10168 9750 1 1 \n", + "2522 5997 0 1 \n", + "\n", + "[4323 rows x 23 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
average_price
7350
28301
41061
162181
199641
......
136740
203771
88051
101681
25221
\n", + "

4323 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " average_price\n", + "735 0\n", + "2830 1\n", + "4106 1\n", + "16218 1\n", + "19964 1\n", + "... ...\n", + "13674 0\n", + "20377 1\n", + "8805 1\n", + "10168 1\n", + "2522 1\n", + "\n", + "[4323 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from typing import Tuple\n", + "from pandas import DataFrame\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def split_into_train_test(\n", + " df_input: DataFrame,\n", + " target_colname: str = \"average_price\",\n", + " frac_train: float = 0.8,\n", + " random_state: int = None,\n", + ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:\n", + " \n", + " if not (0 < frac_train < 1):\n", + " raise ValueError(\"Fraction must be between 0 and 1.\")\n", + " \n", + " # Проверка наличия целевого признака\n", + " if target_colname not in df_input.columns:\n", + " raise ValueError(f\"{target_colname} is not a column in the DataFrame.\")\n", + " \n", + " # Разделяем данные на признаки и целевую переменную\n", + " X = df_input.drop(columns=[target_colname]) # Признаки\n", + " y = df_input[[target_colname]] # Целевая переменная\n", + "\n", + " # Разделяем данные на обучающую и тестовую выборки\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " test_size=(1.0 - frac_train),\n", + " random_state=random_state\n", + " )\n", + " \n", + " return X_train, X_test, y_train, y_test\n", + "\n", + "# Применение функции для разделения данных\n", + "X_train, X_test, y_train, y_test = split_into_train_test(\n", + " df, \n", + " target_colname=\"average_price\", \n", + " frac_train=0.8, \n", + " random_state=42 # Убедитесь, что вы задали нужное значение random_state\n", + ")\n", + "\n", + "# Для отображения результатов\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "# Формирование конвейера для решения задачи регрессии" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "class HouseFeatures(BaseEstimator, TransformerMixin):\n", + " def __init__(self):\n", + " pass\n", + " def fit(self, X, y=None):\n", + " return self\n", + " def transform(self, X, y=None):\n", + " # Создание новых признаков\n", + " X = X.copy()\n", + " X[\"Square\"] = X[\"sqft_living\"] / X[\"sqft_lot\"]\n", + " return X\n", + " def get_feature_names_out(self, features_in):\n", + " # Добавление имен новых признаков\n", + " new_features = [\"Square\"]\n", + " return np.append(features_in, new_features, axis=0)\n", + "\n", + "# Указываем столбцы, которые нужно удалить и обрабатывать\n", + "columns_to_drop = [\"date\"]\n", + "num_columns = [\"bathrooms\", \"floors\", \"waterfront\", \"view\"]\n", + "cat_columns = [] \n", + "\n", + "# Определяем предобработку для численных данных\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "# Определяем предобработку для категориальных данных\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", 
sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "# Подготовка признаков с использованием ColumnTransformer\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"preprocessing_num\", preprocessing_num, num_columns),\n", + " (\"preprocessing_cat\", preprocessing_cat, cat_columns),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# Удаление нежелательных столбцов\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# Постобработка признаков\n", + "features_postprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"preprocessing_cat\", preprocessing_cat, [\"price_category\"]), \n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# Создание окончательного конвейера\n", + "pipeline = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " (\"custom_features\", HouseFeatures()),\n", + " (\"model\", RandomForestRegressor()) # Выбор модели для обучения\n", + " ]\n", + ")\n", + "\n", + "# Использование конвейера\n", + "def train_pipeline(X, y):\n", + " pipeline.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Формирование набора моделей для регрессии \n", + "Определение перечня алгоритмов решения задачи аппроксимации (регрессии)" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "from sklearn import linear_model, tree, 
neighbors, ensemble, neural_network\n", + "\n", + "random_state = 9\n", + "\n", + "models = {\n", + " \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n", + " \"linear_poly\": {\n", + " \"model\": make_pipeline(\n", + " PolynomialFeatures(degree=2),\n", + " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n", + " )\n", + " },\n", + " \"linear_interact\": {\n", + " \"model\": make_pipeline(\n", + " PolynomialFeatures(interaction_only=True),\n", + " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n", + " )\n", + " },\n", + " \"ridge\": {\"model\": linear_model.RidgeCV()},\n", + " \"decision_tree\": {\n", + " \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)\n", + " },\n", + " \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n", + " \"random_forest\": {\n", + " \"model\": ensemble.RandomForestRegressor(\n", + " max_depth=7, random_state=random_state, n_jobs=-1\n", + " )\n", + " },\n", + " \"mlp\": {\n", + " \"model\": neural_network.MLPRegressor(\n", + " activation=\"tanh\",\n", + " hidden_layer_sizes=(3,),\n", + " max_iter=500,\n", + " early_stopping=True,\n", + " random_state=random_state,\n", + " )\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Формирование набора моделей для регрессии" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest: Mean Score = 1.0, Standard Deviation = 0.0\n", + "Linear Regression: Mean Score = 0.6396438910587428, Standard Deviation = 0.006348300027629372\n", + "Gradient Boosting: Mean Score = 0.9999999992943781, Standard Deviation = 6.609300428326041e-14\n", + "Support Vector Regression: Mean Score = -0.4335265257004087, Standard Deviation = 0.012071668862264313\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from 
def train_multiple_models(X, y, models, cv=5, scoring=None):
    """Cross-validate several estimators behind the shared preprocessing steps.

    Each estimator is appended to a fresh ``Pipeline`` made of the
    notebook-level ``features_preprocessing`` and ``drop_columns`` steps and
    scored with ``cross_val_score``.

    Args:
        X: Feature table passed to ``cross_val_score``.
        y: Target values. Anything ``numpy.ravel`` accepts (Series,
            single-column DataFrame, ndarray, list) is flattened to 1-D.
        models: Mapping ``name -> estimator``. Entries of the form
            ``name -> {"model": estimator}`` (the shape used by the other
            model registries in this notebook) are accepted as well.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 5, the value that used to be hard-coded.
        scoring: Scoring forwarded to ``cross_val_score``. ``None`` keeps the
            previous behaviour (the estimator's own ``score`` method).

    Returns:
        dict: ``name -> {"mean_score": float, "std_dev": float}`` computed
        over the cross-validation folds.

    Raises:
        Any exception raised while fitting or scoring a fold, because
        ``error_score="raise"`` is passed to ``cross_val_score``.
    """
    # Local import keeps the function usable even if the cell that imports
    # numpy at notebook level has not been executed yet.
    import numpy as np

    # One code path for every target container; the previous hasattr()
    # branching failed on plain Python lists.
    y = np.ravel(y)

    results = {}
    for model_name, spec in models.items():
        # Unwrap {"model": estimator} registry entries; pass bare estimators through.
        estimator = spec["model"] if isinstance(spec, dict) else spec

        model_pipeline = Pipeline(
            [
                ("features_preprocessing", features_preprocessing),
                ("drop_columns", drop_columns),
                ("model", estimator),
            ]
        )

        scores = cross_val_score(
            model_pipeline, X, y, cv=cv, scoring=scoring, error_score="raise"
        )
        results[model_name] = {
            "mean_score": scores.mean(),
            "std_dev": scores.std(),
        }

    return results
"R2 (train): 0.015780807725750634\n", + "R2 (test): 0.045807954005714024\n", + "STD (train): 0.48387852043102103\n", + "STD (test): 0.4780359236045559\n", + "----------------------------------------\n", + "Model: ridge\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "e:\\MII\\laboratory\\mai\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.11596298438403702\n", + "MSE (test): 0.11265325005783021\n", + "MAE (train): 0.11596298438403702\n", + "MAE (test): 0.11265325005783021\n", + "R2 (train): 0.5256347402620505\n", + "R2 (test): 0.541724332939628\n", + "STD (train): 0.3405113334365492\n", + "STD (test): 0.3356321137822519\n", + "----------------------------------------\n", + "Model: decision_tree\n", + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: knn\n", + "MSE (train): 0.1949681897050318\n", + "MSE (test): 0.27989821882951654\n", + "MAE (train): 0.1949681897050318\n", + "MAE (test): 0.27989821882951654\n", + "R2 (train): 0.20245122664507342\n", + "R2 (test): -0.13863153417464114\n", + "STD (train): 0.43948973967967464\n", + "STD (test): 0.5264647910268833\n", + "----------------------------------------\n", + "Model: naive_bayes\n", + "MSE (train): 0.26928860613071137\n", + 
import numpy as np
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Fail early with a readable message when an earlier notebook cell was not run.
if "class_models" not in globals():
    raise ValueError("class_models is not defined")
if any(name not in globals() for name in ("X_train", "X_test", "y_train", "y_test")):
    raise ValueError("Train/test data is not defined")

# The metric functions and the residual subtraction below expect 1-D targets.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Metric label -> callable(y_true, y_pred). The label is used for the stored
# key ("<label>_<split>") and for the printed line ("<label> (<split>): ...").
metric_functions = {
    "MSE": metrics.mean_squared_error,
    "MAE": metrics.mean_absolute_error,
    "R2": metrics.r2_score,
    # Spread of the residuals.
    "STD": lambda y_true, y_hat: np.std(y_true - y_hat),
}

# One summary record per evaluated model (previously created but never filled).
results = []

# NOTE(review): the registry is named `class_models` and the stored output
# lists "logistic" and "naive_bayes" entries, which suggests classifiers.
# Confirm that scoring them with regression metrics is intended.
for model_name, entry in class_models.items():
    print(f"Model: {model_name}")

    model = entry["model"]

    # Shared preprocessing (`pipeline_end`) followed by the candidate model.
    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline.fit(X_train, y_train)

    y_train_predict = model_pipeline.predict(X_train)
    y_test_predict = model_pipeline.predict(X_test)

    # Keep the fitted pipeline and its test predictions next to the model.
    entry["pipeline"] = model_pipeline
    entry["preds"] = y_test_predict

    scores = {}
    for metric_name, metric_fn in metric_functions.items():
        for split, y_true, y_hat in (
            ("train", y_train, y_train_predict),
            ("test", y_test, y_test_predict),
        ):
            value = metric_fn(y_true, y_hat)
            entry[f"{metric_name}_{split}"] = value
            scores[f"{metric_name}_{split}"] = value
            print(f"{metric_name} ({split}): {value}")

    results.append({"model": model_name, **scores})
    print("-" * 40)  # separator between models
"stdout", + "output_type": "stream", + "text": [ + "Model: logistic\n", + "MSE (train): 0.24060150375939848\n", + "MSE (test): 0.23455933379597502\n", + "MAE (train): 0.24060150375939848\n", + "MAE (test): 0.23455933379597502\n", + "R2 (train): 0.015780807725750634\n", + "R2 (test): 0.045807954005714024\n", + "STD (train): 0.48387852043102103\n", + "STD (test): 0.4780359236045559\n", + "----------------------------------------\n", + "Model: ridge\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "e:\\MII\\laboratory\\mai\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE (train): 0.11596298438403702\n", + "MSE (test): 0.11265325005783021\n", + "MAE (train): 0.11596298438403702\n", + "MAE (test): 0.11265325005783021\n", + "R2 (train): 0.5256347402620505\n", + "R2 (test): 0.541724332939628\n", + "STD (train): 0.3405113334365492\n", + "STD (test): 0.3356321137822519\n", + "----------------------------------------\n", + "Model: decision_tree\n", + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: knn\n", + "MSE (train): 0.1949681897050318\n", + "MSE (test): 0.27989821882951654\n", + "MAE (train): 0.1949681897050318\n", + "MAE (test): 0.27989821882951654\n", + "R2 (train): 0.20245122664507342\n", + 
"R2 (test): -0.13863153417464114\n", + "STD (train): 0.43948973967967464\n", + "STD (test): 0.5264647910268833\n", + "----------------------------------------\n", + "Model: naive_bayes\n", + "MSE (train): 0.26928860613071137\n", + "MSE (test): 0.2690261392551469\n", + "MAE (train): 0.26928860613071137\n", + "MAE (test): 0.2690261392551469\n", + "R2 (train): -0.10156840366079445\n", + "R2 (test): -0.09440369772322943\n", + "STD (train): 0.47316941542228536\n", + "STD (test): 0.47206502931490235\n", + "----------------------------------------\n", + "Model: gradient_boosting\n", + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: random_forest\n", + "MSE (train): 0.0\n", + "MSE (test): 0.0\n", + "MAE (train): 0.0\n", + "MAE (test): 0.0\n", + "R2 (train): 1.0\n", + "R2 (test): 1.0\n", + "STD (train): 0.0\n", + "STD (test): 0.0\n", + "----------------------------------------\n", + "Model: mlp\n", + "MSE (train): 0.4253903990746096\n", + "MSE (test): 0.4353458246588018\n", + "MAE (train): 0.4253903990746096\n", + "MAE (test): 0.4353458246588018\n", + "R2 (train): -0.7401279228791116\n", + "R2 (test): -0.7709954936501442\n", + "STD (train): 0.4959884986820156\n", + "STD (test): 0.49782384226978177\n", + "----------------------------------------\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn import metrics\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "# Проверка наличия необходимых переменных\n", + "if 'class_models' not in locals():\n", + " raise ValueError(\"class_models is not defined\")\n", + "if 'X_train' not in locals() or 'X_test' not in locals() or 'y_train' not in locals() or 'y_test' not in locals():\n", + " raise ValueError(\"Train/test data is not defined\")\n", + "\n", + "\n", + "y_train = np.ravel(y_train) \n", + "y_test = 
np.ravel(y_test) \n", + "\n", + "# Инициализация списка для хранения результатов\n", + "results = []\n", + "\n", + "# Проход по моделям и оценка их качества\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: {model_name}\")\n", + " \n", + " # Извлечение модели из словаря\n", + " model = class_models[model_name][\"model\"]\n", + " \n", + " # Создание пайплайна\n", + " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", + " \n", + " # Обучение модели\n", + " model_pipeline.fit(X_train, y_train)\n", + "\n", + " # Предсказание для обучающей и тестовой выборки\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_predict = model_pipeline.predict(X_test)\n", + "\n", + " # Сохранение пайплайна и предсказаний\n", + " class_models[model_name][\"pipeline\"] = model_pipeline\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " # Вычисление метрик для регрессии\n", + " class_models[model_name][\"MSE_train\"] = metrics.mean_squared_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MSE_test\"] = metrics.mean_squared_error(y_test, y_test_predict)\n", + " class_models[model_name][\"MAE_train\"] = metrics.mean_absolute_error(y_train, y_train_predict)\n", + " class_models[model_name][\"MAE_test\"] = metrics.mean_absolute_error(y_test, y_test_predict)\n", + " class_models[model_name][\"R2_train\"] = metrics.r2_score(y_train, y_train_predict)\n", + " class_models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_predict)\n", + "\n", + " # Дополнительные метрики\n", + " class_models[model_name][\"STD_train\"] = np.std(y_train - y_train_predict)\n", + " class_models[model_name][\"STD_test\"] = np.std(y_test - y_test_predict)\n", + "\n", + " # Вывод результатов для текущей модели\n", + " print(f\"MSE (train): {class_models[model_name]['MSE_train']}\")\n", + " print(f\"MSE (test): {class_models[model_name]['MSE_test']}\")\n", + " print(f\"MAE (train): 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Parse the sale date; unparseable values become NaT and those rows are
# dropped so that the calendar features below are always defined.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df.dropna(subset=['date'], inplace=True)
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Predictors and target.
# NOTE(review): `price` and `price_category` are used to predict
# `average_price`; confirm the target is not derived from them, otherwise the
# reported error mostly reflects target leakage.
X = df[['yr_built', 'year', 'month', 'day', 'price', 'price_category']]
y = df['average_price']

# Hold out 20% of the rows for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well: with an unseeded estimator the selected
# hyperparameters and the reported MSE change from run to run.
model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search over the grid, 5-fold CV, scored by negative MSE.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

# best_score_ is the negated MSE, so flip the sign for reporting.
print("Best parameters:", grid_search.best_params_)
print("Best MSE:", -grid_search.best_score_)
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt


def _run_grid_search(param_grid, *, verbose=0):
    """Grid-search a seeded random forest on the training split.

    Both searches go through this helper so they share the same estimator
    seed, scoring and number of folds. Previously one search used cv=5 and the
    other cv=2, so their MSE values were not comparable.
    """
    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
        verbose=verbose,
    )
    return search.fit(X_train, y_train)


# 1. "Old" search space.
old_param_grid = {
    'n_estimators': [50, 100, 200],      # number of trees
    'max_depth': [None, 10, 20, 30],     # maximum tree depth
    'min_samples_split': [2, 10, 15]     # minimum samples required to split a node
}
old_grid_search = _run_grid_search(old_param_grid, verbose=2)

# 2. Best result of the old search (score is negated MSE, hence the sign flip).
old_best_params = old_grid_search.best_params_
old_best_mse = -old_grid_search.best_score_

# 3. "New" search space: a single fixed configuration.
new_param_grid = {
    'n_estimators': [200],
    'max_depth': [10],
    'min_samples_split': [10]
}
new_grid_search = _run_grid_search(new_param_grid)

# 4. Best result of the new search.
new_best_params = new_grid_search.best_params_
new_best_mse = -new_grid_search.best_score_

# 5. GridSearchCV (refit=True by default) has already refitted the best
#    configuration on the whole training split, so reuse that estimator
#    instead of training an identical forest a second time.
model_best = new_grid_search.best_estimator_

# Predict on the held-out test split and score it.
y_pred = model_best.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Старые параметры:", old_best_params)
print("Лучший результат (MSE) на старых параметрах:", old_best_mse)
print("\nНовые параметры:", new_best_params)
print("Лучший результат (MSE) на новых параметрах:", new_best_mse)
print("Среднеквадратическая ошибка (MSE) на тестовых данных:", mse)
print("Корень среднеквадратичной ошибки (RMSE) на тестовых данных:", rmse)

# Bar chart comparing the cross-validated MSE of the two searches.
plt.figure(figsize=(10, 5))
plt.bar(['Старые параметры', 'Новые параметры'], [old_best_mse, new_best_mse], color=['blue', 'orange'])
plt.xlabel('Подбор параметров')
plt.ylabel('Среднеквадратическая ошибка (MSE)')
plt.title('Сравнение MSE для старых и новых параметров')
plt.show()
Это говорит о том, что модель не переобучена и не недообучена — она хорошо обобщает на тестовой выборке, что является желаемым результатом. \n", + "2. Старые параметры дали наилучший результат, так что модель способна выдать высокую точность при настройке гиперпараметров. Снижение min_samples_split с 15 до 10 при том же max_depth = 10 не улучшило результат: MSE на кросс-валидации составила 0.1488 против 0.1473. \n", + "3. Попытка с новыми параметрами позволила оценить, как модель реагирует на изменения параметров. Этот процесс настройки параметров — часть процесса улучшения модели. \n", + "\n", + "Таким образом, можно сказать, что модель обучена хорошо, но возможны дальнейшие небольшие улучшения за счет оптимизации гиперпараметров." + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
# Actual test targets against the predictions of the two tuned forests.
# The "old parameters" series previously plotted `y_test_predict`, which is
# only assigned inside the `class_models` evaluation loop (the last model of
# that loop), not by the old grid search. Take the predictions from the
# refitted best estimator of `old_grid_search` instead.
y_pred_old = old_grid_search.best_estimator_.predict(X_test)

# Common x positions: one point per test sample.
sample_index = range(len(y_test))

plt.figure(figsize=(10, 5))
plt.scatter(sample_index, y_test, label="Актуалочка", color="black", alpha=0.5)
plt.scatter(sample_index, y_pred, label="Предсказанные(новые параметры)", color="blue", alpha=0.5)
plt.scatter(sample_index, y_pred_old, label="Предсказанные(старые параметры)", color="red", alpha=0.5)
plt.xlabel("Выборка")
plt.ylabel("Значения")
plt.legend()
plt.title("Актуалочка vs Предсказанных значений (Новые and Старые Параметры)")
plt.show()
+stack-data==0.6.3 +tornado==6.4.1 +traitlets==5.14.3 +tzdata==2024.1 +wcwidth==0.2.13