2024-11-15 16:44:23 +04:00
{
"cells": [
{
"cell_type": "code",
2024-11-27 17:22:10 +04:00
"execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import set_config\n",
"set_config(transform_output=\"pandas\")\n",
"\n",
"random_state = 42\n",
"\n",
"# Подключим датафрейм и выгрузим данные\n",
"df = pd.read_csv(\"data/house_data.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Устраняем выбросы в колонке цены и добавляем колонку с категориями цены"
]
},
{
"cell_type": "code",
"execution_count": 158,
2024-11-15 16:44:23 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
2024-11-27 17:22:10 +04:00
" <th>price_category</th>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21593</th>\n",
" <td>8672200110</td>\n",
" <td>20150317T000000</td>\n",
" <td>1088000.0</td>\n",
" <td>5</td>\n",
" <td>3.75</td>\n",
" <td>4170</td>\n",
" <td>8142</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>4170</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2006</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98056</td>\n",
" <td>47.5354</td>\n",
" <td>-122.181</td>\n",
" <td>3030</td>\n",
" <td>7980</td>\n",
" <td>very_high</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21594</th>\n",
" <td>5087900040</td>\n",
" <td>20141017T000000</td>\n",
" <td>350000.0</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>2500</td>\n",
" <td>5995</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2500</td>\n",
" <td>0</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98042</td>\n",
" <td>47.3749</td>\n",
" <td>-122.107</td>\n",
" <td>2530</td>\n",
" <td>5988</td>\n",
" <td>middle</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21595</th>\n",
" <td>1972201967</td>\n",
" <td>20141031T000000</td>\n",
" <td>520000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.25</td>\n",
" <td>1530</td>\n",
" <td>981</td>\n",
" <td>3.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1480</td>\n",
" <td>50</td>\n",
" <td>2006</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98103</td>\n",
" <td>47.6533</td>\n",
" <td>-122.346</td>\n",
" <td>1530</td>\n",
" <td>1282</td>\n",
" <td>middle</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21596</th>\n",
" <td>7502800100</td>\n",
" <td>20140813T000000</td>\n",
" <td>679950.0</td>\n",
" <td>5</td>\n",
" <td>2.75</td>\n",
" <td>3600</td>\n",
" <td>9437</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3600</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98059</td>\n",
" <td>47.4822</td>\n",
" <td>-122.131</td>\n",
" <td>3550</td>\n",
" <td>9421</td>\n",
" <td>high</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21597</th>\n",
" <td>191100405</td>\n",
" <td>20150421T000000</td>\n",
" <td>1575000.0</td>\n",
" <td>4</td>\n",
" <td>3.25</td>\n",
" <td>3410</td>\n",
" <td>10125</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3410</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2007</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98040</td>\n",
" <td>47.5653</td>\n",
" <td>-122.223</td>\n",
" <td>2290</td>\n",
" <td>10125</td>\n",
" <td>NaN</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21598</th>\n",
" <td>8956200760</td>\n",
" <td>20141013T000000</td>\n",
" <td>541800.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>3118</td>\n",
" <td>7866</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3118</td>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98001</td>\n",
" <td>47.2931</td>\n",
" <td>-122.264</td>\n",
" <td>2673</td>\n",
" <td>6500</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21599</th>\n",
" <td>7202300110</td>\n",
" <td>20140915T000000</td>\n",
" <td>810000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>3990</td>\n",
" <td>7838</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3990</td>\n",
" <td>0</td>\n",
" <td>2003</td>\n",
" <td>0</td>\n",
" <td>98053</td>\n",
" <td>47.6857</td>\n",
" <td>-122.046</td>\n",
" <td>3370</td>\n",
" <td>6814</td>\n",
" <td>high</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21600</th>\n",
" <td>249000205</td>\n",
" <td>20141015T000000</td>\n",
" <td>1537000.0</td>\n",
" <td>5</td>\n",
" <td>3.75</td>\n",
" <td>4470</td>\n",
" <td>8088</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>4470</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98004</td>\n",
" <td>47.6321</td>\n",
" <td>-122.200</td>\n",
" <td>2780</td>\n",
" <td>8964</td>\n",
" <td>NaN</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21601</th>\n",
" <td>5100403806</td>\n",
" <td>20150407T000000</td>\n",
" <td>467000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.50</td>\n",
" <td>1425</td>\n",
" <td>1179</td>\n",
" <td>3.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1425</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98125</td>\n",
" <td>47.6963</td>\n",
" <td>-122.318</td>\n",
" <td>1285</td>\n",
" <td>1253</td>\n",
" <td>middle</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21602</th>\n",
" <td>844000965</td>\n",
" <td>20140626T000000</td>\n",
" <td>224000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1500</td>\n",
" <td>11968</td>\n",
" <td>1.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1500</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2014</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98010</td>\n",
" <td>47.3095</td>\n",
" <td>-122.002</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1320</td>\n",
2024-11-27 17:22:10 +04:00
" <td>11303</td>\n",
" <td>low</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21603</th>\n",
" <td>7852140040</td>\n",
" <td>20140825T000000</td>\n",
" <td>507250.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.50</td>\n",
" <td>2270</td>\n",
" <td>5536</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2270</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2003</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98065</td>\n",
" <td>47.5389</td>\n",
" <td>-121.881</td>\n",
" <td>2270</td>\n",
" <td>5731</td>\n",
" <td>middle</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>21604</th>\n",
" <td>9834201367</td>\n",
" <td>20150126T000000</td>\n",
" <td>429000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.00</td>\n",
" <td>1490</td>\n",
" <td>1126</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1490</td>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5699</td>\n",
" <td>-122.288</td>\n",
" <td>1400</td>\n",
" <td>1230</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21605</th>\n",
" <td>3448900210</td>\n",
" <td>20141014T000000</td>\n",
" <td>610685.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2520</td>\n",
" <td>6023</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2520</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2014</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98056</td>\n",
" <td>47.5137</td>\n",
" <td>-122.167</td>\n",
" <td>2520</td>\n",
" <td>6023</td>\n",
" <td>high</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
2024-11-27 17:22:10 +04:00
" <tr>\n",
" <th>21606</th>\n",
" <td>7936000429</td>\n",
" <td>20150326T000000</td>\n",
" <td>1007500.0</td>\n",
" <td>4</td>\n",
" <td>3.50</td>\n",
" <td>3510</td>\n",
" <td>7200</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2600</td>\n",
" <td>910</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98136</td>\n",
" <td>47.5537</td>\n",
" <td>-122.398</td>\n",
" <td>2050</td>\n",
" <td>6200</td>\n",
" <td>very_high</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21607</th>\n",
" <td>2997800021</td>\n",
" <td>20150219T000000</td>\n",
" <td>475000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1310</td>\n",
" <td>1294</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1180</td>\n",
" <td>130</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98116</td>\n",
" <td>47.5773</td>\n",
" <td>-122.409</td>\n",
" <td>1330</td>\n",
" <td>1265</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21608</th>\n",
" <td>263000018</td>\n",
" <td>20140521T000000</td>\n",
" <td>360000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1530</td>\n",
" <td>1131</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1530</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98103</td>\n",
" <td>47.6993</td>\n",
" <td>-122.346</td>\n",
" <td>1530</td>\n",
" <td>1509</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21609</th>\n",
" <td>6600060120</td>\n",
" <td>20150223T000000</td>\n",
" <td>400000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2310</td>\n",
" <td>5813</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2310</td>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98146</td>\n",
" <td>47.5107</td>\n",
" <td>-122.362</td>\n",
" <td>1830</td>\n",
" <td>7200</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21610</th>\n",
" <td>1523300141</td>\n",
" <td>20140623T000000</td>\n",
" <td>402101.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1350</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1020</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5944</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>2007</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21611</th>\n",
" <td>291310100</td>\n",
" <td>20150116T000000</td>\n",
" <td>400000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1600</td>\n",
" <td>2388</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1600</td>\n",
" <td>0</td>\n",
" <td>2004</td>\n",
" <td>0</td>\n",
" <td>98027</td>\n",
" <td>47.5345</td>\n",
" <td>-122.069</td>\n",
" <td>1410</td>\n",
" <td>1287</td>\n",
" <td>middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21612</th>\n",
" <td>1523300157</td>\n",
" <td>20141015T000000</td>\n",
" <td>325000.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1076</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1020</td>\n",
" <td>0</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5941</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>1357</td>\n",
" <td>low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms \\\n",
"21593 8672200110 20150317T000000 1088000.0 5 3.75 \n",
"21594 5087900040 20141017T000000 350000.0 4 2.75 \n",
"21595 1972201967 20141031T000000 520000.0 2 2.25 \n",
"21596 7502800100 20140813T000000 679950.0 5 2.75 \n",
"21597 191100405 20150421T000000 1575000.0 4 3.25 \n",
"21598 8956200760 20141013T000000 541800.0 4 2.50 \n",
"21599 7202300110 20140915T000000 810000.0 4 3.00 \n",
"21600 249000205 20141015T000000 1537000.0 5 3.75 \n",
"21601 5100403806 20150407T000000 467000.0 3 2.50 \n",
"21602 844000965 20140626T000000 224000.0 3 1.75 \n",
"21603 7852140040 20140825T000000 507250.0 3 2.50 \n",
"21604 9834201367 20150126T000000 429000.0 3 2.00 \n",
"21605 3448900210 20141014T000000 610685.0 4 2.50 \n",
"21606 7936000429 20150326T000000 1007500.0 4 3.50 \n",
"21607 2997800021 20150219T000000 475000.0 3 2.50 \n",
"21608 263000018 20140521T000000 360000.0 3 2.50 \n",
"21609 6600060120 20150223T000000 400000.0 4 2.50 \n",
"21610 1523300141 20140623T000000 402101.0 2 0.75 \n",
"21611 291310100 20150116T000000 400000.0 3 2.50 \n",
"21612 1523300157 20141015T000000 325000.0 2 0.75 \n",
"\n",
" sqft_living sqft_lot floors waterfront view ... sqft_above \\\n",
"21593 4170 8142 2.0 0 2 ... 4170 \n",
"21594 2500 5995 2.0 0 0 ... 2500 \n",
"21595 1530 981 3.0 0 0 ... 1480 \n",
"21596 3600 9437 2.0 0 0 ... 3600 \n",
"21597 3410 10125 2.0 0 0 ... 3410 \n",
"21598 3118 7866 2.0 0 2 ... 3118 \n",
"21599 3990 7838 2.0 0 0 ... 3990 \n",
"21600 4470 8088 2.0 0 0 ... 4470 \n",
"21601 1425 1179 3.0 0 0 ... 1425 \n",
"21602 1500 11968 1.0 0 0 ... 1500 \n",
"21603 2270 5536 2.0 0 0 ... 2270 \n",
"21604 1490 1126 3.0 0 0 ... 1490 \n",
"21605 2520 6023 2.0 0 0 ... 2520 \n",
"21606 3510 7200 2.0 0 0 ... 2600 \n",
"21607 1310 1294 2.0 0 0 ... 1180 \n",
"21608 1530 1131 3.0 0 0 ... 1530 \n",
"21609 2310 5813 2.0 0 0 ... 2310 \n",
"21610 1020 1350 2.0 0 0 ... 1020 \n",
"21611 1600 2388 2.0 0 0 ... 1600 \n",
"21612 1020 1076 2.0 0 0 ... 1020 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"21593 0 2006 0 98056 47.5354 -122.181 \n",
"21594 0 2008 0 98042 47.3749 -122.107 \n",
"21595 50 2006 0 98103 47.6533 -122.346 \n",
"21596 0 2014 0 98059 47.4822 -122.131 \n",
"21597 0 2007 0 98040 47.5653 -122.223 \n",
"21598 0 2014 0 98001 47.2931 -122.264 \n",
"21599 0 2003 0 98053 47.6857 -122.046 \n",
"21600 0 2008 0 98004 47.6321 -122.200 \n",
"21601 0 2008 0 98125 47.6963 -122.318 \n",
"21602 0 2014 0 98010 47.3095 -122.002 \n",
"21603 0 2003 0 98065 47.5389 -121.881 \n",
"21604 0 2014 0 98144 47.5699 -122.288 \n",
"21605 0 2014 0 98056 47.5137 -122.167 \n",
"21606 910 2009 0 98136 47.5537 -122.398 \n",
"21607 130 2008 0 98116 47.5773 -122.409 \n",
"21608 0 2009 0 98103 47.6993 -122.346 \n",
"21609 0 2014 0 98146 47.5107 -122.362 \n",
"21610 0 2009 0 98144 47.5944 -122.299 \n",
"21611 0 2004 0 98027 47.5345 -122.069 \n",
"21612 0 2008 0 98144 47.5941 -122.299 \n",
"\n",
" sqft_living15 sqft_lot15 price_category \n",
"21593 3030 7980 very_high \n",
"21594 2530 5988 middle \n",
"21595 1530 1282 middle \n",
"21596 3550 9421 high \n",
"21597 2290 10125 NaN \n",
"21598 2673 6500 middle \n",
"21599 3370 6814 high \n",
"21600 2780 8964 NaN \n",
"21601 1285 1253 middle \n",
"21602 1320 11303 low \n",
"21603 2270 5731 middle \n",
"21604 1400 1230 middle \n",
"21605 2520 6023 high \n",
"21606 2050 6200 very_high \n",
"21607 1330 1265 middle \n",
"21608 1530 1509 middle \n",
"21609 1830 7200 middle \n",
"21610 1020 2007 middle \n",
"21611 1410 1287 middle \n",
"21612 1020 1357 low \n",
"\n",
"[20 rows x 22 columns]"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"# Добавляем столбец с категорями цены\n",
"df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n",
"df.tail(20)\n"
2024-11-15 16:44:23 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-27 17:22:10 +04:00
"### Бизнес-цели\n",
"1. Задача регрессии – предсказание цены дома (price). Это может помочь риэлторам и аналитикам определить справедливую рыночную стоимость недвижимости.\n",
"\n",
"2. Задача классификации – определение вероятности того, что цена дома будет выше/ниже медианы рынка. Классифицировать дома по ценовым категориям (например, низкая, средняя, высокая цена). Это может помочь определить, какие дома популярны у покупателей.\n",
"\n",
"### Определение достижимого уровня качества модели для задачи классификации\n",
"#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации (Целевой признак - median_price)"
2024-11-15 16:44:23 +04:00
]
},
{
"cell_type": "code",
2024-11-27 17:22:10 +04:00
"execution_count": 159,
2024-11-15 16:44:23 +04:00
"metadata": {},
"outputs": [
2024-11-27 17:22:10 +04:00
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
2024-11-15 16:44:23 +04:00
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
2024-11-27 17:22:10 +04:00
" <th>median_price</th>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>20962</th>\n",
" <td>1278000210</td>\n",
" <td>20150311T000000</td>\n",
" <td>110000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2</td>\n",
" <td>1.00</td>\n",
2024-11-27 17:22:10 +04:00
" <td>828</td>\n",
" <td>4524</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1968</td>\n",
" <td>2007</td>\n",
" <td>98001</td>\n",
" <td>47.2655</td>\n",
" <td>-122.244</td>\n",
" <td>828</td>\n",
" <td>5402</td>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>12284</th>\n",
" <td>2193300390</td>\n",
" <td>20140923T000000</td>\n",
" <td>624000.0</td>\n",
" <td>4</td>\n",
" <td>3.25</td>\n",
" <td>2810</td>\n",
" <td>11250</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1130</td>\n",
" <td>1980</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98052</td>\n",
" <td>47.6920</td>\n",
" <td>-122.099</td>\n",
" <td>2110</td>\n",
" <td>11250</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>7343</th>\n",
" <td>4289900005</td>\n",
" <td>20141230T000000</td>\n",
" <td>1535000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3.25</td>\n",
" <td>2850</td>\n",
" <td>4100</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.0</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1030</td>\n",
" <td>1908</td>\n",
" <td>2003</td>\n",
" <td>98122</td>\n",
" <td>47.6147</td>\n",
" <td>-122.285</td>\n",
" <td>2130</td>\n",
" <td>4200</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>14247</th>\n",
" <td>316000145</td>\n",
" <td>20150325T000000</td>\n",
" <td>235000.0</td>\n",
" <td>4</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.00</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1360</td>\n",
" <td>7132</td>\n",
" <td>1.5</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1941</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98168</td>\n",
" <td>47.5054</td>\n",
" <td>-122.301</td>\n",
" <td>1280</td>\n",
" <td>7175</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>16670</th>\n",
" <td>629400480</td>\n",
" <td>20140619T000000</td>\n",
" <td>775000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.75</td>\n",
" <td>3010</td>\n",
" <td>15992</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1996</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98075</td>\n",
" <td>47.5895</td>\n",
" <td>-121.994</td>\n",
" <td>3330</td>\n",
" <td>12333</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>88</th>\n",
" <td>1332700270</td>\n",
" <td>20140519T000000</td>\n",
" <td>215000.0</td>\n",
" <td>2</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.25</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1610</td>\n",
" <td>2040</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1979</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98056</td>\n",
" <td>47.5180</td>\n",
" <td>-122.194</td>\n",
" <td>1950</td>\n",
" <td>2025</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>15031</th>\n",
" <td>7129303070</td>\n",
" <td>20140820T000000</td>\n",
" <td>735000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.75</td>\n",
" <td>3040</td>\n",
" <td>2415</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1</td>\n",
" <td>4</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1966</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98118</td>\n",
" <td>47.5188</td>\n",
" <td>-122.256</td>\n",
" <td>2620</td>\n",
" <td>2433</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>5234</th>\n",
" <td>2432000130</td>\n",
" <td>20150414T000000</td>\n",
" <td>675000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
" <td>1.75</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1660</td>\n",
" <td>9549</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1956</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98033</td>\n",
" <td>47.6503</td>\n",
" <td>-122.198</td>\n",
" <td>2090</td>\n",
" <td>9549</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>19980</th>\n",
" <td>774100475</td>\n",
" <td>20140627T000000</td>\n",
" <td>415000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.75</td>\n",
" <td>2600</td>\n",
" <td>64626</td>\n",
" <td>1.5</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98014</td>\n",
" <td>47.7185</td>\n",
" <td>-121.405</td>\n",
" <td>1740</td>\n",
" <td>64626</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>3671</th>\n",
" <td>8847400115</td>\n",
" <td>20140723T000000</td>\n",
" <td>590000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.00</td>\n",
" <td>2420</td>\n",
" <td>208652</td>\n",
" <td>1.5</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2005</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98010</td>\n",
" <td>47.3666</td>\n",
" <td>-121.978</td>\n",
" <td>3180</td>\n",
" <td>212137</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17290 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms \\\n",
"20962 1278000210 20150311T000000 110000.0 2 1.00 \n",
"12284 2193300390 20140923T000000 624000.0 4 3.25 \n",
"7343 4289900005 20141230T000000 1535000.0 4 3.25 \n",
"14247 316000145 20150325T000000 235000.0 4 1.00 \n",
"16670 629400480 20140619T000000 775000.0 4 2.75 \n",
"... ... ... ... ... ... \n",
"88 1332700270 20140519T000000 215000.0 2 2.25 \n",
"15031 7129303070 20140820T000000 735000.0 4 2.75 \n",
"5234 2432000130 20150414T000000 675000.0 3 1.75 \n",
"19980 774100475 20140627T000000 415000.0 3 2.75 \n",
"3671 8847400115 20140723T000000 590000.0 3 2.00 \n",
"\n",
" sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n",
"20962 828 4524 1.0 0 0 ... 0 \n",
"12284 2810 11250 1.0 0 0 ... 1130 \n",
"7343 2850 4100 2.0 0 3 ... 1030 \n",
"14247 1360 7132 1.5 0 0 ... 0 \n",
"16670 3010 15992 2.0 0 0 ... 0 \n",
"... ... ... ... ... ... ... ... \n",
"88 1610 2040 2.0 0 0 ... 0 \n",
"15031 3040 2415 2.0 1 4 ... 0 \n",
"5234 1660 9549 1.0 0 0 ... 0 \n",
"19980 2600 64626 1.5 0 0 ... 0 \n",
"3671 2420 208652 1.5 0 0 ... 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"20962 1968 2007 98001 47.2655 -122.244 828 \n",
"12284 1980 0 98052 47.6920 -122.099 2110 \n",
"7343 1908 2003 98122 47.6147 -122.285 2130 \n",
"14247 1941 0 98168 47.5054 -122.301 1280 \n",
"16670 1996 0 98075 47.5895 -121.994 3330 \n",
"... ... ... ... ... ... ... \n",
"88 1979 0 98056 47.5180 -122.194 1950 \n",
"15031 1966 0 98118 47.5188 -122.256 2620 \n",
"5234 1956 0 98033 47.6503 -122.198 2090 \n",
"19980 2009 0 98014 47.7185 -121.405 1740 \n",
"3671 2005 0 98010 47.3666 -121.978 3180 \n",
"\n",
" sqft_lot15 price_category median_price \n",
"20962 5402 0 0 \n",
"12284 11250 1 1 \n",
"7343 4200 2 1 \n",
"14247 7175 0 0 \n",
"16670 12333 2 1 \n",
"... ... ... ... \n",
"88 2025 0 0 \n",
"15031 2433 2 1 \n",
"5234 9549 1 1 \n",
"19980 64626 1 0 \n",
"3671 212137 1 1 \n",
"\n",
"[17290 rows x 23 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>median_price</th>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
2024-11-27 17:22:10 +04:00
" </thead>\n",
" <tbody>\n",
2024-11-15 16:44:23 +04:00
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>20962</th>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>12284</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>7343</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>14247</th>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>16670</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>...</th>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>88</th>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" </tr>\n",
" <tr>\n",
" <th>15031</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5234</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19980</th>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" </tr>\n",
" <tr>\n",
" <th>3671</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-11-27 17:22:10 +04:00
"<p>17290 rows × 1 columns</p>\n",
2024-11-15 16:44:23 +04:00
"</div>"
],
"text/plain": [
2024-11-27 17:22:10 +04:00
" median_price\n",
"20962 0\n",
"12284 1\n",
"7343 1\n",
"14247 0\n",
"16670 1\n",
"... ...\n",
"88 0\n",
"15031 1\n",
"5234 1\n",
"19980 0\n",
"3671 1\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"[17290 rows x 1 columns]"
2024-11-15 16:44:23 +04:00
]
},
"metadata": {},
2024-11-27 17:22:10 +04:00
"output_type": "display_data"
},
2024-11-15 16:44:23 +04:00
{
"data": {
"text/plain": [
2024-11-27 17:22:10 +04:00
"'X_test'"
2024-11-15 16:44:23 +04:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
2024-11-27 17:22:10 +04:00
" <th>median_price</th>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>11592</th>\n",
" <td>2028701000</td>\n",
" <td>20140529T000000</td>\n",
" <td>635200.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1.75</td>\n",
" <td>1640</td>\n",
" <td>4240</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>720</td>\n",
" <td>1921</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98117</td>\n",
" <td>47.6766</td>\n",
" <td>-122.368</td>\n",
" <td>1300</td>\n",
" <td>4240</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>8984</th>\n",
" <td>9406500530</td>\n",
" <td>20140912T000000</td>\n",
" <td>249000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.00</td>\n",
" <td>1090</td>\n",
" <td>1357</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1990</td>\n",
" <td>0</td>\n",
" <td>98028</td>\n",
" <td>47.7526</td>\n",
" <td>-122.244</td>\n",
" <td>1078</td>\n",
" <td>1318</td>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>8280</th>\n",
" <td>8097000330</td>\n",
" <td>20140721T000000</td>\n",
" <td>359950.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.75</td>\n",
" <td>2540</td>\n",
" <td>8604</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1991</td>\n",
" <td>0</td>\n",
" <td>98092</td>\n",
" <td>47.3209</td>\n",
" <td>-122.185</td>\n",
" <td>2260</td>\n",
" <td>7438</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>792</th>\n",
" <td>8081020370</td>\n",
" <td>20140709T000000</td>\n",
" <td>1355000.0</td>\n",
" <td>4</td>\n",
" <td>3.50</td>\n",
" <td>3550</td>\n",
" <td>11000</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1290</td>\n",
" <td>1999</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98006</td>\n",
" <td>47.5506</td>\n",
" <td>-122.134</td>\n",
" <td>4100</td>\n",
" <td>10012</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>10371</th>\n",
" <td>7518507580</td>\n",
" <td>20150502T000000</td>\n",
" <td>581000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>1170</td>\n",
" <td>4080</td>\n",
" <td>1.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1909</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98117</td>\n",
" <td>47.6784</td>\n",
" <td>-122.386</td>\n",
" <td>1560</td>\n",
" <td>4586</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>16733</th>\n",
" <td>7212650950</td>\n",
" <td>20140708T000000</td>\n",
" <td>336000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
" <td>2.50</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2530</td>\n",
" <td>8169</td>\n",
2024-11-15 16:44:23 +04:00
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1993</td>\n",
" <td>0</td>\n",
" <td>98003</td>\n",
" <td>47.2634</td>\n",
" <td>-122.312</td>\n",
" <td>2220</td>\n",
" <td>8013</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>13151</th>\n",
" <td>4365200620</td>\n",
" <td>20150312T000000</td>\n",
" <td>394000.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1450</td>\n",
" <td>7930</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>300</td>\n",
" <td>1923</td>\n",
" <td>0</td>\n",
" <td>98126</td>\n",
" <td>47.5212</td>\n",
" <td>-122.371</td>\n",
" <td>1040</td>\n",
" <td>7740</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>11667</th>\n",
" <td>4083304355</td>\n",
" <td>20150318T000000</td>\n",
" <td>675000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
" <td>1.75</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1530</td>\n",
" <td>3615</td>\n",
" <td>1.5</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1913</td>\n",
" <td>0</td>\n",
" <td>98103</td>\n",
" <td>47.6529</td>\n",
" <td>-122.334</td>\n",
" <td>1650</td>\n",
" <td>4200</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>3683</th>\n",
" <td>2891100820</td>\n",
" <td>20140825T000000</td>\n",
" <td>213500.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1.00</td>\n",
" <td>1220</td>\n",
" <td>6000</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1968</td>\n",
" <td>0</td>\n",
" <td>98002</td>\n",
" <td>47.3245</td>\n",
" <td>-122.209</td>\n",
" <td>1420</td>\n",
" <td>6000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12059</th>\n",
" <td>952000640</td>\n",
" <td>20141027T000000</td>\n",
" <td>715000.0</td>\n",
" <td>3</td>\n",
" <td>1.50</td>\n",
" <td>1670</td>\n",
" <td>5060</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1925</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98126</td>\n",
" <td>47.5671</td>\n",
" <td>-122.379</td>\n",
" <td>1670</td>\n",
" <td>5118</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-11-27 17:22:10 +04:00
"<p>4323 rows × 23 columns</p>\n",
2024-11-15 16:44:23 +04:00
"</div>"
],
"text/plain": [
2024-11-27 17:22:10 +04:00
" id date price bedrooms bathrooms \\\n",
"11592 2028701000 20140529T000000 635200.0 4 1.75 \n",
"8984 9406500530 20140912T000000 249000.0 2 2.00 \n",
"8280 8097000330 20140721T000000 359950.0 3 2.75 \n",
"792 8081020370 20140709T000000 1355000.0 4 3.50 \n",
"10371 7518507580 20150502T000000 581000.0 2 1.00 \n",
"... ... ... ... ... ... \n",
"16733 7212650950 20140708T000000 336000.0 4 2.50 \n",
"13151 4365200620 20150312T000000 394000.0 3 1.00 \n",
"11667 4083304355 20150318T000000 675000.0 4 1.75 \n",
"3683 2891100820 20140825T000000 213500.0 3 1.00 \n",
"12059 952000640 20141027T000000 715000.0 3 1.50 \n",
"\n",
" sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n",
"11592 1640 4240 1.0 0 0 ... 720 \n",
"8984 1090 1357 2.0 0 0 ... 0 \n",
"8280 2540 8604 2.0 0 0 ... 0 \n",
"792 3550 11000 1.0 0 2 ... 1290 \n",
"10371 1170 4080 1.0 0 0 ... 0 \n",
"... ... ... ... ... ... ... ... \n",
"16733 2530 8169 2.0 0 0 ... 0 \n",
"13151 1450 7930 1.0 0 0 ... 300 \n",
"11667 1530 3615 1.5 0 0 ... 0 \n",
"3683 1220 6000 1.0 0 0 ... 0 \n",
"12059 1670 5060 2.0 0 2 ... 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"11592 1921 0 98117 47.6766 -122.368 1300 \n",
"8984 1990 0 98028 47.7526 -122.244 1078 \n",
"8280 1991 0 98092 47.3209 -122.185 2260 \n",
"792 1999 0 98006 47.5506 -122.134 4100 \n",
"10371 1909 0 98117 47.6784 -122.386 1560 \n",
"... ... ... ... ... ... ... \n",
"16733 1993 0 98003 47.2634 -122.312 2220 \n",
"13151 1923 0 98126 47.5212 -122.371 1040 \n",
"11667 1913 0 98103 47.6529 -122.334 1650 \n",
"3683 1968 0 98002 47.3245 -122.209 1420 \n",
"12059 1925 0 98126 47.5671 -122.379 1670 \n",
"\n",
" sqft_lot15 price_category median_price \n",
"11592 4240 1 1 \n",
"8984 1318 0 0 \n",
"8280 7438 1 0 \n",
"792 10012 2 1 \n",
"10371 4586 1 1 \n",
"... ... ... ... \n",
"16733 8013 1 0 \n",
"13151 7740 1 0 \n",
"11667 4200 1 1 \n",
"3683 6000 0 0 \n",
"12059 5118 2 1 \n",
"\n",
"[4323 rows x 23 columns]"
2024-11-15 16:44:23 +04:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
2024-11-27 17:22:10 +04:00
"'y_test'"
2024-11-15 16:44:23 +04:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-11-27 17:22:10 +04:00
" <th>median_price</th>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>11592</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>8984</th>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>8280</th>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>792</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>10371</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>16733</th>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>13151</th>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>11667</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>3683</th>\n",
" <td>0</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>12059</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-11-27 17:22:10 +04:00
"<p>4323 rows × 1 columns</p>\n",
2024-11-15 16:44:23 +04:00
"</div>"
],
"text/plain": [
2024-11-27 17:22:10 +04:00
" median_price\n",
"11592 1\n",
"8984 0\n",
"8280 0\n",
"792 1\n",
"10371 1\n",
2024-11-15 16:44:23 +04:00
"... ...\n",
2024-11-27 17:22:10 +04:00
"16733 0\n",
"13151 0\n",
"11667 1\n",
"3683 0\n",
"12059 1\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"[4323 rows x 1 columns]"
2024-11-15 16:44:23 +04:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
2024-11-27 17:22:10 +04:00
"name": "stdout",
"output_type": "stream",
"text": [
"id int64\n",
"date object\n",
"price float64\n",
"bedrooms int64\n",
"bathrooms float64\n",
"sqft_living int64\n",
"sqft_lot int64\n",
"floors float64\n",
"waterfront int64\n",
"view int64\n",
"condition int64\n",
"grade int64\n",
"sqft_above int64\n",
"sqft_basement int64\n",
"yr_built int64\n",
"yr_renovated int64\n",
"zipcode int64\n",
"lat float64\n",
"long float64\n",
"sqft_living15 int64\n",
"sqft_lot15 int64\n",
"price_category category\n",
"median_price int64\n",
"dtype: object\n"
]
}
],
"source": [
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Создание целевого признака\n",
"median_price = df['price'].median()\n",
"df['median_price'] = np.where(df['price'] > median_price, 1, 0)\n",
"\n",
"# Разделение на признаки и целевую переменную\n",
"X = df.drop(columns=['id', 'date', 'price', 'median_price'])\n",
"y = df['median_price']\n",
"\n",
"# Примерная категоризация\n",
"df['price_category'] = pd.cut(df['price'], bins=[0, 300000, 700000, np.inf], labels=[0, 1, 2])\n",
"\n",
"# Выбор признаков и целевых переменных\n",
"X = df.drop(columns=['id', 'date', 'price', 'price_category'])\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
" \n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
" \n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
" \n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" if frac_val <= 0:\n",
" assert len(df_input) == len(df_train) + len(df_temp)\n",
" return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
"\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
" return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
" df, stratify_colname=\"median_price\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
"display(\"y_train\", y_train)\n",
"\n",
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)\n",
"\n",
"print(df.dtypes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Формирование конвейера\n",
"preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
"\n",
"preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
"\n",
"features_preprocessing -- трансформер для предобработки признаков\n",
"\n",
"drop_columns -- трансформер для удаления колонок\n",
"\n",
"pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"\n",
"pipeline_end = StandardScaler()\n",
"\n",
"\n",
"class HouseFeatures(BaseEstimator, TransformerMixin):\n",
" def __init__(self):\n",
" pass\n",
" def fit(self, X, y=None):\n",
" return self\n",
" def transform(self, X, y=None):\n",
" # Создание новых признаков\n",
" X = X.copy()\n",
" X[\"Living_area_to_Lot_ratio\"] = X[\"sqft_living\"] / X[\"sqft_lot\"]\n",
" return X\n",
" def get_feature_names_out(self, features_in):\n",
" # Добавление имен новых признаков\n",
" new_features = [\"Living_area_to_Lot_ratio\"]\n",
" return np.append(features_in, new_features, axis=0)\n",
"\n",
"#Предобработка числовых значений. Заполнение пустых значений на медиану.\n",
"preprocessing_num_class = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')),\n",
" ('scaler', StandardScaler())\n",
"])\n",
"\n",
"#Предобработка категориальных значений\n",
"preprocessing_cat_class = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='most_frequent')),\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
"])\n",
"\n",
"columns_to_drop = [\"date\"]\n",
"numeric_columns = [\"sqft_living\", \"sqft_lot\", \"median_price\"]\n",
"cat_columns = []\n",
"\n",
"features_preprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"prepocessing_num\", preprocessing_num_class, numeric_columns),\n",
" (\"prepocessing_cat\", preprocessing_cat_class, cat_columns),\n",
" ],\n",
" remainder=\"passthrough\"\n",
")\n",
"\n",
"drop_columns = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
"pipeline_end = Pipeline(\n",
" [\n",
" (\"features_preprocessing\", features_preprocessing),\n",
" (\"custom_features\", HouseFeatures()),\n",
" (\"drop_columns\", drop_columns),\n",
" ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Пример работы конвейера."
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
2024-11-15 16:44:23 +04:00
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-11-27 17:22:10 +04:00
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>median_price</th>\n",
2024-11-15 16:44:23 +04:00
" <th>id</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
2024-11-27 17:22:10 +04:00
" <th>Living_area_to_Lot_ratio</th>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>20962</th>\n",
" <td>-1.360742</td>\n",
" <td>-0.262132</td>\n",
" <td>-0.994693</td>\n",
" <td>1278000210</td>\n",
" <td>110000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1968</td>\n",
" <td>2007</td>\n",
" <td>98001</td>\n",
" <td>47.2655</td>\n",
" <td>-122.244</td>\n",
" <td>828</td>\n",
" <td>5402</td>\n",
" <td>0</td>\n",
" <td>5.191063</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>12284</th>\n",
" <td>0.794390</td>\n",
" <td>-0.094121</td>\n",
" <td>1.005335</td>\n",
" <td>2193300390</td>\n",
" <td>624000.0</td>\n",
" <td>4</td>\n",
" <td>3.25</td>\n",
" <td>1.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1130</td>\n",
" <td>1980</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98052</td>\n",
" <td>47.6920</td>\n",
" <td>-122.099</td>\n",
" <td>2110</td>\n",
" <td>11250</td>\n",
" <td>1</td>\n",
" <td>-8.440052</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>7343</th>\n",
" <td>0.837884</td>\n",
" <td>-0.272723</td>\n",
" <td>1.005335</td>\n",
" <td>4289900005</td>\n",
" <td>1535000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3.25</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>3</td>\n",
2024-11-15 16:44:23 +04:00
" <td>...</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1030</td>\n",
" <td>1908</td>\n",
" <td>2003</td>\n",
" <td>98122</td>\n",
" <td>47.6147</td>\n",
" <td>-122.285</td>\n",
" <td>2130</td>\n",
" <td>4200</td>\n",
" <td>2</td>\n",
" <td>-3.072292</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>14247</th>\n",
" <td>-0.782270</td>\n",
" <td>-0.196986</td>\n",
" <td>-0.994693</td>\n",
" <td>316000145</td>\n",
" <td>235000.0</td>\n",
" <td>4</td>\n",
2024-11-15 16:44:23 +04:00
" <td>1.00</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1.5</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1941</td>\n",
" <td>0</td>\n",
" <td>98168</td>\n",
" <td>47.5054</td>\n",
" <td>-122.301</td>\n",
" <td>1280</td>\n",
" <td>7175</td>\n",
" <td>0</td>\n",
" <td>3.971201</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>16670</th>\n",
" <td>1.011860</td>\n",
" <td>0.024330</td>\n",
" <td>1.005335</td>\n",
" <td>629400480</td>\n",
" <td>775000.0</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1996</td>\n",
" <td>0</td>\n",
" <td>98075</td>\n",
" <td>47.5895</td>\n",
" <td>-121.994</td>\n",
" <td>3330</td>\n",
" <td>12333</td>\n",
" <td>2</td>\n",
" <td>41.589045</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>88</th>\n",
" <td>-0.510432</td>\n",
" <td>-0.324180</td>\n",
" <td>-0.994693</td>\n",
" <td>1332700270</td>\n",
" <td>215000.0</td>\n",
" <td>2</td>\n",
" <td>2.25</td>\n",
" <td>2.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1979</td>\n",
" <td>0</td>\n",
" <td>98056</td>\n",
" <td>47.5180</td>\n",
" <td>-122.194</td>\n",
" <td>1950</td>\n",
" <td>2025</td>\n",
" <td>0</td>\n",
" <td>1.574534</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>15031</th>\n",
" <td>1.044481</td>\n",
" <td>-0.314813</td>\n",
" <td>1.005335</td>\n",
" <td>7129303070</td>\n",
" <td>735000.0</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:23 +04:00
" <td>4</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1966</td>\n",
" <td>0</td>\n",
" <td>98118</td>\n",
" <td>47.5188</td>\n",
" <td>-122.256</td>\n",
" <td>2620</td>\n",
" <td>2433</td>\n",
" <td>2</td>\n",
" <td>-3.317784</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>5234</th>\n",
" <td>-0.456065</td>\n",
" <td>-0.136611</td>\n",
" <td>1.005335</td>\n",
" <td>2432000130</td>\n",
" <td>675000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1.75</td>\n",
" <td>1.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>1956</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98033</td>\n",
" <td>47.6503</td>\n",
" <td>-122.198</td>\n",
" <td>2090</td>\n",
" <td>9549</td>\n",
" <td>1</td>\n",
" <td>3.338418</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>19980</th>\n",
" <td>0.566046</td>\n",
" <td>1.239169</td>\n",
" <td>-0.994693</td>\n",
" <td>774100475</td>\n",
" <td>415000.0</td>\n",
2024-11-15 16:44:23 +04:00
" <td>3</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2.75</td>\n",
" <td>1.5</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2009</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>98014</td>\n",
" <td>47.7185</td>\n",
" <td>-121.405</td>\n",
" <td>1740</td>\n",
" <td>64626</td>\n",
" <td>1</td>\n",
" <td>0.456795</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>3671</th>\n",
" <td>0.370323</td>\n",
" <td>4.836825</td>\n",
" <td>1.005335</td>\n",
" <td>8847400115</td>\n",
" <td>590000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1.5</td>\n",
2024-11-15 16:44:23 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>2005</td>\n",
" <td>0</td>\n",
" <td>98010</td>\n",
" <td>47.3666</td>\n",
" <td>-121.978</td>\n",
" <td>3180</td>\n",
" <td>212137</td>\n",
" <td>1</td>\n",
" <td>0.076563</td>\n",
2024-11-15 16:44:23 +04:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-11-27 17:22:10 +04:00
"<p>17290 rows × 23 columns</p>\n",
2024-11-15 16:44:23 +04:00
"</div>"
],
"text/plain": [
2024-11-27 17:22:10 +04:00
" sqft_living sqft_lot median_price id price bedrooms \\\n",
"20962 -1.360742 -0.262132 -0.994693 1278000210 110000.0 2 \n",
"12284 0.794390 -0.094121 1.005335 2193300390 624000.0 4 \n",
"7343 0.837884 -0.272723 1.005335 4289900005 1535000.0 4 \n",
"14247 -0.782270 -0.196986 -0.994693 316000145 235000.0 4 \n",
"16670 1.011860 0.024330 1.005335 629400480 775000.0 4 \n",
"... ... ... ... ... ... ... \n",
"88 -0.510432 -0.324180 -0.994693 1332700270 215000.0 2 \n",
"15031 1.044481 -0.314813 1.005335 7129303070 735000.0 4 \n",
"5234 -0.456065 -0.136611 1.005335 2432000130 675000.0 3 \n",
"19980 0.566046 1.239169 -0.994693 774100475 415000.0 3 \n",
"3671 0.370323 4.836825 1.005335 8847400115 590000.0 3 \n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
" bathrooms floors waterfront view ... sqft_basement yr_built \\\n",
"20962 1.00 1.0 0 0 ... 0 1968 \n",
"12284 3.25 1.0 0 0 ... 1130 1980 \n",
"7343 3.25 2.0 0 3 ... 1030 1908 \n",
"14247 1.00 1.5 0 0 ... 0 1941 \n",
"16670 2.75 2.0 0 0 ... 0 1996 \n",
"... ... ... ... ... ... ... ... \n",
"88 2.25 2.0 0 0 ... 0 1979 \n",
"15031 2.75 2.0 1 4 ... 0 1966 \n",
"5234 1.75 1.0 0 0 ... 0 1956 \n",
"19980 2.75 1.5 0 0 ... 0 2009 \n",
"3671 2.00 1.5 0 0 ... 0 2005 \n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
" yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n",
"20962 2007 98001 47.2655 -122.244 828 5402 \n",
"12284 0 98052 47.6920 -122.099 2110 11250 \n",
"7343 2003 98122 47.6147 -122.285 2130 4200 \n",
"14247 0 98168 47.5054 -122.301 1280 7175 \n",
"16670 0 98075 47.5895 -121.994 3330 12333 \n",
"... ... ... ... ... ... ... \n",
"88 0 98056 47.5180 -122.194 1950 2025 \n",
"15031 0 98118 47.5188 -122.256 2620 2433 \n",
"5234 0 98033 47.6503 -122.198 2090 9549 \n",
"19980 0 98014 47.7185 -121.405 1740 64626 \n",
"3671 0 98010 47.3666 -121.978 3180 212137 \n",
"\n",
" price_category Living_area_to_Lot_ratio \n",
"20962 0 5.191063 \n",
"12284 1 -8.440052 \n",
"7343 2 -3.072292 \n",
"14247 0 3.971201 \n",
"16670 2 41.589045 \n",
"... ... ... \n",
"88 0 1.574534 \n",
"15031 2 -3.317784 \n",
"5234 1 3.338418 \n",
"19980 1 0.456795 \n",
"3671 1 0.076563 \n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"[17290 rows x 23 columns]"
2024-11-15 16:44:23 +04:00
]
},
2024-11-27 17:22:10 +04:00
"execution_count": 161,
2024-11-15 16:44:23 +04:00
"metadata": {},
2024-11-27 17:22:10 +04:00
"output_type": "execute_result"
2024-11-15 16:44:23 +04:00
}
],
"source": [
2024-11-27 17:22:10 +04:00
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"preprocessed_df = pd.DataFrame(\n",
" preprocessing_result,\n",
" columns=pipeline_end.get_feature_names_out(),\n",
2024-11-15 16:44:23 +04:00
")\n",
"\n",
2024-11-27 17:22:10 +04:00
"preprocessed_df"
2024-11-15 16:44:23 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-27 17:22:10 +04:00
"### Формирование набора моделей для классификации¶\n",
"logistic -- логистическая регрессия\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"ridge -- гребневая регрессия\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"decision_tree -- дерево решений\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"knn -- k-ближайших соседей\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"naive_bayes -- наивный Байесовский классификатор\n",
"\n",
"gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
"\n",
"random_forest -- метод случайного леса (набор деревьев решений)\n",
"\n",
"mlp -- многослойный персептрон (нейронная сеть)"
2024-11-15 16:44:23 +04:00
]
},
{
"cell_type": "code",
2024-11-27 17:22:10 +04:00
"execution_count": 162,
2024-11-15 16:44:23 +04:00
"metadata": {},
2024-11-27 17:22:10 +04:00
"outputs": [],
"source": [
"from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree, svm\n",
"\n",
"class_models = {\n",
" \"logistic\": {\"model\": linear_model.LogisticRegression(max_iter=150)},\n",
" \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n",
" \"ridge\": {\"model\": linear_model.LogisticRegression(max_iter=150, solver='lbfgs', penalty=\"l2\", class_weight=\"balanced\")},\n",
" \"decision_tree\": {\n",
" \"model\": tree.DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=random_state)\n",
" },\n",
"\n",
" \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
" \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
" \"gradient_boosting\": {\n",
" \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n",
" },\n",
"\n",
" \"random_forest\": {\n",
" \"model\": ensemble.RandomForestClassifier(\n",
" max_depth=5, class_weight=\"balanced\", random_state=random_state\n",
" )\n",
" },\n",
"\n",
" \"mlp\": {\n",
" \"model\": neural_network.MLPClassifier(\n",
" hidden_layer_sizes=(7,),\n",
" max_iter=200,\n",
" early_stopping=True,\n",
" random_state=random_state,\n",
" )\n",
" },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Обучение моделей на обучающем наборе данных и оценка на тестовом"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: logistic\n",
"Model: ridge\n",
"Model: decision_tree\n",
"Model: knn\n",
"Model: naive_bayes\n",
"Model: gradient_boosting\n",
"Model: random_forest\n",
"Model: mlp\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn import metrics\n",
"\n",
"for model_name in class_models.keys():\n",
" print(f\"Model: {model_name}\")\n",
" model = class_models[model_name][\"model\"]\n",
"\n",
" model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
" model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
"\n",
" y_train_predict = model_pipeline.predict(X_train)\n",
" y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
" y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
"\n",
" class_models[model_name][\"pipeline\"] = model_pipeline\n",
" class_models[model_name][\"probs\"] = y_test_probs\n",
" class_models[model_name][\"preds\"] = y_test_predict\n",
"\n",
" class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
" y_train, y_train_predict, zero_division=1\n",
" )\n",
" class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
" y_test, y_test_predict, zero_division=1\n",
" )\n",
" class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
" y_train, y_train_predict\n",
" )\n",
" class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
" y_train, y_train_predict\n",
" )\n",
" class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
" y_test, y_test_probs\n",
" )\n",
" class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
" class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
" class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
" y_test, y_test_predict\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Сводная таблица оценок качества для использованных моделей классификации. Матрица неточностей"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0cAAAQ9CAYAAACSpDaqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeVwU5eMH8M8ssIDcqFyCeKAo3mIZeSeBZB5p+fNKUdTyq+aRZ6aClZiWZx5peX3T1C4rzYNUxJQsDzwISRRvDhUBQYFld35/8GVsAxYWFheYz/v7mtfXnedh5pk15uMzz8wzgiiKIoiIiIiIiGROYewGEBERERERVQXsHBEREREREYGdIyIiIiIiIgDsHBEREREREQFg54iIiIiIiAgAO0dEREREREQA2DkiIiIiIiICwM4RERERERERAHaOiIiIiIiIALBzROW0ZcsWCIKA69evV8r2r1+/DkEQsGXLFoNsLzIyEoIgIDIy0iDbIyIiqilCQ0MhCEKZ6gqCgNDQ0MptEJERsXNENcratWsN1qEiIiIiInkxNXYDiIrj6emJJ0+ewMzMTK+fW7t2LerUqYPg4GCt9V27dsWTJ0+gVCoN2EoiIqLq7/3338fs2bON3QyiKoGdI6qSBEGAhYWFwbanUCgMuj0iIqKaIDs7G1ZWVjA15T8JiQDeVkcGtHbtWrRo0QLm5uZwc3PDhAkTkJ6eXqTemjVr0KhRI1haWuL555/H8ePH0b17d3Tv3l2qU9wzR8nJyRg1ahTc3d1hbm4OV1dX9OvXT3ruqUGDBoiNjcWxY8cgCAIEQZC2WdIzR6dOncIrr7wCBwcHWFlZoXXr1li5cqVhvxgiIqIqoPDZor/++gtDhw6Fg4MDOnfuXOwzR7m5uZg6dSrq1q0LGxsb9O3bF7dv3y52u5GRkejQoQMsLCzQuHFjfP755yU+x/TVV1/B19cXlpaWcHR0xODBg3Hr1q1KOV6i8uBlAjKI0NBQhIWFwd/fH+PHj0d8fDzWrVuHP//8EydOnJBuj1u3bh0mTpyILl26YOrUqbh+/Tr69+8PBwcHuLu769zHwIEDERsbi0mTJqFBgwZITU1FREQEbt68iQYNGmDFihWYNGkSrK2tMXfuXACAs7NziduLiIjAq6++CldXV0yePBkuLi6Ii4vD3r17MXnyZMN9OURERFXIG2+8gSZNmmDRokUQRRGpqalF6owZMwZfffUVhg4dihdffBFHjhxB7969i9Q7d+4cevXqBVdXV4SFhUGtVmPhwoWoW7dukbofffQR5s2bh0GDBmHMmDG4d+8eVq9eja5du+LcuXOwt7evjMMl0o9IVA6bN28WAYiJiYliamqqqFQqxYCAAFGtVkt1PvvsMxGAuGnTJlEURTE3N1esXbu2+Nxzz4kqlUqqt2XLFhGA2K1bN2ldYmKiCEDcvHmzKIqi+PDhQxGAuHTpUp3tatGihdZ2Ch09elQEIB49elQURVHMz88XGzZsKHp6eooPHz7UqqvRaMr+RRAREVUTCxYsEAGIQ4YMKXZ9oZiYGBGA+J///Eer3tChQ0UA4oIFC6R1ffr0EWvVqiXeuXNHWnflyhXR1NRUa5vXr18XTUxMxI8++khrmxcvXhRNTU2LrCcyFt5WRxX266+/Ii8vD1OmTIFC8fQ/qbFjx8LW1hb79u0DAJw+fRoPHjzA2LFjte5tHjZsGBwcHHTuw9LSEkqlEpGRkXj48GGF23zu3DkkJiZiypQpRa5UlXU6UyIiouro7bff1ln+yy+/AADeeecdrfVTpkzR+qxWq/Hrr7+if//+cHNzk9Z7eXkhKChIq+73338PjUaDQYMG4f79+9Li4uKCJk2a4OjRoxU4IiLD4W11VGE3btwAAHh7e2utVyqVaNSokVRe+P9eXl5a9UxNTdGgQQOd+zA3N8fHH3+Md999F87OznjhhRfw6quvYsSIEXBxcdG7zVevXgUAtGzZUu+fJSIiqs4aNmyos/zGjRtQKBRo3Lix1vp/53xqaiqePHlSJNeBoll/5coViKKIJk2aFLtPfWenJaos7BxRtTFlyhT06dMHe/bswcGDBzFv3jyEh4fjyJEjaNeunbGbR0REVC1YWlo+831qNBoIgoD9+/fDxMSkSLm1tfUzbxNRcXhbHVWYp6cnACA+Pl5rfV5eHhITE6Xywv9PSEjQqpefny/NOFeaxo0b491338WhQ4dw6dIl5OXl4dNPP5XKy3pLXOHVsEuXLpWpPhERkVx4enpCo9FId1kU+nfOOzk5wcLCokiuA0WzvnHjxhBFEQ0bNoS/v3+R5YUXXjD8gRCVAztHVGH+/v5QKpVYtWoVRFGU1n/55ZfIyMiQZrfp0KEDateujY0bNyI/P1+qt3379lKfI3r8+DFycnK01jVu3Bg2NjbIzc2V1llZWRU7ffi/tW/fHg0bNsSKFSuK1P/nMRAREclN4fNCq1at0lq/YsUKrc8mJibw9/fHnj17cPfuXWl9QkIC9u/fr1V3wIABMDExQVhYWJGcFUURDx48MOAREJUfb6ujCqtbty7mzJmDsLAw9OrVC3379kV8fDzWrl2L5557DsOHDwdQ8AxSaGgoJk2ahJdeegmDBg3C9evXsWXLFjRu3FjnqM/ff/+Nnj17YtCgQfDx8YGpqSl++OEHpKSkYPDgwVI9X19frFu3Dh9++CG8vLzg5OSEl156qcj2FAoF1q1bhz59+qBt27YYNWoUXF1dcfnyZcTGxuLgwYOG/6KIiIiqgbZt22LIkCFYu3YtMjIy8OKLL+Lw4cPFjhCFhobi0KFD6NSpE8aPHw+1Wo3PPvsMLVu2RExMjFSvcePG+PDDDzFnzhzpNR42NjZITEzEDz/8gHHjxmH69OnP8CiJisfOERlEaGgo6tati88++wxTp06Fo6Mjxo0bh0WLFmk9ZDlx4kSIoohPP/0U06dPR5s2bfDTTz/hnXfegYWFRYnb9/DwwJAhQ3D48GH897//hampKZo1a4bdu3dj4MCBUr358+fjxo0bWLJkCR49eoRu3boV2zkCgMDAQBw9ehRhYWH49NNPodFo0LhxY4wdO9ZwXwwREVE1tGnTJtStWxfbt2/Hnj178NJLL2Hfvn3w8PDQqufr64v9+/dj+vTpmDdvHjw8PLBw4ULExcXh8uXLWnVnz56Npk2bYvny5QgLCwNQkO8BAQHo27fvMzs2Il0EkfcQkZFpNBrUrVsXAwYMwMaNG43dHCIiIqqg/v37IzY2FleuXDF2U4j0wmeO6JnKyckpcq/xtm3bkJaWhu7duxunUURERFRuT5480fp85coV/PLLL8x1qpY4ckTPVGRkJKZOnYo33ngDtWvXxtmzZ/Hll1+iefPmOHPmDJRKpbGbSERERHpwdXVFcHCw9G7DdevWITc3F+fOnSvxvUZEVRWfOaJnqkGDBvDw8MCqVauQlpYGR0dHjBgxAosXL2bHiIiIqBrq1asXvv76ayQnJ8Pc3Bx+fn5YtGgRO0ZULXHkiIiIiIiICHzmiIiIiIiICAA7R0RERERERAD4zFGZaDQa3L17FzY2NjpfVEpUE4miiEePHsHNzQ0KhWGvp+Tk5CAvL6/UekqlUud7sIhIfpjNJGfM5srDzlEZ3L17t8hLz4jk5tatW3B3dzfY9nJyctDQ0xrJqepS67q4uCAxMbFGnoSJqHyYzUTM5srAzlEZ2NjYAABunG0AW2veiWgMrzVtZewmyFY+VPgNv0i/B4aSl5eH5FQ1Ek57wNam5N+rzEcaeHW4hby8vBp3Aiai8mM2Gx+z2XiYzZWHnaMyKByut7VW6PwPhSqPqWBm7CbI1//ms6ys21asbQRY25S8bQ14uwwRFcVsNj5msxExmysNO0dEZFQqUQ2VjjcKqETNM2wNERERyTmb2TkiIqPSQIQGJZ+AdZURERGR4ck5m9k5IiKj0kCEWqYnYCIioqpIztnMzhERGZVK1ECl4xxbk4fuiYiIqiI5ZzM7R0RkVJr/LbrKiYiI6NmRczazc0RERqUuZeheVxkREREZnpyzmZ0jIjIqlYhShu6fXVuIiIhI3tnMzhE
"text/plain": [
"<Figure size 1200x1000 with 16 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"import matplotlib.pyplot as plt\n",
"\n",
"_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n",
"for index, key in enumerate(class_models.keys()):\n",
" c_matrix = class_models[key][\"Confusion_matrix\"]\n",
" disp = ConfusionMatrixDisplay(\n",
" confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n",
" ).plot(ax=ax.flat[index])\n",
" disp.ax_.set_title(key)\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Н а данных графиках, левый нижний квадрат обозначает, кол-во правильно классифицированных значениях, относимых к классу \"Less\", чем больше число в этом квадрате, тем лучше модель может классифицировать этот класс. Нижний левый квадрат отвечает за кол-во правильно классифицированных значениях \"More\". Здесь так же как и в левом верхнем, чем выше значение, тем лучше.\n",
"\n",
"### Точность, полнота, верность (аккуратность), F-мера"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_6e86b_row0_col0, #T_6e86b_row0_col1, #T_6e86b_row0_col2, #T_6e86b_row0_col3, #T_6e86b_row1_col0, #T_6e86b_row1_col1, #T_6e86b_row1_col2, #T_6e86b_row1_col3, #T_6e86b_row2_col0, #T_6e86b_row2_col1, #T_6e86b_row2_col2, #T_6e86b_row2_col3, #T_6e86b_row3_col0, #T_6e86b_row3_col1, #T_6e86b_row3_col2, #T_6e86b_row3_col3, #T_6e86b_row4_col0, #T_6e86b_row4_col1, #T_6e86b_row4_col2, #T_6e86b_row4_col3, #T_6e86b_row5_col0, #T_6e86b_row5_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_6e86b_row0_col4, #T_6e86b_row0_col5, #T_6e86b_row0_col6, #T_6e86b_row0_col7, #T_6e86b_row1_col4, #T_6e86b_row1_col5, #T_6e86b_row1_col6, #T_6e86b_row1_col7, #T_6e86b_row2_col4, #T_6e86b_row2_col5, #T_6e86b_row2_col6, #T_6e86b_row2_col7, #T_6e86b_row3_col4, #T_6e86b_row3_col5, #T_6e86b_row3_col6, #T_6e86b_row3_col7, #T_6e86b_row4_col4, #T_6e86b_row4_col5, #T_6e86b_row4_col6, #T_6e86b_row4_col7 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row5_col2 {\n",
" background-color: #6ccd5a;\n",
" color: #000000;\n",
"}\n",
"#T_6e86b_row5_col3 {\n",
" background-color: #6ece58;\n",
" color: #000000;\n",
"}\n",
"#T_6e86b_row5_col4 {\n",
" background-color: #c43e7f;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row5_col5 {\n",
" background-color: #c5407e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row5_col6, #T_6e86b_row5_col7 {\n",
" background-color: #ce4b75;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row6_col0 {\n",
" background-color: #40bd72;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row6_col1 {\n",
" background-color: #38b977;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row6_col2 {\n",
" background-color: #7fd34e;\n",
" color: #000000;\n",
"}\n",
"#T_6e86b_row6_col3 {\n",
" background-color: #75d054;\n",
" color: #000000;\n",
"}\n",
"#T_6e86b_row6_col4 {\n",
" background-color: #be3885;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row6_col5 {\n",
" background-color: #b42e8d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row6_col6 {\n",
" background-color: #cc4977;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row6_col7 {\n",
" background-color: #c8437b;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row7_col0, #T_6e86b_row7_col1, #T_6e86b_row7_col2, #T_6e86b_row7_col3 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_6e86b_row7_col4, #T_6e86b_row7_col5, #T_6e86b_row7_col6, #T_6e86b_row7_col7 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_6e86b\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" > </th>\n",
" <th id=\"T_6e86b_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
" <th id=\"T_6e86b_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
" <th id=\"T_6e86b_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
" <th id=\"T_6e86b_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
" <th id=\"T_6e86b_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
" <th id=\"T_6e86b_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
" <th id=\"T_6e86b_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
" <th id=\"T_6e86b_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
" <td id=\"T_6e86b_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
" <td id=\"T_6e86b_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
" <td id=\"T_6e86b_row0_col2\" class=\"data row0 col2\" >0.999767</td>\n",
" <td id=\"T_6e86b_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
" <td id=\"T_6e86b_row0_col4\" class=\"data row0 col4\" >0.999884</td>\n",
" <td id=\"T_6e86b_row0_col5\" class=\"data row0 col5\" >1.000000</td>\n",
" <td id=\"T_6e86b_row0_col6\" class=\"data row0 col6\" >0.999884</td>\n",
" <td id=\"T_6e86b_row0_col7\" class=\"data row0 col7\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row1\" class=\"row_heading level0 row1\" >ridge</th>\n",
" <td id=\"T_6e86b_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
" <td id=\"T_6e86b_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
" <td id=\"T_6e86b_row1_col2\" class=\"data row1 col2\" >0.999651</td>\n",
" <td id=\"T_6e86b_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
" <td id=\"T_6e86b_row1_col4\" class=\"data row1 col4\" >0.999826</td>\n",
" <td id=\"T_6e86b_row1_col5\" class=\"data row1 col5\" >1.000000</td>\n",
" <td id=\"T_6e86b_row1_col6\" class=\"data row1 col6\" >0.999826</td>\n",
" <td id=\"T_6e86b_row1_col7\" class=\"data row1 col7\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row2\" class=\"row_heading level0 row2\" >decision_tree</th>\n",
" <td id=\"T_6e86b_row2_col0\" class=\"data row2 col0\" >1.000000</td>\n",
" <td id=\"T_6e86b_row2_col1\" class=\"data row2 col1\" >1.000000</td>\n",
" <td id=\"T_6e86b_row2_col2\" class=\"data row2 col2\" >1.000000</td>\n",
" <td id=\"T_6e86b_row2_col3\" class=\"data row2 col3\" >1.000000</td>\n",
" <td id=\"T_6e86b_row2_col4\" class=\"data row2 col4\" >1.000000</td>\n",
" <td id=\"T_6e86b_row2_col5\" class=\"data row2 col5\" >1.000000</td>\n",
" <td id=\"T_6e86b_row2_col6\" class=\"data row2 col6\" >1.000000</td>\n",
" <td id=\"T_6e86b_row2_col7\" class=\"data row2 col7\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row3\" class=\"row_heading level0 row3\" >gradient_boosting</th>\n",
" <td id=\"T_6e86b_row3_col0\" class=\"data row3 col0\" >1.000000</td>\n",
" <td id=\"T_6e86b_row3_col1\" class=\"data row3 col1\" >1.000000</td>\n",
" <td id=\"T_6e86b_row3_col2\" class=\"data row3 col2\" >1.000000</td>\n",
" <td id=\"T_6e86b_row3_col3\" class=\"data row3 col3\" >1.000000</td>\n",
" <td id=\"T_6e86b_row3_col4\" class=\"data row3 col4\" >1.000000</td>\n",
" <td id=\"T_6e86b_row3_col5\" class=\"data row3 col5\" >1.000000</td>\n",
" <td id=\"T_6e86b_row3_col6\" class=\"data row3 col6\" >1.000000</td>\n",
" <td id=\"T_6e86b_row3_col7\" class=\"data row3 col7\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row4\" class=\"row_heading level0 row4\" >random_forest</th>\n",
" <td id=\"T_6e86b_row4_col0\" class=\"data row4 col0\" >1.000000</td>\n",
" <td id=\"T_6e86b_row4_col1\" class=\"data row4 col1\" >1.000000</td>\n",
" <td id=\"T_6e86b_row4_col2\" class=\"data row4 col2\" >1.000000</td>\n",
" <td id=\"T_6e86b_row4_col3\" class=\"data row4 col3\" >1.000000</td>\n",
" <td id=\"T_6e86b_row4_col4\" class=\"data row4 col4\" >1.000000</td>\n",
" <td id=\"T_6e86b_row4_col5\" class=\"data row4 col5\" >1.000000</td>\n",
" <td id=\"T_6e86b_row4_col6\" class=\"data row4 col6\" >1.000000</td>\n",
" <td id=\"T_6e86b_row4_col7\" class=\"data row4 col7\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row5\" class=\"row_heading level0 row5\" >naive_bayes</th>\n",
" <td id=\"T_6e86b_row5_col0\" class=\"data row5 col0\" >1.000000</td>\n",
" <td id=\"T_6e86b_row5_col1\" class=\"data row5 col1\" >1.000000</td>\n",
" <td id=\"T_6e86b_row5_col2\" class=\"data row5 col2\" >0.786719</td>\n",
" <td id=\"T_6e86b_row5_col3\" class=\"data row5 col3\" >0.793953</td>\n",
" <td id=\"T_6e86b_row5_col4\" class=\"data row5 col4\" >0.893927</td>\n",
" <td id=\"T_6e86b_row5_col5\" class=\"data row5 col5\" >0.897525</td>\n",
" <td id=\"T_6e86b_row5_col6\" class=\"data row5 col6\" >0.880630</td>\n",
" <td id=\"T_6e86b_row5_col7\" class=\"data row5 col7\" >0.885144</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row6\" class=\"row_heading level0 row6\" >knn</th>\n",
" <td id=\"T_6e86b_row6_col0\" class=\"data row6 col0\" >0.872486</td>\n",
" <td id=\"T_6e86b_row6_col1\" class=\"data row6 col1\" >0.827473</td>\n",
" <td id=\"T_6e86b_row6_col2\" class=\"data row6 col2\" >0.857774</td>\n",
" <td id=\"T_6e86b_row6_col3\" class=\"data row6 col3\" >0.820930</td>\n",
" <td id=\"T_6e86b_row6_col4\" class=\"data row6 col4\" >0.866917</td>\n",
" <td id=\"T_6e86b_row6_col5\" class=\"data row6 col5\" >0.825815</td>\n",
" <td id=\"T_6e86b_row6_col6\" class=\"data row6 col6\" >0.865068</td>\n",
" <td id=\"T_6e86b_row6_col7\" class=\"data row6 col7\" >0.824189</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_6e86b_level0_row7\" class=\"row_heading level0 row7\" >mlp</th>\n",
" <td id=\"T_6e86b_row7_col0\" class=\"data row7 col0\" >0.687500</td>\n",
" <td id=\"T_6e86b_row7_col1\" class=\"data row7 col1\" >0.615385</td>\n",
" <td id=\"T_6e86b_row7_col2\" class=\"data row7 col2\" >0.002558</td>\n",
" <td id=\"T_6e86b_row7_col3\" class=\"data row7 col3\" >0.003721</td>\n",
" <td id=\"T_6e86b_row7_col4\" class=\"data row7 col4\" >0.503355</td>\n",
" <td id=\"T_6e86b_row7_col5\" class=\"data row7 col5\" >0.503354</td>\n",
" <td id=\"T_6e86b_row7_col6\" class=\"data row7 col6\" >0.005098</td>\n",
" <td id=\"T_6e86b_row7_col7\" class=\"data row7 col7\" >0.007397</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x1ab940a6300>"
]
},
"execution_count": 165,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
" [\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" \"Accuracy_train\",\n",
" \"Accuracy_test\",\n",
" \"F1_train\",\n",
" \"F1_test\",\n",
" ]\n",
"]\n",
"class_metrics.sort_values(\n",
" by=\"Accuracy_test\", ascending=False\n",
").style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"100% точности у модели может свидетельствовать о е е переобучении, то есть модели обучилась классифицировать значения только для обучающей выборки, но на тестовой выборке результаты будут плохими.\n",
"\n",
"### ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_abe99_row0_col0, #T_abe99_row0_col1, #T_abe99_row1_col0, #T_abe99_row1_col1, #T_abe99_row2_col0, #T_abe99_row2_col1, #T_abe99_row3_col0, #T_abe99_row3_col1, #T_abe99_row4_col0, #T_abe99_row4_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_abe99_row0_col2, #T_abe99_row0_col3, #T_abe99_row0_col4, #T_abe99_row1_col2, #T_abe99_row1_col3, #T_abe99_row1_col4, #T_abe99_row2_col2, #T_abe99_row2_col3, #T_abe99_row2_col4, #T_abe99_row3_col2, #T_abe99_row3_col3, #T_abe99_row3_col4, #T_abe99_row4_col2, #T_abe99_row4_col3, #T_abe99_row4_col4, #T_abe99_row5_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_abe99_row5_col0 {\n",
" background-color: #6ece58;\n",
" color: #000000;\n",
"}\n",
"#T_abe99_row5_col1 {\n",
" background-color: #86d549;\n",
" color: #000000;\n",
"}\n",
"#T_abe99_row5_col3 {\n",
" background-color: #c5407e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_abe99_row5_col4 {\n",
" background-color: #c7427c;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_abe99_row6_col0 {\n",
" background-color: #4cc26c;\n",
" color: #000000;\n",
"}\n",
"#T_abe99_row6_col1 {\n",
" background-color: #75d054;\n",
" color: #000000;\n",
"}\n",
"#T_abe99_row6_col2 {\n",
" background-color: #c8437b;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_abe99_row6_col3, #T_abe99_row6_col4 {\n",
" background-color: #b42e8d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_abe99_row7_col0, #T_abe99_row7_col1 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_abe99_row7_col2, #T_abe99_row7_col3, #T_abe99_row7_col4 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_abe99\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" > </th>\n",
" <th id=\"T_abe99_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
" <th id=\"T_abe99_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
" <th id=\"T_abe99_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
" <th id=\"T_abe99_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
" <th id=\"T_abe99_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
" <td id=\"T_abe99_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
" <td id=\"T_abe99_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
" <td id=\"T_abe99_row0_col2\" class=\"data row0 col2\" >1.000000</td>\n",
" <td id=\"T_abe99_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
" <td id=\"T_abe99_row0_col4\" class=\"data row0 col4\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row1\" class=\"row_heading level0 row1\" >ridge</th>\n",
" <td id=\"T_abe99_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
" <td id=\"T_abe99_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
" <td id=\"T_abe99_row1_col2\" class=\"data row1 col2\" >1.000000</td>\n",
" <td id=\"T_abe99_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
" <td id=\"T_abe99_row1_col4\" class=\"data row1 col4\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row2\" class=\"row_heading level0 row2\" >decision_tree</th>\n",
" <td id=\"T_abe99_row2_col0\" class=\"data row2 col0\" >1.000000</td>\n",
" <td id=\"T_abe99_row2_col1\" class=\"data row2 col1\" >1.000000</td>\n",
" <td id=\"T_abe99_row2_col2\" class=\"data row2 col2\" >1.000000</td>\n",
" <td id=\"T_abe99_row2_col3\" class=\"data row2 col3\" >1.000000</td>\n",
" <td id=\"T_abe99_row2_col4\" class=\"data row2 col4\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row3\" class=\"row_heading level0 row3\" >gradient_boosting</th>\n",
" <td id=\"T_abe99_row3_col0\" class=\"data row3 col0\" >1.000000</td>\n",
" <td id=\"T_abe99_row3_col1\" class=\"data row3 col1\" >1.000000</td>\n",
" <td id=\"T_abe99_row3_col2\" class=\"data row3 col2\" >1.000000</td>\n",
" <td id=\"T_abe99_row3_col3\" class=\"data row3 col3\" >1.000000</td>\n",
" <td id=\"T_abe99_row3_col4\" class=\"data row3 col4\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row4\" class=\"row_heading level0 row4\" >random_forest</th>\n",
" <td id=\"T_abe99_row4_col0\" class=\"data row4 col0\" >1.000000</td>\n",
" <td id=\"T_abe99_row4_col1\" class=\"data row4 col1\" >1.000000</td>\n",
" <td id=\"T_abe99_row4_col2\" class=\"data row4 col2\" >1.000000</td>\n",
" <td id=\"T_abe99_row4_col3\" class=\"data row4 col3\" >1.000000</td>\n",
" <td id=\"T_abe99_row4_col4\" class=\"data row4 col4\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row5\" class=\"row_heading level0 row5\" >naive_bayes</th>\n",
" <td id=\"T_abe99_row5_col0\" class=\"data row5 col0\" >0.897525</td>\n",
" <td id=\"T_abe99_row5_col1\" class=\"data row5 col1\" >0.885144</td>\n",
" <td id=\"T_abe99_row5_col2\" class=\"data row5 col2\" >0.999566</td>\n",
" <td id=\"T_abe99_row5_col3\" class=\"data row5 col3\" >0.794820</td>\n",
" <td id=\"T_abe99_row5_col4\" class=\"data row5 col4\" >0.812098</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row6\" class=\"row_heading level0 row6\" >knn</th>\n",
" <td id=\"T_abe99_row6_col0\" class=\"data row6 col0\" >0.825815</td>\n",
" <td id=\"T_abe99_row6_col1\" class=\"data row6 col1\" >0.824189</td>\n",
" <td id=\"T_abe99_row6_col2\" class=\"data row6 col2\" >0.910823</td>\n",
" <td id=\"T_abe99_row6_col3\" class=\"data row6 col3\" >0.651606</td>\n",
" <td id=\"T_abe99_row6_col4\" class=\"data row6 col4\" >0.651627</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_abe99_level0_row7\" class=\"row_heading level0 row7\" >mlp</th>\n",
" <td id=\"T_abe99_row7_col0\" class=\"data row7 col0\" >0.503354</td>\n",
" <td id=\"T_abe99_row7_col1\" class=\"data row7 col1\" >0.007397</td>\n",
" <td id=\"T_abe99_row7_col2\" class=\"data row7 col2\" >0.497071</td>\n",
" <td id=\"T_abe99_row7_col3\" class=\"data row7 col3\" >0.001427</td>\n",
" <td id=\"T_abe99_row7_col4\" class=\"data row7 col4\" >0.012966</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x1ab940a5df0>"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
" [\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" \"ROC_AUC_test\",\n",
" \"Cohen_kappa_test\",\n",
" \"MCC_test\",\n",
" ]\n",
"]\n",
"class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\n",
" \"ROC_AUC_test\",\n",
" \"MCC_test\",\n",
" \"Cohen_kappa_test\",\n",
" ],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Вывод лучшей модели"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'logistic'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
"\n",
"display(best_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Вывод данных с ошибкой предсказания для оценки"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Error items count: 0'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>Predicted</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" <th>median_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [id, Predicted, date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, sqft_above, sqft_basement, yr_built, yr_renovated, zipcode, lat, long, sqft_living15, sqft_lot15, price_category, median_price]\n",
"Index: []\n",
"\n",
"[0 rows x 24 columns]"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessing_result = pipeline_end.transform(X_test)\n",
"preprocessed_df = pd.DataFrame(\n",
" preprocessing_result,\n",
" columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"y_pred = class_models[best_model][\"preds\"]\n",
"\n",
"error_index = y_test[y_test[\"median_price\"] != y_pred].index.tolist()\n",
"display(f\"Error items count: {len(error_index)}\")\n",
"\n",
"error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
"error_df = X_test.loc[error_index].copy()\n",
"error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
"error_df.sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Пример использования обученной модели (конвейера) для предсказания"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" <th>median_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11592</th>\n",
" <td>2028701000</td>\n",
" <td>20140529T000000</td>\n",
" <td>635200.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>1640</td>\n",
" <td>4240</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>720</td>\n",
" <td>1921</td>\n",
" <td>0</td>\n",
" <td>98117</td>\n",
" <td>47.6766</td>\n",
" <td>-122.368</td>\n",
" <td>1300</td>\n",
" <td>4240</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"11592 2028701000 20140529T000000 635200.0 4 1.75 1640 \n",
"\n",
" sqft_lot floors waterfront view ... sqft_basement yr_built \\\n",
"11592 4240 1.0 0 0 ... 720 1921 \n",
"\n",
" yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n",
"11592 0 98117 47.6766 -122.368 1300 4240 \n",
"\n",
" price_category median_price \n",
"11592 1 1 \n",
"\n",
"[1 rows x 23 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>median_price</th>\n",
" <th>id</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" <th>Living_area_to_Lot_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11592</th>\n",
" <td>-0.477812</td>\n",
" <td>-0.269226</td>\n",
" <td>1.005335</td>\n",
" <td>2.028701e+09</td>\n",
" <td>635200.0</td>\n",
" <td>4.0</td>\n",
" <td>1.75</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>720.0</td>\n",
" <td>1921.0</td>\n",
" <td>0.0</td>\n",
" <td>98117.0</td>\n",
" <td>47.6766</td>\n",
" <td>-122.368</td>\n",
" <td>1300.0</td>\n",
" <td>4240.0</td>\n",
" <td>1.0</td>\n",
" <td>1.774763</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" sqft_living sqft_lot median_price id price bedrooms \\\n",
"11592 -0.477812 -0.269226 1.005335 2.028701e+09 635200.0 4.0 \n",
"\n",
" bathrooms floors waterfront view ... sqft_basement yr_built \\\n",
"11592 1.75 1.0 0.0 0.0 ... 720.0 1921.0 \n",
"\n",
" yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n",
"11592 0.0 98117.0 47.6766 -122.368 1300.0 4240.0 \n",
"\n",
" price_category Living_area_to_Lot_ratio \n",
"11592 1.0 1.774763 \n",
"\n",
"[1 rows x 23 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'predicted: 1 (proba: [0. 1.])'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'real: 1'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = class_models[best_model][\"pipeline\"]\n",
"\n",
"example_id = 11592\n",
"test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
"test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
"display(test)\n",
"display(test_preprocessed)\n",
"result_proba = model.predict_proba(test)[0]\n",
"result = model.predict(test)[0]\n",
"real = int(y_test.loc[example_id].values[0])\n",
"display(f\"predicted: {result} (proba: {result_proba})\")\n",
"display(f\"real: {real}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Подбор гиперпараметров методом поиска по сетке"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'model__criterion': 'gini',\n",
" 'model__max_depth': 5,\n",
" 'model__max_features': 'sqrt',\n",
" 'model__n_estimators': 10}"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"optimized_model_type = \"random_forest\"\n",
"\n",
"random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
"\n",
"param_grid = {\n",
" \"model__n_estimators\": [10, 50, 100],\n",
" \"model__max_features\": [\"sqrt\", \"log2\"],\n",
" \"model__max_depth\": [5, 7, 10],\n",
" \"model__criterion\": [\"gini\", \"entropy\"],\n",
"}\n",
"\n",
"gs_optomizer = GridSearchCV(\n",
" estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
")\n",
"gs_optomizer.fit(X_train, y_train.values.ravel())\n",
"gs_optomizer.best_params_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Обучение модели с новыми гиперпараметрами"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [],
"source": [
"optimized_model = ensemble.RandomForestClassifier(\n",
" random_state=random_state,\n",
" criterion=\"gini\",\n",
" max_depth=5,\n",
" max_features=\"sqrt\",\n",
" n_estimators=10,\n",
")\n",
"\n",
"result = {}\n",
"\n",
"result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n",
"result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
"result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
"result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
"\n",
"result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
"result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
"result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
"result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
"result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
"result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
"result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
"result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
"result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
"result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
"result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
"result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Формирование данных для оценки старой и новой версии модели"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
"optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
" data=class_models[optimized_model_type]\n",
")\n",
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
" data=result\n",
")\n",
"optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
"optimized_metrics = optimized_metrics.set_index(\"Name\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Оценка параметров старой и новой модели"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_69705_row0_col0, #T_69705_row0_col1, #T_69705_row0_col2, #T_69705_row0_col3, #T_69705_row1_col0, #T_69705_row1_col1, #T_69705_row1_col2, #T_69705_row1_col3 {\n",
" background-color: #440154;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_69705_row0_col4, #T_69705_row0_col5, #T_69705_row0_col6, #T_69705_row0_col7, #T_69705_row1_col4, #T_69705_row1_col5, #T_69705_row1_col6, #T_69705_row1_col7 {\n",
" background-color: #0d0887;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_69705\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" > </th>\n",
" <th id=\"T_69705_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
" <th id=\"T_69705_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
" <th id=\"T_69705_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
" <th id=\"T_69705_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
" <th id=\"T_69705_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
" <th id=\"T_69705_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
" <th id=\"T_69705_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
" <th id=\"T_69705_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
" </tr>\n",
" <tr>\n",
" <th class=\"index_name level0\" >Name</th>\n",
" <th class=\"blank col0\" > </th>\n",
" <th class=\"blank col1\" > </th>\n",
" <th class=\"blank col2\" > </th>\n",
" <th class=\"blank col3\" > </th>\n",
" <th class=\"blank col4\" > </th>\n",
" <th class=\"blank col5\" > </th>\n",
" <th class=\"blank col6\" > </th>\n",
" <th class=\"blank col7\" > </th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_69705_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
" <td id=\"T_69705_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
" <td id=\"T_69705_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
" <td id=\"T_69705_row0_col2\" class=\"data row0 col2\" >1.000000</td>\n",
" <td id=\"T_69705_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
" <td id=\"T_69705_row0_col4\" class=\"data row0 col4\" >1.000000</td>\n",
" <td id=\"T_69705_row0_col5\" class=\"data row0 col5\" >1.000000</td>\n",
" <td id=\"T_69705_row0_col6\" class=\"data row0 col6\" >1.000000</td>\n",
" <td id=\"T_69705_row0_col7\" class=\"data row0 col7\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_69705_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
" <td id=\"T_69705_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
" <td id=\"T_69705_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
" <td id=\"T_69705_row1_col2\" class=\"data row1 col2\" >1.000000</td>\n",
" <td id=\"T_69705_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
" <td id=\"T_69705_row1_col4\" class=\"data row1 col4\" >1.000000</td>\n",
" <td id=\"T_69705_row1_col5\" class=\"data row1 col5\" >1.000000</td>\n",
" <td id=\"T_69705_row1_col6\" class=\"data row1 col6\" >1.000000</td>\n",
" <td id=\"T_69705_row1_col7\" class=\"data row1 col7\" >1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x1ab910a1f70>"
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"optimized_metrics[\n",
" [\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" \"Accuracy_train\",\n",
" \"Accuracy_test\",\n",
" \"F1_train\",\n",
" \"F1_test\",\n",
" ]\n",
"].style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Значения 1 в кажой ячейке обосзначают, что модели очень точно классифицируют положительные образцы, не пропуская их."
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_23990_row0_col0, #T_23990_row0_col1, #T_23990_row1_col0, #T_23990_row1_col1 {\n",
" background-color: #440154;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_23990_row0_col2, #T_23990_row0_col3, #T_23990_row0_col4, #T_23990_row1_col2, #T_23990_row1_col3, #T_23990_row1_col4 {\n",
" background-color: #0d0887;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_23990\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" > </th>\n",
" <th id=\"T_23990_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
" <th id=\"T_23990_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
" <th id=\"T_23990_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
" <th id=\"T_23990_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
" <th id=\"T_23990_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
" </tr>\n",
" <tr>\n",
" <th class=\"index_name level0\" >Name</th>\n",
" <th class=\"blank col0\" > </th>\n",
" <th class=\"blank col1\" > </th>\n",
" <th class=\"blank col2\" > </th>\n",
" <th class=\"blank col3\" > </th>\n",
" <th class=\"blank col4\" > </th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_23990_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
" <td id=\"T_23990_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
" <td id=\"T_23990_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
" <td id=\"T_23990_row0_col2\" class=\"data row0 col2\" >1.000000</td>\n",
" <td id=\"T_23990_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
" <td id=\"T_23990_row0_col4\" class=\"data row0 col4\" >1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_23990_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
" <td id=\"T_23990_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
" <td id=\"T_23990_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
" <td id=\"T_23990_row1_col2\" class=\"data row1 col2\" >1.000000</td>\n",
" <td id=\"T_23990_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
" <td id=\"T_23990_row1_col4\" class=\"data row1 col4\" >1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x1ab93b1dfa0>"
]
},
"execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"optimized_metrics[\n",
" [\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" \"ROC_AUC_test\",\n",
" \"Cohen_kappa_test\",\n",
" \"MCC_test\",\n",
" ]\n",
"].style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\n",
" \"ROC_AUC_test\",\n",
" \"MCC_test\",\n",
" \"Cohen_kappa_test\",\n",
" ],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Значения 1 в кажой ячейке обосзначают, что модели точно классифицировали все тестовые примеры, не допустив никаких ошибок в предсказаниях."
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2kAAAGsCAYAAABHMu+IAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABTp0lEQVR4nO3deXwU9f3H8fcmkAPIJgRIQiSES4FouCumKIdiAlKUgvUnN4KgNmgFOcQqBmkJxWoVpeDFYYWC9aCKigIaEIhW0IAiRC4BhQQUSAhIrt3fH5jVJRyzZJKd3byej8c8fuzM7Dff4Ufz9jvfz3zH5nQ6nQIAAAAAWEKAtzsAAAAAAPgFgzQAAAAAsBAGaQAAAABgIQzSAAAAAMBCGKQBAAAAgIUwSAMAAAAAC2GQBgAAAAAWwiANAAAAACykhrc7AAA4t9OnT6uoqMi09oKCghQSEmJaewAAeIJcM45BGgBY0OnTp9U0vo5yDpea1mZMTIz27t3rt4EGALAucs0zDNIAwIKKioqUc7hUezfHyx5W8cr0/BMONe24T0VFRX4ZZgAAayPXPMMgDQAszB4WYEqYAQBgBeSaMQzSAMDCSp0OlTrNaQcAAG8j14xhkAYAFuaQUw5VPM3MaAMAgIoi14xhrhEAAAAALISZNACwMIccMqOgw5xWAACoGHLNGAZpAGBhpU6nSp0VL+kwow0AACqKXDOGckcAAAAAsBBm0gDAwnjAGgDgT8g1YxikAYCFOeRUKWEGAPAT5JoxlDsCAAAAgIUwkwYAFkZZCADAn5BrxjCTBgAAAAAWwkwaAFgYSxUDAPwJuWYMgzQAsDDHz5sZ7QAA4G3kmjGUOwIAAACAhTCTBgAWVmrSUsVmtAEAQEWRa8YwSAMACyt1ntnMaAcAAG8j14yh3BEAAAAALISZNACwMB6wBgD4E3LNGAZpAGBhDtlUKpsp7QAA4G3kmjGUOwIAAACAhTCTBgAW5nCe2cxoBwAAbyPXjGEmDQAAAAAshJk0ALCwUpNq981oAwCAiiLXjGGQBgAWRpgBAPwJuWYM5Y4AgHLS09P1m9/8RmFhYYqKilK/fv2UnZ3tds7p06eVmpqqevXqqU6dOhowYIByc3Pdztm/f7/69OmjWrVqKSoqShMnTlRJSYnbORkZGerQoYOCg4PVokULLVy4sLIvDwBQzfharjFIAwALczhtpm2eWLt2rVJTU/XJJ59o1apVKi4uVnJysk6ePOk6Z9y4cXr77bf1n//8R2vXrtXBgwfVv39/1/HS0lL16dNHRUVF2rhxoxYtWqSFCxdq6tSprnP27t2rPn36qEePHsrKytL999+vO++8U++//37F//IAAJZDrhljczqdfr42CgD4nvz8fIWHh2vtV5epTljF76cVnHCo21XfKy8vT3a73ePvHzlyRFFRUVq7dq26du2qvLw8NWjQQEuWLNGtt94qSdqxY4dat26tzMxMXXPNNXrvvff0u9/9TgcPHlR0dLQkad68eZo8ebKOHDmioKAgTZ48We+8846++uor18+6/fbbdfz4ca1cubLC1w0AsAZyzbNcYyYNAKqR/Px8t62wsNDQ9/Ly8iRJkZGRkqTNmzeruLhYPXv2dJ3TqlUrNW7cWJmZmZKkzMxMJSYmuoJMklJSUpSfn69t27a5zvl1G2XnlLUBAMCF+GuuMUgDAAsrVYBpmyTFxcUpPDzctaWnp1+0Dw6HQ/fff7+6dOmiq666SpKUk5OjoKAgRUREuJ0bHR2tnJwc1zm/DrKy42XHLnROfn6+fvrpJ8//wgAAlkauGcPqjgBgYc5LqLs/XzuSdODAAbeykODg4It+NzU1VV999ZXWr19f4X4AAKo3cs0YZtIAoBqx2+1u28XCbOzYsVqxYoU++ugjNWrUyLU/JiZGRUVFOn78uNv5ubm5iomJcZ1z9qpYZZ8vdo7dbldoaOglXSMAoPrw11xjkAYAFlb2PhkzNk84nU6NHTtWb775pj788EM1bdrU7XjHjh1Vs2ZNrVmzxrUvOztb+/fvV1JSkiQpKSlJX375pQ4fPuw6Z9WqVbLb7UpISHCd8+s2ys4pawMA4F/INWModwQACyt1BqjUWfH7aaUeruObmpqqJUuW6L///a/CwsJctfbh4eEKDQ1VeHi4Ro0apfHjxysyMlJ2u1333nuvkpKSdM0110iSkpOTlZCQoKFDh2rWrFnKycnRww8/rNTUVNedzrvvvlvPPvusJk2apJEjR+rDDz/Uq6++qnfeeafC1wwAsB5yzRiW4AcACypbqvi9rU1V24Slik+ecKh3m72Glyq22c59h3LBggUaMWKEpDMv/XzggQf073//W4WFhUpJSdE///lPV8mHJO3bt0/33HOPMjIyVLt2bQ0fPlwzZ85UjRq/3CPMyMjQuHHj9PXXX6tRo0Z65JFHXD8DAOAfyLURHl0fgzQAsKCyMHtnazPVDguscHsnT5SqT5s9l/w+GQAAKoJc8wzPpAEAAACAhfBMGgBY2KU8HH2+dgAA8DZyzRgGaQBgYeY9YE1lOwDA+8g1Yyh3BAAAAAALYSYNACzMIZscJpR0mNEGAAAVRa4ZwyANACzMoQCVmlD04JB/l4UAAHwDuWYM5Y4AAAAAYCHMpAGAhfGANQDAn5BrxjBIAwALcyhADspCAAB+glwzhnJHAAAAALAQZtIAwMJKnTaVOk146acJbQAAUFHkmjHMpAEAAACAhTCTBgAWVmrSUsWlfl67DwDwDeSaMQzSAMDCHM4AOUxYBcvh56tgAQB8A7lmDOWOAAAAAGAhzKQBgIVRFgIA8CfkmjEM0gDAwhwyZwUrR8W7AgBAhZFrxlDuCAAAAAAWwkwaAFiYQwFymHA/zYw2AACoKHLNGAZpAGBhpc4AlZqwCpYZbQAAUFHkmjH+fXUAAAAA4GOYSQMAC3PIJofMeMC64m0AAFBR5JoxDNIAwMIoCwEA+BNyzRj/vjoAAAAA8DHMpAGAhZn30k/uyQEAvI9cM8a/rw4AAAAAfAwzaQY4HA4dPHhQYWFhstn8+yFFABXndDp14sQJxcbGKiCgYvfCHE6bHE4THrA2oQ34D3INgCfItarHIM2AgwcPKi4uztvdAOBjDhw4oEaNGlWoDYdJZSH+/tJPeIZcA3ApyLWqwyDNgLCwMEnSvs+byF7Hv/9BwHO/vyLR212AxZSoWOv1rut3B2A15BouhFzD2ci1qscgzYCyUhB7nQDZwwgzuKthq+ntLsBqnGf+jxllZA5ngBwmLDNsRhvwH+QaLoRcQznkWpVjkAYAFlYqm0pNeGGnGW0AAFBR5Jox/j0EBQAAAAAfw0waAFgYZSEAAH9CrhnDIA0ALKxU5pR0lFa8KwAAVBi5Zox/D0EBAAAAwMcwSAMACysrCzFj88S6devUt29fxcbGymazafny5W7HbTbbObfHH3/cdU6TJk3KHZ85c6ZbO1u3btV1112nkJAQxcXFadasWZf8dwUAsD5v5ZrkW9nGIA0AUM7JkyfVtm1bzZkz55zHDx065LbNnz9fNptNAwYMcDvvscceczvv3nvvdR3Lz89XcnKy4uPjtXnzZj3++ONKS0vT888/X6nXBgConnwp23gmDQAsrNQZoFITHo72tI3evXurd+/e5z0eExPj9vm///2vevTooWbNmrntDwsLK3dumcWLF6uoqEjz589XUFCQrrzySmVlZenJJ5/UmDFjPOovAMA3eCvXJN/KNmbSAMDCnLLJYcLm/Pkh7fz8fLetsLCwwn3Mzc3VO++8o1GjRpU7NnPmTNWrV0/t27fX448/rpKSEtexzMxMde3aVUFBQa59KSkpys7O1rFjxyrcLwCA9fhCrknezzYGaQBQjcTFxSk8PNy1paenV7jNRYsWKSwsTP3793fbf99992np0qX66KOPdNddd2nGjBmaNGmS63hOTo6io6PdvlP2OScnp8L9AgD4v8rINcn72Ua5IwBYmNllIQcOHJDdbnftDw4OrnDb8+fP1+DBgxUSEuK2f/z48a4/t2nTRkFBQbrrrruUnp5uys8FAPgeX8g1yfvZxiANACzM4bT
"text/plain": [
"<Figure size 1000x400 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
")\n",
"\n",
"for index in range(0, len(optimized_metrics)):\n",
" c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
" disp = ConfusionMatrixDisplay(\n",
" confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n",
" ).plot(ax=ax.flat[index])\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Задача регресии: предсказание цены дома (price)."
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Среднее значение поля: 2079.8997362698374\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" <th>median_price</th>\n",
" <th>average_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7129300520</td>\n",
" <td>20141013T000000</td>\n",
" <td>221900.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1180</td>\n",
" <td>5650</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1955</td>\n",
" <td>0</td>\n",
" <td>98178</td>\n",
" <td>47.5112</td>\n",
" <td>-122.257</td>\n",
" <td>1340</td>\n",
" <td>5650</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6414100192</td>\n",
" <td>20141209T000000</td>\n",
" <td>538000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2570</td>\n",
" <td>7242</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1951</td>\n",
" <td>1991</td>\n",
" <td>98125</td>\n",
" <td>47.7210</td>\n",
" <td>-122.319</td>\n",
" <td>1690</td>\n",
" <td>7639</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5631500400</td>\n",
" <td>20150225T000000</td>\n",
" <td>180000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>10000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1933</td>\n",
" <td>0</td>\n",
" <td>98028</td>\n",
" <td>47.7379</td>\n",
" <td>-122.233</td>\n",
" <td>2720</td>\n",
" <td>8062</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2487200875</td>\n",
" <td>20141209T000000</td>\n",
" <td>604000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>1960</td>\n",
" <td>5000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1965</td>\n",
" <td>0</td>\n",
" <td>98136</td>\n",
" <td>47.5208</td>\n",
" <td>-122.393</td>\n",
" <td>1360</td>\n",
" <td>5000</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1954400510</td>\n",
" <td>20150218T000000</td>\n",
" <td>510000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1680</td>\n",
" <td>8080</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98074</td>\n",
" <td>47.6168</td>\n",
" <td>-122.045</td>\n",
" <td>1800</td>\n",
" <td>7503</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"\n",
" sqft_lot floors waterfront view ... yr_built yr_renovated zipcode \\\n",
"0 5650 1.0 0 0 ... 1955 0 98178 \n",
"1 7242 2.0 0 0 ... 1951 1991 98125 \n",
"2 10000 1.0 0 0 ... 1933 0 98028 \n",
"3 5000 1.0 0 0 ... 1965 0 98136 \n",
"4 8080 1.0 0 0 ... 1987 0 98074 \n",
"\n",
" lat long sqft_living15 sqft_lot15 price_category median_price \\\n",
"0 47.5112 -122.257 1340 5650 0 0 \n",
"1 47.7210 -122.319 1690 7639 1 1 \n",
"2 47.7379 -122.233 2720 8062 0 0 \n",
"3 47.5208 -122.393 1360 5000 1 1 \n",
"4 47.6168 -122.045 1800 7503 1 1 \n",
"\n",
" average_price \n",
"0 0 \n",
"1 1 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 24 columns]"
]
},
"execution_count": 176,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Вычисление среднего значения поля \"sqft_living\"\n",
"average_price = df['sqft_living'].mean()\n",
"print(f\"Среднее значение поля: {average_price}\")\n",
"\n",
"# Создание новой колонки, указывающей, выше или ниже среднего значение цена закрытия\n",
"df['average_price'] = (df['sqft_living'] > average_price).astype(int)\n",
"\n",
"df.dropna(inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Делим DF на выборки"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" <th>price_category</th>\n",
" <th>median_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6325</th>\n",
" <td>5467910190</td>\n",
" <td>20140527T000000</td>\n",
" <td>325000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1780</td>\n",
" <td>13095</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1983</td>\n",
" <td>0</td>\n",
" <td>98042</td>\n",
" <td>47.3670</td>\n",
" <td>-122.152</td>\n",
" <td>2750</td>\n",
" <td>13095</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13473</th>\n",
" <td>9331800580</td>\n",
" <td>20150310T000000</td>\n",
" <td>257000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>1000</td>\n",
" <td>3700</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>200</td>\n",
" <td>1929</td>\n",
" <td>0</td>\n",
" <td>98118</td>\n",
" <td>47.5520</td>\n",
" <td>-122.290</td>\n",
" <td>1270</td>\n",
" <td>5000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17614</th>\n",
" <td>2407000405</td>\n",
" <td>20150226T000000</td>\n",
" <td>228500.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1080</td>\n",
" <td>7486</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>90</td>\n",
" <td>1942</td>\n",
" <td>0</td>\n",
" <td>98146</td>\n",
" <td>47.4838</td>\n",
" <td>-122.335</td>\n",
" <td>1170</td>\n",
" <td>7800</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16970</th>\n",
" <td>5466700290</td>\n",
" <td>20150108T000000</td>\n",
" <td>288000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2090</td>\n",
" <td>7500</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>810</td>\n",
" <td>1977</td>\n",
" <td>0</td>\n",
" <td>98031</td>\n",
" <td>47.3951</td>\n",
" <td>-122.172</td>\n",
" <td>1800</td>\n",
" <td>7350</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20868</th>\n",
" <td>3026059361</td>\n",
" <td>20150417T000000</td>\n",
" <td>479000.0</td>\n",
" <td>2</td>\n",
" <td>2.50</td>\n",
" <td>1741</td>\n",
" <td>1439</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>295</td>\n",
" <td>2007</td>\n",
" <td>0</td>\n",
" <td>98034</td>\n",
" <td>47.7043</td>\n",
" <td>-122.209</td>\n",
" <td>2090</td>\n",
" <td>10454</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11964</th>\n",
" <td>5272200045</td>\n",
" <td>20141113T000000</td>\n",
" <td>378000.0</td>\n",
" <td>3</td>\n",
" <td>1.50</td>\n",
" <td>1000</td>\n",
" <td>6914</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1947</td>\n",
" <td>0</td>\n",
" <td>98125</td>\n",
" <td>47.7144</td>\n",
" <td>-122.319</td>\n",
" <td>1000</td>\n",
" <td>6947</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21575</th>\n",
" <td>9578500790</td>\n",
" <td>20141111T000000</td>\n",
" <td>399950.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>3087</td>\n",
" <td>5002</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98023</td>\n",
" <td>47.2974</td>\n",
" <td>-122.349</td>\n",
" <td>2927</td>\n",
" <td>5183</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5390</th>\n",
" <td>7202350480</td>\n",
" <td>20140930T000000</td>\n",
" <td>575000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>2120</td>\n",
" <td>4780</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2004</td>\n",
" <td>0</td>\n",
" <td>98053</td>\n",
" <td>47.6810</td>\n",
" <td>-122.032</td>\n",
" <td>1690</td>\n",
" <td>2650</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>860</th>\n",
" <td>1723049033</td>\n",
" <td>20140620T000000</td>\n",
" <td>245000.0</td>\n",
" <td>1</td>\n",
" <td>0.75</td>\n",
" <td>380</td>\n",
" <td>15000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1963</td>\n",
" <td>0</td>\n",
" <td>98168</td>\n",
" <td>47.4810</td>\n",
" <td>-122.323</td>\n",
" <td>1170</td>\n",
" <td>15000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15795</th>\n",
" <td>6147650280</td>\n",
" <td>20150325T000000</td>\n",
" <td>315000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>3130</td>\n",
" <td>5999</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2006</td>\n",
" <td>0</td>\n",
" <td>98042</td>\n",
" <td>47.3837</td>\n",
" <td>-122.099</td>\n",
" <td>3020</td>\n",
" <td>5997</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17290 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms \\\n",
"6325 5467910190 20140527T000000 325000.0 3 1.75 \n",
"13473 9331800580 20150310T000000 257000.0 2 1.00 \n",
"17614 2407000405 20150226T000000 228500.0 3 1.00 \n",
"16970 5466700290 20150108T000000 288000.0 3 2.25 \n",
"20868 3026059361 20150417T000000 479000.0 2 2.50 \n",
"... ... ... ... ... ... \n",
"11964 5272200045 20141113T000000 378000.0 3 1.50 \n",
"21575 9578500790 20141111T000000 399950.0 3 2.50 \n",
"5390 7202350480 20140930T000000 575000.0 3 2.50 \n",
"860 1723049033 20140620T000000 245000.0 1 0.75 \n",
"15795 6147650280 20150325T000000 315000.0 4 2.50 \n",
"\n",
" sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n",
"6325 1780 13095 1.0 0 0 ... 0 \n",
"13473 1000 3700 1.0 0 0 ... 200 \n",
"17614 1080 7486 1.5 0 0 ... 90 \n",
"16970 2090 7500 1.0 0 0 ... 810 \n",
"20868 1741 1439 2.0 0 0 ... 295 \n",
"... ... ... ... ... ... ... ... \n",
"11964 1000 6914 1.0 0 0 ... 0 \n",
"21575 3087 5002 2.0 0 0 ... 0 \n",
"5390 2120 4780 2.0 0 0 ... 0 \n",
"860 380 15000 1.0 0 0 ... 0 \n",
"15795 3130 5999 2.0 0 0 ... 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"6325 1983 0 98042 47.3670 -122.152 2750 \n",
"13473 1929 0 98118 47.5520 -122.290 1270 \n",
"17614 1942 0 98146 47.4838 -122.335 1170 \n",
"16970 1977 0 98031 47.3951 -122.172 1800 \n",
"20868 2007 0 98034 47.7043 -122.209 2090 \n",
"... ... ... ... ... ... ... \n",
"11964 1947 0 98125 47.7144 -122.319 1000 \n",
"21575 2014 0 98023 47.2974 -122.349 2927 \n",
"5390 2004 0 98053 47.6810 -122.032 1690 \n",
"860 1963 0 98168 47.4810 -122.323 1170 \n",
"15795 2006 0 98042 47.3837 -122.099 3020 \n",
"\n",
" sqft_lot15 price_category median_price \n",
"6325 13095 1 0 \n",
"13473 5000 0 0 \n",
"17614 7800 0 0 \n",
"16970 7350 0 0 \n",
"20868 10454 1 1 \n",
"... ... ... ... \n",
"11964 6947 1 0 \n",
"21575 5183 1 0 \n",
"5390 2650 1 1 \n",
"860 15000 0 0 \n",
"15795 5997 1 0 \n",
"\n",
"[17290 rows x 23 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>average_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6325</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13473</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17614</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16970</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20868</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11964</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21575</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5390</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>860</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15795</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17290 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" average_price\n",
"6325 0\n",
"13473 0\n",
"17614 0\n",
"16970 1\n",
"20868 0\n",
"... ...\n",
"11964 0\n",
"21575 1\n",
"5390 1\n",
"860 0\n",
"15795 1\n",
"\n",
"[17290 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
2024-11-15 16:44:46 +04:00
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
2024-11-27 17:22:10 +04:00
" <th>date</th>\n",
2024-11-15 16:44:46 +04:00
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
2024-11-27 17:22:10 +04:00
" <th>waterfront</th>\n",
" <th>view</th>\n",
2024-11-15 17:35:10 +04:00
" <th>...</th>\n",
2024-11-27 17:22:10 +04:00
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
2024-11-15 16:44:46 +04:00
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
2024-11-27 17:22:10 +04:00
" <th>price_category</th>\n",
" <th>median_price</th>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>735</th>\n",
" <td>2591820310</td>\n",
" <td>20141006T000000</td>\n",
" <td>365000.0</td>\n",
" <td>4</td>\n",
" <td>2.25</td>\n",
" <td>2070</td>\n",
" <td>8893</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1986</td>\n",
" <td>0</td>\n",
" <td>98058</td>\n",
" <td>47.4388</td>\n",
" <td>-122.162</td>\n",
" <td>2390</td>\n",
" <td>7700</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>2830</th>\n",
" <td>7974200820</td>\n",
" <td>20140821T000000</td>\n",
" <td>865000.0</td>\n",
" <td>5</td>\n",
" <td>3.00</td>\n",
" <td>2900</td>\n",
" <td>6730</td>\n",
2024-11-15 17:35:10 +04:00
" <td>1.0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1070</td>\n",
" <td>1977</td>\n",
" <td>0</td>\n",
" <td>98115</td>\n",
" <td>47.6784</td>\n",
" <td>-122.285</td>\n",
" <td>2370</td>\n",
" <td>6283</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>4106</th>\n",
" <td>7701450110</td>\n",
" <td>20140815T000000</td>\n",
" <td>1038000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>3770</td>\n",
" <td>10893</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1997</td>\n",
" <td>0</td>\n",
" <td>98006</td>\n",
" <td>47.5646</td>\n",
" <td>-122.129</td>\n",
" <td>3710</td>\n",
" <td>9685</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>16218</th>\n",
" <td>9522300010</td>\n",
" <td>20150331T000000</td>\n",
" <td>1490000.0</td>\n",
" <td>3</td>\n",
" <td>3.50</td>\n",
" <td>4560</td>\n",
" <td>14608</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1990</td>\n",
" <td>0</td>\n",
" <td>98034</td>\n",
" <td>47.6995</td>\n",
" <td>-122.228</td>\n",
" <td>4050</td>\n",
" <td>14226</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>19964</th>\n",
" <td>9510861140</td>\n",
" <td>20140714T000000</td>\n",
" <td>711000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>2550</td>\n",
" <td>5376</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2004</td>\n",
" <td>0</td>\n",
" <td>98052</td>\n",
" <td>47.6647</td>\n",
" <td>-122.083</td>\n",
" <td>2250</td>\n",
" <td>4050</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>13674</th>\n",
" <td>6163900333</td>\n",
" <td>20141110T000000</td>\n",
" <td>338000.0</td>\n",
" <td>3</td>\n",
" <td>1.75</td>\n",
" <td>1250</td>\n",
" <td>7710</td>\n",
2024-11-15 17:35:10 +04:00
" <td>1.0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1947</td>\n",
" <td>0</td>\n",
" <td>98155</td>\n",
" <td>47.7623</td>\n",
" <td>-122.317</td>\n",
" <td>1340</td>\n",
" <td>7710</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>20377</th>\n",
" <td>3528960020</td>\n",
" <td>20140708T000000</td>\n",
" <td>673000.0</td>\n",
" <td>3</td>\n",
" <td>2.75</td>\n",
" <td>2830</td>\n",
" <td>3496</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2012</td>\n",
" <td>0</td>\n",
" <td>98029</td>\n",
" <td>47.5606</td>\n",
" <td>-122.011</td>\n",
" <td>2160</td>\n",
" <td>3501</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>8805</th>\n",
" <td>1687000220</td>\n",
" <td>20141016T000000</td>\n",
" <td>285000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2434</td>\n",
" <td>4400</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2007</td>\n",
" <td>0</td>\n",
" <td>98001</td>\n",
" <td>47.2874</td>\n",
" <td>-122.283</td>\n",
" <td>2434</td>\n",
" <td>4400</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10168</th>\n",
" <td>4141400030</td>\n",
" <td>20141201T000000</td>\n",
" <td>605000.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>2250</td>\n",
" <td>10108</td>\n",
2024-11-15 17:35:10 +04:00
" <td>1.0</td>\n",
2024-11-27 17:22:10 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1967</td>\n",
" <td>0</td>\n",
" <td>98008</td>\n",
" <td>47.5922</td>\n",
" <td>-122.118</td>\n",
" <td>2050</td>\n",
" <td>9750</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2522</th>\n",
" <td>1822500160</td>\n",
" <td>20141212T000000</td>\n",
" <td>356500.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2570</td>\n",
" <td>11473</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98003</td>\n",
" <td>47.2809</td>\n",
" <td>-122.296</td>\n",
" <td>2430</td>\n",
" <td>5997</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4323 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms \\\n",
"735 2591820310 20141006T000000 365000.0 4 2.25 \n",
"2830 7974200820 20140821T000000 865000.0 5 3.00 \n",
"4106 7701450110 20140815T000000 1038000.0 4 2.50 \n",
"16218 9522300010 20150331T000000 1490000.0 3 3.50 \n",
"19964 9510861140 20140714T000000 711000.0 3 2.50 \n",
"... ... ... ... ... ... \n",
"13674 6163900333 20141110T000000 338000.0 3 1.75 \n",
"20377 3528960020 20140708T000000 673000.0 3 2.75 \n",
"8805 1687000220 20141016T000000 285000.0 4 2.50 \n",
"10168 4141400030 20141201T000000 605000.0 4 1.75 \n",
"2522 1822500160 20141212T000000 356500.0 4 2.50 \n",
"\n",
" sqft_living sqft_lot floors waterfront view ... sqft_basement \\\n",
"735 2070 8893 2.0 0 0 ... 0 \n",
"2830 2900 6730 1.0 0 0 ... 1070 \n",
"4106 3770 10893 2.0 0 2 ... 0 \n",
"16218 4560 14608 2.0 0 2 ... 0 \n",
"19964 2550 5376 2.0 0 0 ... 0 \n",
"... ... ... ... ... ... ... ... \n",
"13674 1250 7710 1.0 0 0 ... 0 \n",
"20377 2830 3496 2.0 0 0 ... 0 \n",
"8805 2434 4400 2.0 0 0 ... 0 \n",
"10168 2250 10108 1.0 0 0 ... 0 \n",
"2522 2570 11473 2.0 0 0 ... 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"735 1986 0 98058 47.4388 -122.162 2390 \n",
"2830 1977 0 98115 47.6784 -122.285 2370 \n",
"4106 1997 0 98006 47.5646 -122.129 3710 \n",
"16218 1990 0 98034 47.6995 -122.228 4050 \n",
"19964 2004 0 98052 47.6647 -122.083 2250 \n",
"... ... ... ... ... ... ... \n",
"13674 1947 0 98155 47.7623 -122.317 1340 \n",
"20377 2012 0 98029 47.5606 -122.011 2160 \n",
"8805 2007 0 98001 47.2874 -122.283 2434 \n",
"10168 1967 0 98008 47.5922 -122.118 2050 \n",
"2522 2008 0 98003 47.2809 -122.296 2430 \n",
"\n",
" sqft_lot15 price_category median_price \n",
"735 7700 1 0 \n",
"2830 6283 2 1 \n",
"4106 9685 2 1 \n",
"16218 14226 2 1 \n",
"19964 4050 2 1 \n",
"... ... ... ... \n",
"13674 7710 1 0 \n",
"20377 3501 1 1 \n",
"8805 4400 0 0 \n",
"10168 9750 1 1 \n",
"2522 5997 1 0 \n",
"\n",
"[4323 rows x 23 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>average_price</th>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
2024-11-27 17:22:10 +04:00
" </thead>\n",
" <tbody>\n",
2024-11-15 17:35:10 +04:00
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>735</th>\n",
" <td>0</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>2830</th>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>4106</th>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>16218</th>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>19964</th>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>...</th>\n",
" <td>...</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>13674</th>\n",
" <td>0</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>20377</th>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>8805</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>10168</th>\n",
" <td>1</td>\n",
2024-11-15 17:35:10 +04:00
" </tr>\n",
" <tr>\n",
2024-11-27 17:22:10 +04:00
" <th>2522</th>\n",
" <td>1</td>\n",
2024-11-15 16:44:46 +04:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-11-27 17:22:10 +04:00
"<p>4323 rows × 1 columns</p>\n",
2024-11-15 16:44:46 +04:00
"</div>"
],
"text/plain": [
2024-11-27 17:22:10 +04:00
" average_price\n",
"735 0\n",
"2830 1\n",
"4106 1\n",
"16218 1\n",
"19964 1\n",
"... ...\n",
"13674 0\n",
"20377 1\n",
"8805 1\n",
"10168 1\n",
"2522 1\n",
2024-11-15 16:44:46 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"[4323 rows x 1 columns]"
2024-11-15 16:44:46 +04:00
]
},
"metadata": {},
2024-11-27 17:22:10 +04:00
"output_type": "display_data"
2024-11-15 16:44:23 +04:00
}
],
"source": [
2024-11-27 17:22:10 +04:00
"from typing import Tuple\n",
"from pandas import DataFrame\n",
2024-11-15 16:44:23 +04:00
"from sklearn.model_selection import train_test_split\n",
"\n",
2024-11-27 17:22:10 +04:00
"def split_into_train_test(\n",
" df_input: DataFrame,\n",
" target_colname: str = \"average_price\",\n",
" frac_train: float = 0.8,\n",
" random_state: int = None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:\n",
" \n",
" if not (0 < frac_train < 1):\n",
" raise ValueError(\"Fraction must be between 0 and 1.\")\n",
" \n",
" # Проверка наличия целевого признака\n",
" if target_colname not in df_input.columns:\n",
" raise ValueError(f\"{target_colname} is not a column in the DataFrame.\")\n",
" \n",
" # Разделяем данные на признаки и целевую переменную\n",
" X = df_input.drop(columns=[target_colname]) # Признаки\n",
" y = df_input[[target_colname]] # Целевая переменная\n",
"\n",
" # Разделяем данные на обучающую и тестовую выборки\n",
" X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y,\n",
" test_size=(1.0 - frac_train),\n",
" random_state=random_state\n",
" )\n",
" \n",
" return X_train, X_test, y_train, y_test\n",
"\n",
"X_train, X_test, y_train, y_test = split_into_train_test(\n",
" df, \n",
" target_colname=\"average_price\", \n",
" frac_train=0.8, \n",
" random_state=42\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
"display(\"y_train\", y_train)\n",
"\n",
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Формирование конвейера для решения задачи регрессии"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [],
"source": [
"class HouseFeatures(BaseEstimator, TransformerMixin):\n",
" def __init__(self):\n",
" pass\n",
" def fit(self, X, y=None):\n",
" return self\n",
" def transform(self, X, y=None):\n",
" # Создание новых признаков\n",
" X = X.copy()\n",
" X[\"Square\"] = X[\"sqft_living\"] / X[\"sqft_lot\"]\n",
" return X\n",
" def get_feature_names_out(self, features_in):\n",
" # Добавление имен новых признаков\n",
" new_features = [\"Square\"]\n",
" return np.append(features_in, new_features, axis=0)\n",
2024-11-15 16:44:23 +04:00
"\n",
"# Указываем столбцы, которые нужно удалить и обрабатывать\n",
2024-11-27 17:22:10 +04:00
"columns_to_drop = [\"date\"]\n",
"num_columns = [\"bathrooms\", \"floors\", \"waterfront\", \"view\"]\n",
"cat_columns = [] \n",
2024-11-15 16:44:23 +04:00
"\n",
"# Определяем предобработку для численных данных\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
" [\n",
" (\"imputer\", num_imputer),\n",
" (\"scaler\", num_scaler),\n",
" ]\n",
")\n",
"\n",
"# Определяем предобработку для категориальных данных\n",
2024-11-27 17:22:10 +04:00
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
2024-11-15 16:44:23 +04:00
"preprocessing_cat = Pipeline(\n",
" [\n",
" (\"imputer\", cat_imputer),\n",
" (\"encoder\", cat_encoder),\n",
" ]\n",
")\n",
"\n",
2024-11-27 17:22:10 +04:00
"# Подготовка признаков с использованием ColumnTransformer\n",
2024-11-15 16:44:23 +04:00
"features_preprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
2024-11-27 17:22:10 +04:00
" (\"preprocessing_num\", preprocessing_num, num_columns),\n",
" (\"preprocessing_cat\", preprocessing_cat, cat_columns),\n",
2024-11-15 16:44:23 +04:00
" ],\n",
" remainder=\"passthrough\"\n",
")\n",
"\n",
2024-11-27 17:22:10 +04:00
"# Удаление нежелательных столбцов\n",
2024-11-15 16:44:23 +04:00
"drop_columns = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
2024-11-27 17:22:10 +04:00
"# Постобработка признаков\n",
2024-11-15 16:44:23 +04:00
"features_postprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
2024-11-27 17:22:10 +04:00
" (\"preprocessing_cat\", preprocessing_cat, [\"price_category\"]), \n",
2024-11-15 16:44:23 +04:00
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
2024-11-27 17:22:10 +04:00
"# Создание окончательного конвейера\n",
"pipeline = Pipeline(\n",
2024-11-15 16:44:23 +04:00
" [\n",
" (\"features_preprocessing\", features_preprocessing),\n",
" (\"drop_columns\", drop_columns),\n",
2024-11-27 17:22:10 +04:00
" (\"custom_features\", HouseFeatures()),\n",
" (\"model\", RandomForestRegressor()) # Выбор модели для обучения\n",
2024-11-15 16:44:23 +04:00
" ]\n",
")\n",
2024-11-15 17:35:10 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"# Использование конвейера\n",
"def train_pipeline(X, y):\n",
" pipeline.fit(X, y)"
2024-11-15 17:35:10 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-27 17:22:10 +04:00
"### Определение перечня алгоритмов решения задачи аппроксимации (регрессии)"
2024-11-15 17:35:10 +04:00
]
},
{
"cell_type": "code",
2024-11-27 17:22:10 +04:00
"execution_count": 181,
2024-11-15 17:35:10 +04:00
"metadata": {},
"outputs": [],
"source": [
2024-11-27 17:22:10 +04:00
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n",
2024-11-15 17:35:10 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"random_state = 9\n",
"\n",
"models = {\n",
" \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n",
" \"linear_poly\": {\n",
" \"model\": make_pipeline(\n",
" PolynomialFeatures(degree=2),\n",
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
" )\n",
2024-11-15 17:35:10 +04:00
" },\n",
2024-11-27 17:22:10 +04:00
" \"linear_interact\": {\n",
" \"model\": make_pipeline(\n",
" PolynomialFeatures(interaction_only=True),\n",
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
" )\n",
" },\n",
" \"ridge\": {\"model\": linear_model.RidgeCV()},\n",
" \"decision_tree\": {\n",
" \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)\n",
2024-11-15 17:35:10 +04:00
" },\n",
2024-11-27 17:22:10 +04:00
" \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n",
2024-11-15 17:35:10 +04:00
" \"random_forest\": {\n",
2024-11-27 17:22:10 +04:00
" \"model\": ensemble.RandomForestRegressor(\n",
" max_depth=7, random_state=random_state, n_jobs=-1\n",
2024-11-15 17:35:10 +04:00
" )\n",
" },\n",
" \"mlp\": {\n",
2024-11-27 17:22:10 +04:00
" \"model\": neural_network.MLPRegressor(\n",
" activation=\"tanh\",\n",
" hidden_layer_sizes=(3,),\n",
2024-11-15 17:35:10 +04:00
" max_iter=500,\n",
" early_stopping=True,\n",
" random_state=random_state,\n",
" )\n",
" },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-27 17:22:10 +04:00
"### Обучение и оценка моделей с помощью различных алгоритмов"
2024-11-15 17:35:10 +04:00
]
},
{
"cell_type": "code",
2024-11-27 17:22:10 +04:00
"execution_count": 182,
2024-11-15 17:35:10 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-27 17:22:10 +04:00
"Model: logistic\n",
"MSE (train): 0.24060150375939848\n",
"MSE (test): 0.23455933379597502\n",
"MAE (train): 0.24060150375939848\n",
"MAE (test): 0.23455933379597502\n",
"R2 (train): 0.015780807725750634\n",
"R2 (test): 0.045807954005714024\n",
"STD (train): 0.48387852043102103\n",
"STD (test): 0.4780359236045559\n",
"----------------------------------------\n",
"Model: ridge\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
2024-11-15 17:35:10 +04:00
]
},
{
2024-11-27 17:22:10 +04:00
"name": "stdout",
"output_type": "stream",
"text": [
"MSE (train): 0.210989010989011\n",
"MSE (test): 0.2035623409669211\n",
"MAE (train): 0.210989010989011\n",
"MAE (test): 0.2035623409669211\n",
"R2 (train): 0.1369154775441198\n",
"R2 (test): 0.17190433878207922\n",
"STD (train): 0.45781332911823247\n",
"STD (test): 0.4499815316182845\n",
"----------------------------------------\n",
"Model: decision_tree\n",
"MSE (train): 0.0\n",
"MSE (test): 0.0\n",
"MAE (train): 0.0\n",
"MAE (test): 0.0\n",
"R2 (train): 1.0\n",
"R2 (test): 1.0\n",
"STD (train): 0.0\n",
"STD (test): 0.0\n",
"----------------------------------------\n",
"Model: knn\n",
"MSE (train): 0.1949681897050318\n",
"MSE (test): 0.27989821882951654\n",
"MAE (train): 0.1949681897050318\n",
"MAE (test): 0.27989821882951654\n",
"R2 (train): 0.20245122664507342\n",
"R2 (test): -0.13863153417464114\n",
"STD (train): 0.43948973967967464\n",
"STD (test): 0.5264647910268833\n",
"----------------------------------------\n",
"Model: naive_bayes\n",
"MSE (train): 0.26928860613071137\n",
"MSE (test): 0.2690261392551469\n",
"MAE (train): 0.26928860613071137\n",
"MAE (test): 0.2690261392551469\n",
"R2 (train): -0.10156840366079445\n",
"R2 (test): -0.09440369772322943\n",
"STD (train): 0.47316941542228536\n",
"STD (test): 0.47206502931490235\n",
"----------------------------------------\n",
"Model: gradient_boosting\n",
"MSE (train): 0.0\n",
"MSE (test): 0.0\n",
"MAE (train): 0.0\n",
"MAE (test): 0.0\n",
"R2 (train): 1.0\n",
"R2 (test): 1.0\n",
"STD (train): 0.0\n",
"STD (test): 0.0\n",
"----------------------------------------\n",
"Model: random_forest\n",
"MSE (train): 0.0\n",
"MSE (test): 0.0\n",
"MAE (train): 0.0\n",
"MAE (test): 0.0\n",
"R2 (train): 1.0\n",
"R2 (test): 1.0\n",
"STD (train): 0.0\n",
"STD (test): 0.0\n",
"----------------------------------------\n",
"Model: mlp\n",
"MSE (train): 0.4253903990746096\n",
"MSE (test): 0.4353458246588018\n",
"MAE (train): 0.4253903990746096\n",
"MAE (test): 0.4353458246588018\n",
"R2 (train): -0.7401279228791116\n",
"R2 (test): -0.7709954936501442\n",
"STD (train): 0.4959884986820156\n",
"STD (test): 0.49782384226978177\n",
"----------------------------------------\n"
2024-11-15 17:35:10 +04:00
]
}
],
"source": [
"import numpy as np\n",
"from sklearn import metrics\n",
2024-11-27 17:22:10 +04:00
"from sklearn.pipeline import Pipeline\n",
"\n",
"# Проверка наличия необходимых переменных\n",
"if 'class_models' not in locals():\n",
" raise ValueError(\"class_models is not defined\")\n",
"if 'X_train' not in locals() or 'X_test' not in locals() or 'y_train' not in locals() or 'y_test' not in locals():\n",
" raise ValueError(\"Train/test data is not defined\")\n",
"\n",
2024-11-15 17:35:10 +04:00
"\n",
2024-11-27 17:22:10 +04:00
"y_train = np.ravel(y_train) \n",
"y_test = np.ravel(y_test) \n",
"\n",
"# Инициализация списка для хранения результатов\n",
"results = []\n",
"\n",
"# Проход по моделям и оценка их качества\n",
2024-11-15 17:35:10 +04:00
"for model_name in class_models.keys():\n",
" print(f\"Model: {model_name}\")\n",
2024-11-27 17:22:10 +04:00
" \n",
" # Извлечение модели из словаря\n",
2024-11-15 17:35:10 +04:00
" model = class_models[model_name][\"model\"]\n",
2024-11-27 17:22:10 +04:00
" \n",
" # Создание пайплайна\n",
2024-11-15 17:35:10 +04:00
" model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
2024-11-27 17:22:10 +04:00
" \n",
" # Обучение модели\n",
" model_pipeline.fit(X_train, y_train)\n",
2024-11-15 17:35:10 +04:00
"\n",
2024-11-27 17:22:10 +04:00
" # Предсказание для обучающей и тестовой выборки\n",
2024-11-15 17:35:10 +04:00
" y_train_predict = model_pipeline.predict(X_train)\n",
2024-11-27 17:22:10 +04:00
" y_test_predict = model_pipeline.predict(X_test)\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
" # Сохранение пайплайна и предсказаний\n",
2024-11-15 17:35:10 +04:00
" class_models[model_name][\"pipeline\"] = model_pipeline\n",
" class_models[model_name][\"preds\"] = y_test_predict\n",
2024-11-15 16:44:23 +04:00
"\n",
2024-11-27 17:22:10 +04:00
" # Вычисление метрик для регрессии\n",
" class_models[model_name][\"MSE_train\"] = metrics.mean_squared_error(y_train, y_train_predict)\n",
" class_models[model_name][\"MSE_test\"] = metrics.mean_squared_error(y_test, y_test_predict)\n",
" class_models[model_name][\"MAE_train\"] = metrics.mean_absolute_error(y_train, y_train_predict)\n",
" class_models[model_name][\"MAE_test\"] = metrics.mean_absolute_error(y_test, y_test_predict)\n",
" class_models[model_name][\"R2_train\"] = metrics.r2_score(y_train, y_train_predict)\n",
" class_models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_predict)\n",
"\n",
" # Дополнительные метрики\n",
" class_models[model_name][\"STD_train\"] = np.std(y_train - y_train_predict)\n",
" class_models[model_name][\"STD_test\"] = np.std(y_test - y_test_predict)\n",
"\n",
" # Вывод результатов для текущей модели\n",
" print(f\"MSE (train): {class_models[model_name]['MSE_train']}\")\n",
" print(f\"MSE (test): {class_models[model_name]['MSE_test']}\")\n",
" print(f\"MAE (train): {class_models[model_name]['MAE_train']}\")\n",
" print(f\"MAE (test): {class_models[model_name]['MAE_test']}\")\n",
" print(f\"R2 (train): {class_models[model_name]['R2_train']}\")\n",
" print(f\"R2 (test): {class_models[model_name]['R2_test']}\")\n",
" print(f\"STD (train): {class_models[model_name]['STD_train']}\")\n",
" print(f\"STD (test): {class_models[model_name]['STD_test']}\")\n",
" print(\"-\" * 40) # Разделитель для разных моделей"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Подбор гиперпараметров методом поиска по сетке"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 36 candidates, totalling 180 fits\n",
"Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}\n",
"Best MSE: 0.1476816399382576\n"
]
}
],
"source": [
"df['date'] = pd.to_datetime(df['date'], errors='coerce') # Coerce invalid dates to NaT\n",
"df.dropna(subset=['date'], inplace=True) # Drop rows with invalid dates\n",
"df['year'] = df['date'].dt.year\n",
"df['month'] = df['date'].dt.month\n",
"df['day'] = df['date'].dt.day\n",
"\n",
"X = df[['yr_built', 'year', 'month', 'day', 'price', 'price_category']]\n",
"y = df['average_price']\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"model = RandomForestRegressor()\n",
"param_grid = {\n",
" 'n_estimators': [50, 100, 200],\n",
" 'max_depth': [None, 10, 20, 30],\n",
" 'min_samples_split': [2, 5, 10]\n",
"}\n",
"\n",
"grid_search = GridSearchCV(estimator=model, param_grid=param_grid,\n",
" scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)\n",
"\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Best parameters:\", grid_search.best_params_)\n",
"print(\"Best MSE:\", -grid_search.best_score_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Обучение модели с новыми гиперпараметрами и сравнение новых и старых данных"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 36 candidates, totalling 180 fits\n",
"Старые параметры: {'max_depth': 10, 'min_samples_split': 15, 'n_estimators': 200}\n",
"Лучший результат (MSE) на старых параметрах: 0.1472657852824936\n",
"\n",
"Новые параметры: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}\n",
"Лучший результат (MSE) на новых параметрах: 0.14907357358498077\n",
"Среднеквадратическая ошибка (MSE) на тестовых данных: 0.1443569152033931\n",
"Корень среднеквадратичной ошибки (RMSE) на тестовых данных: 0.37994330524881353\n"
]
}
],
"source": [
"# 1. Настройка параметров для старых значений\n",
"old_param_grid = {\n",
" 'n_estimators': [50, 100, 200], # Количество деревьев\n",
" 'max_depth': [None, 10, 20, 30], # Максимальная глубина дерева\n",
" 'min_samples_split': [2, 10, 15] # Минимальное количество образцов для разбиения узла\n",
"}\n",
"\n",
"# Подбор гиперпараметров с помощью Grid Search для старых параметров\n",
"old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
" param_grid=old_param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)\n",
"\n",
"# Обучение модели на тренировочных данных\n",
"old_grid_search.fit(X_train, y_train)\n",
"\n",
"# 2. Результаты подбора для старых параметров\n",
"old_best_params = old_grid_search.best_params_\n",
"old_best_mse = -old_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
"\n",
"# 3. Настройка параметров для новых значений\n",
"new_param_grid = {\n",
" 'n_estimators': [200],\n",
" 'max_depth': [10],\n",
" 'min_samples_split': [10]\n",
"}\n",
"\n",
"# Подбор гиперпараметров с помощью Grid Search для новых параметров\n",
"new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
" param_grid=new_param_grid, scoring='neg_mean_squared_error', cv=2)\n",
"\n",
"# Обучение модели на тренировочных данных\n",
"new_grid_search.fit(X_train, y_train)\n",
"\n",
"# 4. Результаты подбора для новых параметров\n",
"new_best_params = new_grid_search.best_params_\n",
"new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
"\n",
"# 5. Обучение модели с лучшими параметрами для новых значений\n",
"model_best = RandomForestRegressor(**new_best_params)\n",
"model_best.fit(X_train, y_train)\n",
"\n",
"# Прогнозирование на тестовой выборке\n",
"y_pred = model_best.predict(X_test)\n",
"\n",
"# Оценка производительности модели\n",
"mse = metrics.mean_squared_error(y_test, y_pred)\n",
"rmse = np.sqrt(mse)\n",
"\n",
"# Вывод результатов\n",
"print(\"Старые параметры:\", old_best_params)\n",
"print(\"Лучший результат (MSE) на старых параметрах:\", old_best_mse)\n",
"print(\"\\nН о вые параметры:\", new_best_params)\n",
"print(\"Лучший результат (MSE) на новых параметрах:\", new_best_mse)\n",
"print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n",
"print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Посмотрев на результат, можно сказать, что старая модель имеет меньшую среднеквадратичную ошибку, следовательно она оказалась лучше модели с новыми настройками.\n",
"Т .к. старые параметры дали наилучший результат, можно сказать, что модель способна выдать высокую точность при настройке гиперпараметров. Попытка с новыми параметрами позволила оценить, как модель реагирует на изменения параметров."
2024-11-15 16:44:23 +04:00
]
2024-11-27 17:22:10 +04:00
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
2024-11-15 16:44:23 +04:00
}
],
"metadata": {
"kernelspec": {
"display_name": "kernel",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}