4257 lines
308 KiB
Plaintext
Raw Normal View History

2024-11-16 00:11:46 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Загрузка набора данных"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7129300520</th>\n",
" <td>20141013T000000</td>\n",
" <td>221900.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1180</td>\n",
" <td>5650</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1180</td>\n",
" <td>0</td>\n",
" <td>1955</td>\n",
" <td>0</td>\n",
" <td>98178</td>\n",
" <td>47.5112</td>\n",
" <td>-122.257</td>\n",
" <td>1340</td>\n",
" <td>5650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6414100192</th>\n",
" <td>20141209T000000</td>\n",
" <td>538000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2570</td>\n",
" <td>7242</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>2170</td>\n",
" <td>400</td>\n",
" <td>1951</td>\n",
" <td>1991</td>\n",
" <td>98125</td>\n",
" <td>47.7210</td>\n",
" <td>-122.319</td>\n",
" <td>1690</td>\n",
" <td>7639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5631500400</th>\n",
" <td>20150225T000000</td>\n",
" <td>180000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>10000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>770</td>\n",
" <td>0</td>\n",
" <td>1933</td>\n",
" <td>0</td>\n",
" <td>98028</td>\n",
" <td>47.7379</td>\n",
" <td>-122.233</td>\n",
" <td>2720</td>\n",
" <td>8062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2487200875</th>\n",
" <td>20141209T000000</td>\n",
" <td>604000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>1960</td>\n",
" <td>5000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>1050</td>\n",
" <td>910</td>\n",
" <td>1965</td>\n",
" <td>0</td>\n",
" <td>98136</td>\n",
" <td>47.5208</td>\n",
" <td>-122.393</td>\n",
" <td>1360</td>\n",
" <td>5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1954400510</th>\n",
" <td>20150218T000000</td>\n",
" <td>510000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1680</td>\n",
" <td>8080</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98074</td>\n",
" <td>47.6168</td>\n",
" <td>-122.045</td>\n",
" <td>1800</td>\n",
" <td>7503</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263000018</th>\n",
" <td>20140521T000000</td>\n",
" <td>360000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1530</td>\n",
" <td>1131</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>1530</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98103</td>\n",
" <td>47.6993</td>\n",
" <td>-122.346</td>\n",
" <td>1530</td>\n",
" <td>1509</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6600060120</th>\n",
" <td>20150223T000000</td>\n",
" <td>400000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2310</td>\n",
" <td>5813</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>2310</td>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" <td>98146</td>\n",
" <td>47.5107</td>\n",
" <td>-122.362</td>\n",
" <td>1830</td>\n",
" <td>7200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1523300141</th>\n",
" <td>20140623T000000</td>\n",
" <td>402101.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1350</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1020</td>\n",
" <td>0</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5944</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>291310100</th>\n",
" <td>20150116T000000</td>\n",
" <td>400000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>1600</td>\n",
" <td>2388</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>1600</td>\n",
" <td>0</td>\n",
" <td>2004</td>\n",
" <td>0</td>\n",
" <td>98027</td>\n",
" <td>47.5345</td>\n",
" <td>-122.069</td>\n",
" <td>1410</td>\n",
" <td>1287</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1523300157</th>\n",
" <td>20141015T000000</td>\n",
" <td>325000.0</td>\n",
" <td>2</td>\n",
" <td>0.75</td>\n",
" <td>1020</td>\n",
" <td>1076</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1020</td>\n",
" <td>0</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98144</td>\n",
" <td>47.5941</td>\n",
" <td>-122.299</td>\n",
" <td>1020</td>\n",
" <td>1357</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>21613 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living \\\n",
"id \n",
"7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"... ... ... ... ... ... \n",
"263000018 20140521T000000 360000.0 3 2.50 1530 \n",
"6600060120 20150223T000000 400000.0 4 2.50 2310 \n",
"1523300141 20140623T000000 402101.0 2 0.75 1020 \n",
"291310100 20150116T000000 400000.0 3 2.50 1600 \n",
"1523300157 20141015T000000 325000.0 2 0.75 1020 \n",
"\n",
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
"id \n",
"7129300520 5650 1.0 0 0 3 7 1180 \n",
"6414100192 7242 2.0 0 0 3 7 2170 \n",
"5631500400 10000 1.0 0 0 3 6 770 \n",
"2487200875 5000 1.0 0 0 5 7 1050 \n",
"1954400510 8080 1.0 0 0 3 8 1680 \n",
"... ... ... ... ... ... ... ... \n",
"263000018 1131 3.0 0 0 3 8 1530 \n",
"6600060120 5813 2.0 0 0 3 8 2310 \n",
"1523300141 1350 2.0 0 0 3 7 1020 \n",
"291310100 2388 2.0 0 0 3 8 1600 \n",
"1523300157 1076 2.0 0 0 3 7 1020 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"id \n",
"7129300520 0 1955 0 98178 47.5112 -122.257 \n",
"6414100192 400 1951 1991 98125 47.7210 -122.319 \n",
"5631500400 0 1933 0 98028 47.7379 -122.233 \n",
"2487200875 910 1965 0 98136 47.5208 -122.393 \n",
"1954400510 0 1987 0 98074 47.6168 -122.045 \n",
"... ... ... ... ... ... ... \n",
"263000018 0 2009 0 98103 47.6993 -122.346 \n",
"6600060120 0 2014 0 98146 47.5107 -122.362 \n",
"1523300141 0 2009 0 98144 47.5944 -122.299 \n",
"291310100 0 2004 0 98027 47.5345 -122.069 \n",
"1523300157 0 2008 0 98144 47.5941 -122.299 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"id \n",
"7129300520 1340 5650 \n",
"6414100192 1690 7639 \n",
"5631500400 2720 8062 \n",
"2487200875 1360 5000 \n",
"1954400510 1800 7503 \n",
"... ... ... \n",
"263000018 1530 1509 \n",
"6600060120 1830 7200 \n",
"1523300141 1020 2007 \n",
"291310100 1410 1287 \n",
"1523300157 1020 1357 \n",
"\n",
"[21613 rows x 20 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from sklearn import set_config\n",
"\n",
"# Have scikit-learn transformers return pandas DataFrames; the later\n",
"# ColumnTransformer steps of this notebook select columns by name.\n",
"set_config(transform_output=\"pandas\")\n",
"\n",
"# Seed passed to the randomised steps below (e.g. the train/test split).\n",
"random_state = 9\n",
"\n",
"# Load the house-sales table; the `id` column becomes the row index.\n",
"df = pd.read_csv(\n",
"    \"data/kc_house_data.csv\",\n",
"    index_col=\"id\",\n",
")\n",
"\n",
"# A bare final expression renders the frame as the cell output.\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации\n",
"\n",
"Целевой признак -- waterfront"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3046200125</th>\n",
" <td>20150406T000000</td>\n",
" <td>202000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>740</td>\n",
" <td>6550</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>740</td>\n",
" <td>0</td>\n",
" <td>1946</td>\n",
" <td>0</td>\n",
" <td>98168</td>\n",
" <td>47.4807</td>\n",
" <td>-122.332</td>\n",
" <td>1080</td>\n",
" <td>8515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1853000030</th>\n",
" <td>20150416T000000</td>\n",
" <td>775000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>3550</td>\n",
" <td>32807</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>3550</td>\n",
" <td>0</td>\n",
" <td>1989</td>\n",
" <td>0</td>\n",
" <td>98077</td>\n",
" <td>47.7292</td>\n",
" <td>-122.082</td>\n",
" <td>3270</td>\n",
" <td>35001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1825079005</th>\n",
" <td>20140609T000000</td>\n",
" <td>739000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2800</td>\n",
" <td>246114</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>2800</td>\n",
" <td>0</td>\n",
" <td>1999</td>\n",
" <td>0</td>\n",
" <td>98014</td>\n",
" <td>47.6586</td>\n",
" <td>-121.962</td>\n",
" <td>2750</td>\n",
" <td>60351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2523039315</th>\n",
" <td>20141022T000000</td>\n",
" <td>481000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>2580</td>\n",
" <td>15653</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>2580</td>\n",
" <td>0</td>\n",
" <td>1990</td>\n",
" <td>0</td>\n",
" <td>98166</td>\n",
" <td>47.4561</td>\n",
" <td>-122.361</td>\n",
" <td>1920</td>\n",
" <td>9840</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6623400246</th>\n",
" <td>20140523T000000</td>\n",
" <td>200000.0</td>\n",
" <td>4</td>\n",
" <td>1.00</td>\n",
" <td>1350</td>\n",
" <td>11507</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1350</td>\n",
" <td>0</td>\n",
" <td>1966</td>\n",
" <td>0</td>\n",
" <td>98055</td>\n",
" <td>47.4269</td>\n",
" <td>-122.197</td>\n",
" <td>1320</td>\n",
" <td>25675</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2523069134</th>\n",
" <td>20150406T000000</td>\n",
" <td>495000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2480</td>\n",
" <td>91911</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>1470</td>\n",
" <td>1010</td>\n",
" <td>1973</td>\n",
" <td>0</td>\n",
" <td>98027</td>\n",
" <td>47.4579</td>\n",
" <td>-121.981</td>\n",
" <td>2540</td>\n",
" <td>91911</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1931300412</th>\n",
" <td>20150416T000000</td>\n",
" <td>475000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>1190</td>\n",
" <td>1200</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>1190</td>\n",
" <td>0</td>\n",
" <td>2008</td>\n",
" <td>0</td>\n",
" <td>98103</td>\n",
" <td>47.6542</td>\n",
" <td>-122.346</td>\n",
" <td>1180</td>\n",
" <td>1224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4331000400</th>\n",
" <td>20150220T000000</td>\n",
" <td>252000.0</td>\n",
" <td>3</td>\n",
" <td>1.50</td>\n",
" <td>1150</td>\n",
" <td>13200</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1150</td>\n",
" <td>0</td>\n",
" <td>1956</td>\n",
" <td>0</td>\n",
" <td>98166</td>\n",
" <td>47.4752</td>\n",
" <td>-122.345</td>\n",
" <td>1220</td>\n",
" <td>13066</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9212900180</th>\n",
" <td>20140625T000000</td>\n",
" <td>760000.0</td>\n",
" <td>4</td>\n",
" <td>2.50</td>\n",
" <td>2760</td>\n",
" <td>6000</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>2230</td>\n",
" <td>530</td>\n",
" <td>1942</td>\n",
" <td>0</td>\n",
" <td>98115</td>\n",
" <td>47.6877</td>\n",
" <td>-122.295</td>\n",
" <td>1600</td>\n",
" <td>6000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7000100775</th>\n",
" <td>20140721T000000</td>\n",
" <td>625000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1730</td>\n",
" <td>12219</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>1730</td>\n",
" <td>0</td>\n",
" <td>1986</td>\n",
" <td>0</td>\n",
" <td>98004</td>\n",
" <td>47.5825</td>\n",
" <td>-122.189</td>\n",
" <td>2470</td>\n",
" <td>13594</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17290 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living \\\n",
"id \n",
"3046200125 20150406T000000 202000.0 2 1.00 740 \n",
"1853000030 20150416T000000 775000.0 3 2.50 3550 \n",
"1825079005 20140609T000000 739000.0 4 2.50 2800 \n",
"2523039315 20141022T000000 481000.0 3 2.00 2580 \n",
"6623400246 20140523T000000 200000.0 4 1.00 1350 \n",
"... ... ... ... ... ... \n",
"2523069134 20150406T000000 495000.0 4 2.50 2480 \n",
"1931300412 20150416T000000 475000.0 3 2.25 1190 \n",
"4331000400 20150220T000000 252000.0 3 1.50 1150 \n",
"9212900180 20140625T000000 760000.0 4 2.50 2760 \n",
"7000100775 20140721T000000 625000.0 3 2.00 1730 \n",
"\n",
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
"id \n",
"3046200125 6550 1.0 0 0 4 5 740 \n",
"1853000030 32807 2.0 0 0 3 9 3550 \n",
"1825079005 246114 2.0 0 0 3 9 2800 \n",
"2523039315 15653 1.5 0 0 3 9 2580 \n",
"6623400246 11507 1.0 0 0 3 7 1350 \n",
"... ... ... ... ... ... ... ... \n",
"2523069134 91911 1.0 0 2 4 7 1470 \n",
"1931300412 1200 3.0 0 0 3 8 1190 \n",
"4331000400 13200 1.0 0 0 3 7 1150 \n",
"9212900180 6000 2.0 0 0 5 7 2230 \n",
"7000100775 12219 1.0 0 0 4 7 1730 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"id \n",
"3046200125 0 1946 0 98168 47.4807 -122.332 \n",
"1853000030 0 1989 0 98077 47.7292 -122.082 \n",
"1825079005 0 1999 0 98014 47.6586 -121.962 \n",
"2523039315 0 1990 0 98166 47.4561 -122.361 \n",
"6623400246 0 1966 0 98055 47.4269 -122.197 \n",
"... ... ... ... ... ... ... \n",
"2523069134 1010 1973 0 98027 47.4579 -121.981 \n",
"1931300412 0 2008 0 98103 47.6542 -122.346 \n",
"4331000400 0 1956 0 98166 47.4752 -122.345 \n",
"9212900180 530 1942 0 98115 47.6877 -122.295 \n",
"7000100775 0 1986 0 98004 47.5825 -122.189 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"id \n",
"3046200125 1080 8515 \n",
"1853000030 3270 35001 \n",
"1825079005 2750 60351 \n",
"2523039315 1920 9840 \n",
"6623400246 1320 25675 \n",
"... ... ... \n",
"2523069134 2540 91911 \n",
"1931300412 1180 1224 \n",
"4331000400 1220 13066 \n",
"9212900180 1600 6000 \n",
"7000100775 2470 13594 \n",
"\n",
"[17290 rows x 20 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>waterfront</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3046200125</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1853000030</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1825079005</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2523039315</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6623400246</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2523069134</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1931300412</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4331000400</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9212900180</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7000100775</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17290 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" waterfront\n",
"id \n",
"3046200125 0\n",
"1853000030 0\n",
"1825079005 0\n",
"2523039315 0\n",
"6623400246 0\n",
"... ...\n",
"2523069134 0\n",
"1931300412 0\n",
"4331000400 0\n",
"9212900180 0\n",
"7000100775 0\n",
"\n",
"[17290 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1775950100</th>\n",
" <td>20150113T000000</td>\n",
" <td>357823.0</td>\n",
" <td>3</td>\n",
" <td>1.50</td>\n",
" <td>1240</td>\n",
" <td>9196</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>1240</td>\n",
" <td>0</td>\n",
" <td>1968</td>\n",
" <td>0</td>\n",
" <td>98072</td>\n",
" <td>47.7562</td>\n",
" <td>-122.094</td>\n",
" <td>1690</td>\n",
" <td>10800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3550800040</th>\n",
" <td>20141114T000000</td>\n",
" <td>223000.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>940</td>\n",
" <td>7980</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>940</td>\n",
" <td>0</td>\n",
" <td>1961</td>\n",
" <td>0</td>\n",
" <td>98146</td>\n",
" <td>47.5107</td>\n",
" <td>-122.345</td>\n",
" <td>1050</td>\n",
" <td>7980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1454600256</th>\n",
" <td>20141013T000000</td>\n",
" <td>710000.0</td>\n",
" <td>5</td>\n",
" <td>2.50</td>\n",
" <td>2570</td>\n",
" <td>9600</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>1620</td>\n",
" <td>950</td>\n",
" <td>1956</td>\n",
" <td>0</td>\n",
" <td>98125</td>\n",
" <td>47.7216</td>\n",
" <td>-122.282</td>\n",
" <td>2680</td>\n",
" <td>9900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467400095</th>\n",
" <td>20150224T000000</td>\n",
" <td>545000.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>2040</td>\n",
" <td>53578</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>1160</td>\n",
" <td>880</td>\n",
" <td>1959</td>\n",
" <td>0</td>\n",
" <td>98038</td>\n",
" <td>47.3844</td>\n",
" <td>-122.000</td>\n",
" <td>2040</td>\n",
" <td>53578</td>\n",
" </tr>\n",
" <tr>\n",
" <th>624069003</th>\n",
" <td>20150102T000000</td>\n",
" <td>829000.0</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>2970</td>\n",
" <td>59677</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" <td>1610</td>\n",
" <td>1360</td>\n",
" <td>1973</td>\n",
" <td>0</td>\n",
" <td>98075</td>\n",
" <td>47.5953</td>\n",
" <td>-122.080</td>\n",
" <td>2930</td>\n",
" <td>42489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3500100189</th>\n",
" <td>20140630T000000</td>\n",
" <td>300000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>960</td>\n",
" <td>8153</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>960</td>\n",
" <td>0</td>\n",
" <td>1947</td>\n",
" <td>0</td>\n",
" <td>98155</td>\n",
" <td>47.7341</td>\n",
" <td>-122.300</td>\n",
" <td>1160</td>\n",
" <td>8199</td>\n",
" </tr>\n",
" <tr>\n",
" <th>952001495</th>\n",
" <td>20150306T000000</td>\n",
" <td>588000.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>2170</td>\n",
" <td>5750</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1370</td>\n",
" <td>800</td>\n",
" <td>1975</td>\n",
" <td>0</td>\n",
" <td>98116</td>\n",
" <td>47.5668</td>\n",
" <td>-122.383</td>\n",
" <td>1450</td>\n",
" <td>5750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6072300800</th>\n",
" <td>20150505T000000</td>\n",
" <td>595000.0</td>\n",
" <td>4</td>\n",
" <td>1.75</td>\n",
" <td>2510</td>\n",
" <td>8989</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" <td>1680</td>\n",
" <td>830</td>\n",
" <td>1964</td>\n",
" <td>0</td>\n",
" <td>98006</td>\n",
" <td>47.5569</td>\n",
" <td>-122.172</td>\n",
" <td>2510</td>\n",
" <td>8931</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2944010240</th>\n",
" <td>20140908T000000</td>\n",
" <td>988000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>4040</td>\n",
" <td>19700</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>11</td>\n",
" <td>4040</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98052</td>\n",
" <td>47.7205</td>\n",
" <td>-122.127</td>\n",
" <td>3930</td>\n",
" <td>21887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7893802670</th>\n",
" <td>20150424T000000</td>\n",
" <td>279900.0</td>\n",
" <td>3</td>\n",
" <td>3.25</td>\n",
" <td>2240</td>\n",
" <td>5000</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>1540</td>\n",
" <td>700</td>\n",
" <td>1989</td>\n",
" <td>0</td>\n",
" <td>98198</td>\n",
" <td>47.4114</td>\n",
" <td>-122.334</td>\n",
" <td>1800</td>\n",
" <td>7500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4323 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living \\\n",
"id \n",
"1775950100 20150113T000000 357823.0 3 1.50 1240 \n",
"3550800040 20141114T000000 223000.0 3 1.00 940 \n",
"1454600256 20141013T000000 710000.0 5 2.50 2570 \n",
"1467400095 20150224T000000 545000.0 4 1.75 2040 \n",
"624069003 20150102T000000 829000.0 4 2.75 2970 \n",
"... ... ... ... ... ... \n",
"3500100189 20140630T000000 300000.0 2 1.00 960 \n",
"952001495 20150306T000000 588000.0 4 1.75 2170 \n",
"6072300800 20150505T000000 595000.0 4 1.75 2510 \n",
"2944010240 20140908T000000 988000.0 4 3.00 4040 \n",
"7893802670 20150424T000000 279900.0 3 3.25 2240 \n",
"\n",
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
"id \n",
"1775950100 9196 1.0 0 0 3 8 1240 \n",
"3550800040 7980 1.0 0 0 3 6 940 \n",
"1454600256 9600 1.0 0 2 3 8 1620 \n",
"1467400095 53578 1.0 0 0 5 7 1160 \n",
"624069003 59677 1.0 0 2 4 8 1610 \n",
"... ... ... ... ... ... ... ... \n",
"3500100189 8153 1.0 0 0 3 6 960 \n",
"952001495 5750 1.0 0 2 3 7 1370 \n",
"6072300800 8989 1.0 0 0 4 8 1680 \n",
"2944010240 19700 2.0 0 0 3 11 4040 \n",
"7893802670 5000 2.0 0 0 3 9 1540 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"id \n",
"1775950100 0 1968 0 98072 47.7562 -122.094 \n",
"3550800040 0 1961 0 98146 47.5107 -122.345 \n",
"1454600256 950 1956 0 98125 47.7216 -122.282 \n",
"1467400095 880 1959 0 98038 47.3844 -122.000 \n",
"624069003 1360 1973 0 98075 47.5953 -122.080 \n",
"... ... ... ... ... ... ... \n",
"3500100189 0 1947 0 98155 47.7341 -122.300 \n",
"952001495 800 1975 0 98116 47.5668 -122.383 \n",
"6072300800 830 1964 0 98006 47.5569 -122.172 \n",
"2944010240 0 1987 0 98052 47.7205 -122.127 \n",
"7893802670 700 1989 0 98198 47.4114 -122.334 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"id \n",
"1775950100 1690 10800 \n",
"3550800040 1050 7980 \n",
"1454600256 2680 9900 \n",
"1467400095 2040 53578 \n",
"624069003 2930 42489 \n",
"... ... ... \n",
"3500100189 1160 8199 \n",
"952001495 1450 5750 \n",
"6072300800 2510 8931 \n",
"2944010240 3930 21887 \n",
"7893802670 1800 7500 \n",
"\n",
"[4323 rows x 20 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>waterfront</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1775950100</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3550800040</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1454600256</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467400095</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>624069003</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3500100189</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>952001495</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6072300800</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2944010240</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7893802670</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4323 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" waterfront\n",
"id \n",
"1775950100 0\n",
"3550800040 0\n",
"1454600256 0\n",
"1467400095 0\n",
"624069003 0\n",
"... ...\n",
"3500100189 0\n",
"952001495 0\n",
"6072300800 0\n",
"2944010240 0\n",
"7893802670 0\n",
"\n",
"[4323 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from utils import split_stratified_into_train_val_test\n",
"\n",
"# 80/20 train/test split keyed on the target column `waterfront` (stratified,\n",
"# going by the helper's name -- its implementation lives in utils.py).\n",
"# No validation share is requested (frac_val=0); X_val / y_val are unpacked only\n",
"# to match the six values the helper returns.\n",
"(X_train, X_val, X_test, y_train, y_val, y_test) = split_stratified_into_train_val_test(\n",
"    df,\n",
"    stratify_colname=\"waterfront\",\n",
"    frac_train=0.80,\n",
"    frac_val=0,\n",
"    frac_test=0.20,\n",
"    random_state=random_state,\n",
")\n",
"\n",
"# NOTE: the X_* frames still contain the `waterfront` column (see the output);\n",
"# the `drop_columns` transformer defined further down is configured to remove it.\n",
"# display() renders its arguments one after another: a label, then the frame.\n",
"display(\"X_train\", X_train, \"y_train\", y_train)\n",
"\n",
"display(\"X_test\", X_test, \"y_test\", y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Формирование конвейера для классификации данных\n",
"\n",
"preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
"\n",
"preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных значений и унитарное кодирование\n",
"\n",
"features_preprocessing -- трансформер для предобработки признаков\n",
"\n",
"features_engineering -- трансформер для конструирования признаков\n",
"\n",
"drop_columns -- трансформер для удаления колонок\n",
"\n",
"features_postprocessing -- трансформер для унитарного кодирования новых признаков\n",
"\n",
"pipeline_end -- основной конвейер предобработки данных и конструирования признаков\n",
"\n",
"Конвейер выполняется последовательно.\n",
"\n",
"Трансформер выполняется параллельно для указанного набора колонок.\n",
"\n",
"Документация: \n",
"\n",
"https://scikit-learn.org/1.5/api/sklearn.pipeline.html\n",
"\n",
"https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"# StandardScaler belongs to the public sklearn.preprocessing API. Importing it\n",
"# from sklearn.discriminant_analysis only works because that module happens to\n",
"# import it for internal use.\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"from custom_transformers import HouseFeatures\n",
"\n",
"\n",
"# Columns kept out of the generic numeric/categorical preprocessing and listed\n",
"# for removal by `drop_columns`: the target (`waterfront`) and the two raw\n",
"# columns that HouseFeatures receives (`yr_built`, `zipcode`).\n",
"columns_to_drop = [\"waterfront\", \"yr_built\", \"zipcode\"]\n",
"\n",
"# Remaining columns are split by dtype: pandas `object` columns are treated as\n",
"# categorical, everything else as numeric.\n",
"num_columns = [\n",
"    column\n",
"    for column in df.columns\n",
"    if column not in columns_to_drop and df[column].dtype != \"object\"\n",
"]\n",
"cat_columns = [\n",
"    column\n",
"    for column in df.columns\n",
"    if column not in columns_to_drop and df[column].dtype == \"object\"\n",
"]\n",
"\n",
"# Numeric branch: fill gaps with the column median, then standardise.\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
"    [\n",
"        (\"imputer\", num_imputer),\n",
"        (\"scaler\", num_scaler),\n",
"    ]\n",
")\n",
"\n",
"# Categorical branch: fill gaps with the constant -1, then one-hot encode.\n",
"# NOTE(review): -1 is a numeric placeholder; if a string column in cat_columns\n",
"# ever contains missing values the encoder would presumably receive mixed\n",
"# str/int categories -- confirm this is intended.\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=-1)\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
"preprocessing_cat = Pipeline(\n",
"    [\n",
"        (\"imputer\", cat_imputer),\n",
"        (\"encoder\", cat_encoder),\n",
"    ]\n",
")\n",
"\n",
"# Step 1 -- preprocess the existing features. `yr_built` and `zipcode` are only\n",
"# imputed (not scaled or encoded) because HouseFeatures is applied to them next.\n",
"# Columns not listed here (the target) pass through unchanged.\n",
"features_preprocessing = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"prepocessing_num\", preprocessing_num, num_columns),\n",
"        (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
"        (\"prepocessing_features\", cat_imputer, [\"yr_built\", \"zipcode\"]),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"# Step 2 -- build new features from `yr_built` and `zipcode` with the project's\n",
"# HouseFeatures transformer; all other columns pass through.\n",
"features_engineering = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"add_features\", HouseFeatures(), [\"yr_built\", \"zipcode\"]),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"# Step 3 -- remove the columns listed in columns_to_drop.\n",
"drop_columns = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"drop_columns\", \"drop\", columns_to_drop),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"# Step 4 -- one-hot encode the `Region` column (expected to be produced by the\n",
"# feature-engineering step -- TODO confirm against custom_transformers.py).\n",
"features_postprocessing = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"prepocessing_cat\", preprocessing_cat, [\"Region\"]),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"pipeline_end = Pipeline(\n",
" [\n",
" (\"features_preprocessing\", features_preprocessing),\n",
" (\"features_engineering\", features_engineering),\n",
" (\"drop_columns\", drop_columns),\n",
" (\"features_postprocessing\", features_postprocessing),\n",
" ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Демонстрация работы конвейера для предобработки данных при классификации"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Region_north</th>\n",
" <th>House_age</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>...</th>\n",
" <th>date_20150506T000000</th>\n",
" <th>date_20150507T000000</th>\n",
" <th>date_20150508T000000</th>\n",
" <th>date_20150509T000000</th>\n",
" <th>date_20150510T000000</th>\n",
" <th>date_20150511T000000</th>\n",
" <th>date_20150512T000000</th>\n",
" <th>date_20150513T000000</th>\n",
" <th>date_20150514T000000</th>\n",
" <th>date_20150515T000000</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3046200125</th>\n",
" <td>0.0</td>\n",
" <td>78</td>\n",
" <td>-0.945119</td>\n",
" <td>-1.468373</td>\n",
" <td>-1.448400</td>\n",
" <td>-1.462069</td>\n",
" <td>-0.205788</td>\n",
" <td>-0.918509</td>\n",
" <td>-0.305883</td>\n",
" <td>0.909775</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1853000030</th>\n",
" <td>1.0</td>\n",
" <td>35</td>\n",
" <td>0.667867</td>\n",
" <td>-0.393286</td>\n",
" <td>0.503345</td>\n",
" <td>1.605653</td>\n",
" <td>0.405288</td>\n",
" <td>0.935992</td>\n",
" <td>-0.305883</td>\n",
" <td>-0.628763</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1825079005</th>\n",
" <td>1.0</td>\n",
" <td>25</td>\n",
" <td>0.566528</td>\n",
" <td>0.681800</td>\n",
" <td>0.503345</td>\n",
" <td>0.786866</td>\n",
" <td>5.369556</td>\n",
" <td>0.935992</td>\n",
" <td>-0.305883</td>\n",
" <td>-0.628763</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2523039315</th>\n",
" <td>0.0</td>\n",
" <td>34</td>\n",
" <td>-0.159739</td>\n",
" <td>-0.393286</td>\n",
" <td>-0.147237</td>\n",
" <td>0.546688</td>\n",
" <td>0.006065</td>\n",
" <td>0.008742</td>\n",
" <td>-0.305883</td>\n",
" <td>-0.628763</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6623400246</th>\n",
" <td>1.0</td>\n",
" <td>58</td>\n",
" <td>-0.950749</td>\n",
" <td>0.681800</td>\n",
" <td>-1.448400</td>\n",
" <td>-0.796122</td>\n",
" <td>-0.090424</td>\n",
" <td>-0.918509</td>\n",
" <td>-0.305883</td>\n",
" <td>-0.628763</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2523069134</th>\n",
" <td>1.0</td>\n",
" <td>51</td>\n",
" <td>-0.120329</td>\n",
" <td>0.681800</td>\n",
" <td>0.503345</td>\n",
" <td>0.437517</td>\n",
" <td>1.780808</td>\n",
" <td>-0.918509</td>\n",
" <td>2.308411</td>\n",
" <td>0.909775</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1931300412</th>\n",
" <td>1.0</td>\n",
" <td>16</td>\n",
" <td>-0.176628</td>\n",
" <td>-0.393286</td>\n",
" <td>0.178054</td>\n",
" <td>-0.970797</td>\n",
" <td>-0.330298</td>\n",
" <td>2.790494</td>\n",
" <td>-0.305883</td>\n",
" <td>-0.628763</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4331000400</th>\n",
" <td>0.0</td>\n",
" <td>68</td>\n",
" <td>-0.804370</td>\n",
" <td>-0.393286</td>\n",
" <td>-0.797819</td>\n",
" <td>-1.014465</td>\n",
" <td>-0.051023</td>\n",
" <td>-0.918509</td>\n",
" <td>-0.305883</td>\n",
" <td>-0.628763</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9212900180</th>\n",
" <td>1.0</td>\n",
" <td>82</td>\n",
" <td>0.625642</td>\n",
" <td>0.681800</td>\n",
" <td>0.503345</td>\n",
" <td>0.743197</td>\n",
" <td>-0.218588</td>\n",
" <td>0.935992</td>\n",
" <td>-0.305883</td>\n",
" <td>2.448313</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7000100775</th>\n",
" <td>1.0</td>\n",
" <td>38</td>\n",
" <td>0.245619</td>\n",
" <td>-0.393286</td>\n",
" <td>-0.147237</td>\n",
" <td>-0.381270</td>\n",
" <td>-0.073854</td>\n",
" <td>-0.918509</td>\n",
" <td>-0.305883</td>\n",
" <td>0.909775</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17290 rows × 384 columns</p>\n",
"</div>"
],
"text/plain": [
" Region_north House_age price bedrooms bathrooms \\\n",
"id \n",
"3046200125 0.0 78 -0.945119 -1.468373 -1.448400 \n",
"1853000030 1.0 35 0.667867 -0.393286 0.503345 \n",
"1825079005 1.0 25 0.566528 0.681800 0.503345 \n",
"2523039315 0.0 34 -0.159739 -0.393286 -0.147237 \n",
"6623400246 1.0 58 -0.950749 0.681800 -1.448400 \n",
"... ... ... ... ... ... \n",
"2523069134 1.0 51 -0.120329 0.681800 0.503345 \n",
"1931300412 1.0 16 -0.176628 -0.393286 0.178054 \n",
"4331000400 0.0 68 -0.804370 -0.393286 -0.797819 \n",
"9212900180 1.0 82 0.625642 0.681800 0.503345 \n",
"7000100775 1.0 38 0.245619 -0.393286 -0.147237 \n",
"\n",
" sqft_living sqft_lot floors view condition ... \\\n",
"id ... \n",
"3046200125 -1.462069 -0.205788 -0.918509 -0.305883 0.909775 ... \n",
"1853000030 1.605653 0.405288 0.935992 -0.305883 -0.628763 ... \n",
"1825079005 0.786866 5.369556 0.935992 -0.305883 -0.628763 ... \n",
"2523039315 0.546688 0.006065 0.008742 -0.305883 -0.628763 ... \n",
"6623400246 -0.796122 -0.090424 -0.918509 -0.305883 -0.628763 ... \n",
"... ... ... ... ... ... ... \n",
"2523069134 0.437517 1.780808 -0.918509 2.308411 0.909775 ... \n",
"1931300412 -0.970797 -0.330298 2.790494 -0.305883 -0.628763 ... \n",
"4331000400 -1.014465 -0.051023 -0.918509 -0.305883 -0.628763 ... \n",
"9212900180 0.743197 -0.218588 0.935992 -0.305883 2.448313 ... \n",
"7000100775 -0.381270 -0.073854 -0.918509 -0.305883 0.909775 ... \n",
"\n",
" date_20150506T000000 date_20150507T000000 date_20150508T000000 \\\n",
"id \n",
"3046200125 0.0 0.0 0.0 \n",
"1853000030 0.0 0.0 0.0 \n",
"1825079005 0.0 0.0 0.0 \n",
"2523039315 0.0 0.0 0.0 \n",
"6623400246 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"2523069134 0.0 0.0 0.0 \n",
"1931300412 0.0 0.0 0.0 \n",
"4331000400 0.0 0.0 0.0 \n",
"9212900180 0.0 0.0 0.0 \n",
"7000100775 0.0 0.0 0.0 \n",
"\n",
" date_20150509T000000 date_20150510T000000 date_20150511T000000 \\\n",
"id \n",
"3046200125 0.0 0.0 0.0 \n",
"1853000030 0.0 0.0 0.0 \n",
"1825079005 0.0 0.0 0.0 \n",
"2523039315 0.0 0.0 0.0 \n",
"6623400246 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"2523069134 0.0 0.0 0.0 \n",
"1931300412 0.0 0.0 0.0 \n",
"4331000400 0.0 0.0 0.0 \n",
"9212900180 0.0 0.0 0.0 \n",
"7000100775 0.0 0.0 0.0 \n",
"\n",
" date_20150512T000000 date_20150513T000000 date_20150514T000000 \\\n",
"id \n",
"3046200125 0.0 0.0 0.0 \n",
"1853000030 0.0 0.0 0.0 \n",
"1825079005 0.0 0.0 0.0 \n",
"2523039315 0.0 0.0 0.0 \n",
"6623400246 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"2523069134 0.0 0.0 0.0 \n",
"1931300412 0.0 0.0 0.0 \n",
"4331000400 0.0 0.0 0.0 \n",
"9212900180 0.0 0.0 0.0 \n",
"7000100775 0.0 0.0 0.0 \n",
"\n",
" date_20150515T000000 \n",
"id \n",
"3046200125 0.0 \n",
"1853000030 0.0 \n",
"1825079005 0.0 \n",
"2523039315 0.0 \n",
"6623400246 0.0 \n",
"... ... \n",
"2523069134 0.0 \n",
"1931300412 0.0 \n",
"4331000400 0.0 \n",
"9212900180 0.0 \n",
"7000100775 0.0 \n",
"\n",
"[17290 rows x 384 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the preprocessing pipeline on the training split and transform it.\n",
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"\n",
"# Wrap the result in a DataFrame labelled with the fitted pipeline's feature names.\n",
"preprocessed_df = pd.DataFrame(\n",
"    data=preprocessing_result, columns=pipeline_end.get_feature_names_out()\n",
")\n",
"\n",
"preprocessed_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Формирование набора моделей для классификации\n",
"\n",
"logistic -- логистическая регрессия\n",
"\n",
"ridge -- логистическая регрессия с L2-регуляризацией и балансировкой классов (замена гребневого классификатора)\n",
"\n",
"decision_tree -- дерево решений\n",
"\n",
"knn -- k-ближайших соседей\n",
"\n",
"naive_bayes -- наивный Байесовский классификатор\n",
"\n",
"gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
"\n",
"random_forest -- метод случайного леса (набор деревьев решений)\n",
"\n",
"mlp -- многослойный персептрон (нейронная сеть)\n",
"\n",
"Документация: https://scikit-learn.org/1.5/supervised_learning.html"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
"\n",
"# Registry of candidate classifiers: name -> {\"model\": unfitted estimator}.\n",
"# The training cell later stores the fitted pipeline, predictions and metrics\n",
"# in the same per-model dictionary.\n",
"class_models = {\n",
"    # max_iter is raised above the default of 100: with the default, lbfgs\n",
"    # stopped at the iteration limit (ConvergenceWarning) on this data.\n",
"    \"logistic\": {\"model\": linear_model.LogisticRegression(max_iter=1000)},\n",
"    # Stand-in for a ridge classifier: L2-penalised, class-balanced logistic\n",
"    # regression. NOTE(review): presumably preferred over RidgeClassifierCV\n",
"    # because the evaluation loop calls predict_proba -- confirm.\n",
"    \"ridge\": {\n",
"        \"model\": linear_model.LogisticRegression(\n",
"            penalty=\"l2\", class_weight=\"balanced\", max_iter=1000\n",
"        )\n",
"    },\n",
"    \"decision_tree\": {\n",
"        \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
"    },\n",
"    \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
"    \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
"    # Seeded like the other tree-based models so that reruns are reproducible.\n",
"    \"gradient_boosting\": {\n",
"        \"model\": ensemble.GradientBoostingClassifier(\n",
"            n_estimators=210, random_state=random_state\n",
"        )\n",
"    },\n",
"    \"random_forest\": {\n",
"        \"model\": ensemble.RandomForestClassifier(\n",
"            max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
"        )\n",
"    },\n",
"    \"mlp\": {\n",
"        \"model\": neural_network.MLPClassifier(\n",
"            hidden_layer_sizes=(7,),\n",
"            max_iter=500,\n",
"            early_stopping=True,\n",
"            random_state=random_state,\n",
"        )\n",
"    },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Обучение моделей на обучающем наборе данных и оценка на тестовом"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: logistic\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n",
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: ridge\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n",
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: decision_tree\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: knn\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: naive_bayes\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: gradient_boosting\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: random_forest\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: mlp\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn import metrics\n",
"\n",
"# Fit every candidate behind the shared preprocessing pipeline, then store the\n",
"# fitted pipeline, test-set predictions and quality metrics in its registry entry.\n",
"for model_name, model_entry in class_models.items():\n",
"    print(f\"Model: {model_name}\")\n",
"    model = model_entry[\"model\"]\n",
"\n",
"    # fit() returns the pipeline itself, so construction and fitting can be chained.\n",
"    model_pipeline = Pipeline(\n",
"        [(\"pipeline\", pipeline_end), (\"model\", model)]\n",
"    ).fit(X_train, y_train.values.ravel())\n",
"\n",
"    y_train_predict = model_pipeline.predict(X_train)\n",
"    # Probability of the positive class; test labels use a fixed 0.5 threshold.\n",
"    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
"    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
"\n",
"    model_entry[\"pipeline\"] = model_pipeline\n",
"    model_entry[\"probs\"] = y_test_probs\n",
"    model_entry[\"preds\"] = y_test_predict\n",
"\n",
"    model_entry[\"Precision_train\"] = metrics.precision_score(y_train, y_train_predict)\n",
"    model_entry[\"Precision_test\"] = metrics.precision_score(y_test, y_test_predict)\n",
"    model_entry[\"Recall_train\"] = metrics.recall_score(y_train, y_train_predict)\n",
"    model_entry[\"Recall_test\"] = metrics.recall_score(y_test, y_test_predict)\n",
"    model_entry[\"Accuracy_train\"] = metrics.accuracy_score(y_train, y_train_predict)\n",
"    model_entry[\"Accuracy_test\"] = metrics.accuracy_score(y_test, y_test_predict)\n",
"    # ROC AUC is computed from the probabilities, not the thresholded labels.\n",
"    model_entry[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, y_test_probs)\n",
"    model_entry[\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
"    model_entry[\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
"    model_entry[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, y_test_predict)\n",
"    model_entry[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, y_test_predict)\n",
"    model_entry[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, y_test_predict)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Сводная таблица оценок качества для использованных моделей классификации\n",
"\n",
"Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Матрица неточностей"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2IAAAQ9CAYAAAA70P4+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeVxU5f4H8M8ZYNh3BSQQUdwQ3KhfUSpoJpqapl6z3FO7Gi5grjc3NLX0uuZWmqJXy6ysm0sqLpCmmRvmgrihUAioCCiyzpzfH1xOTsDA6AAzcz7v1+u8rnPOM2eeOVfPp+8zzzlHEEVRBBEREREREdUYRW13gIiIiIiISG5YiBEREREREdUwFmJEREREREQ1jIUYERERERFRDWMhRkREREREVMNYiBEREREREdUwFmJEREREREQ1jIUYERERERFRDWMhRkREREREVMNYiJHBi46OhiAIuHXrVrXs/9atWxAEAdHR0XrZX2xsLARBQGxsrF72R0REZCrmzJkDQRCq1FYQBMyZM6d6O0RUi1iIET2lNWvW6K14IyIiIiJ5Ma/tDhDVNh8fH+Tl5cHCwkKn961ZswZ16tTBsGHDNNZ36NABeXl5UCqVeuwlERGR8ZsxYwamTZtW290gMggsxEj2BEGAlZWV3vanUCj0uj8iIiJTkJubC1tbW5ib8z8/iQBOTSQjtWbNGrRo0QKWlpbw9PREeHg4srKyyrRbvXo1GjZsCGtra/zf//0fjh49itDQUISGhkptyrtGLC0tDcOHD4eXlxcsLS1Rr1499OrVS7pOrUGDBrh06RLi4uIgCAIEQZD2WdE1YidPnsTrr78OZ2dn2NraomXLllixYoV+DwwREZEBKL0W7PLly3jnnXfg7OyMdu3alXuNWEFBASIjI1G3bl3Y29vjjTfewB9//FHufmNjY/H888/DysoKjRo1wmeffVbhdWdbt25FUFAQrK2t4eLiggEDBiAlJaVavi/R0+CQBBmdOXPmICoqCp07d8aYMWOQmJiItWvX4tSpU/jll1+kKYZr167F2LFj0b59e0RGRuLWrVvo3bs3nJ2d4eXlpfUz+vbti0uXLmHcuHFo0KABMjIyEBMTg+TkZDRo0ADLly/HuHHjYGdnhw8//BAA4O7uXuH+YmJi0KNHD9SrVw8TJkyAh4cHEhISsHv3bkyYMEF/B4eIiMiA/OMf/0Djxo2xYMECiKKIjIyMMm1GjhyJrVu34p133sHLL7+Mw4cPo3v37mXanTt3Dl27dkW9evUQFRUFlUqFuXPnom7dumXazp8/HzNnzkT//v0xcuRI3L17F59++ik6dOiAc+fOwcnJqTq+LpFuRCIDt2nTJhGAmJSUJGZkZIhKpVLs0qWLqFKppDarVq0SAYgbN24URVEUCwoKRFdXV/GFF14Qi4qKpHbR0dEiADEkJERal5SUJAIQN23aJIqiKD548EAEIC5evFhrv1q0aKGxn1JHjhwRAYhHjhwRRVEUi4uLRV9fX9HHx0d88OCBRlu1Wl31A0FERGQkZs+eLQIQ33777XLXl4qPjxcBiO+//75Gu3feeUcEIM6ePVta17NnT9HGxkb8888/pXXXrl0Tzc3NNfZ569Yt0czMTJw/f77GPi9cuCCam5uXWU9UWzg1kYzKwYMHUVhYiIiICCgUf/31HTVqFBwcHLBnzx4AwOnTp3H//n2MGjVKYy76wIED4ezsrPUzrK2toVQqERsbiwcPHjxzn8+dO4ekpCRERESUGYGr6i18iYiIjNHo0aO1bt+7dy8AYPz48RrrIyIiNF6rVCocPHgQvXv3hqenp7Tez88P3bp102i7c+dOqNVq9O/fH/fu3ZMWDw8PNG7cGEeOHHmGb0SkP5yaSEbl9u3bAICmTZtqrFcqlWjYsKG0vfR//fz8NNqZm5ujQYMGWj/D0tISn3zyCT744AO4u7vjpZdeQo8ePTBkyBB4eHjo3OcbN24AAAICAn
R+LxERkTHz9fXVuv327dtQKBRo1KiRxvq/53xGRgby8vLK5DpQNuuvXbsGURTRuHHjcj9T17skE1UXFmJE5YiIiEDPnj3xww8/YP/+/Zg5cyYWLlyIw4cPo02bNrXdPSIiIqNgbW1d45+pVqshCAJ++uknmJmZldluZ2dX430iKg+nJpJR8fHxAQAkJiZqrC8sLERSUpK0vfR/r1+/rtGuuLhYuvNhZRo1aoQPPvgABw4cwMWLF1FYWIglS5ZI26s6rbB0lO/ixYtVak9ERCQXPj4+UKvV0uyRUn/PeTc3N1hZWZXJdaBs1jdq1AiiKMLX1xedO3cus7z00kv6/yJET4GFGBmVzp07Q6lUYuXKlRBFUVr/xRdfIDs7W7rL0vPPPw9XV1esX78excXFUrtt27ZVet3X48ePkZ+fr7GuUaNGsLe3R0FBgbTO1ta23Fvm/13btm3h6+uL5cuXl2n/5HcgIiKSm9Lru1auXKmxfvny5RqvzczM0LlzZ/zwww9ITU2V1l+/fh0//fSTRts+ffrAzMwMUVFRZXJWFEXcv39fj9+A6OlxaiIZlbp162L69OmIiopC165d8cYbbyAxMRFr1qzBCy+8gEGDBgEouWZszpw5GDduHDp16oT+/fvj1q1biI6ORqNGjbT+mnX16lW8+uqr6N+/P/z9/WFubo7vv/8e6enpGDBggNQuKCgIa9euxUcffQQ/Pz+4ubmhU6dOZfanUCiwdu1a9OzZE61bt8bw4cNRr149XLlyBZcuXcL+/fv1f6CIiIiMQOvWrfH2229jzZo1yM7Oxssvv4xDhw6V+8vXnDlzcODAAbzyyisYM2YMVCoVVq1ahYCAAMTHx0vtGjVqhI8++gjTp0+XHl1jb2+PpKQkfP/993jvvfcwadKkGvyWROVjIUZGZ86cOahbty5WrVqFyMhIuLi44L333sOCBQs0LsAdO3YsRFHEkiVLMGnSJLRq1Qo//vgjxo8fDysrqwr37+3tjbfffhuHDh3Cf/7zH5ibm6NZs2bYsWMH+vbtK7WbNWsWbt++jUWLFuHhw4cICQkptxADgLCwMBw5cgRRUVFYsmQJ1Go1GjVqhFGjRunvwBARERmhjRs3om7duti2bRt++OEHdOrUCXv27IG3t7dGu6CgIPz000+YNGkSZs6cCW9vb8ydOxcJCQm4cuWKRttp06ahSZMmWLZsGaKiogCU5HuXLl3wxhtv1Nh3I9JGEDk3imRErVajbt266NOnD9avX1/b3SEiIqJn1Lt3b1y6dAnXrl2r7a4Q6YTXiJHJys/PLzM3fMuWLcjMzERoaGjtdIqIiIieWl5ensbra9euYe/evcx1Mkr8RYxMVmxsLCIjI/GPf/wDrq6uOHv2LL744gs0b94cZ86cgVKprO0uEhERkQ7q1auHYcOGSc8OXbt2LQoKCnDu3LkKnxtGZKh4jRiZrAYNGsDb2xsrV65EZmYmXFxcMGTIEHz88ccswoiIiIxQ165d8dVXXyEtLQ2WlpYIDg7GggULWISRUeIvYkRERERERDWM14gRERERERHVMBZiRERERERENYzXiBkYtVqN1NRU2Nvba33oMJEpEkURDx8+hKenJxQK/Y4T5efno7CwsNJ2SqVS63PmiEh+mM0kZ8zm6sNCzMCkpqaWeYAhkdykpKTAy8tLb/vLz8+Hr48d0jJUlbb18PBAUlKSSZ7wiejpMJuJmM3VgYWYgbG3twcA3D7bAA52nDlaG95sEljbXZCtYhThGPZK/w70pbCwEGkZKlw/7Q0H+4r/XeU8VMPv+RQUFhaa3MmeiJ4es7n29WkRVNtdkK1isQhHi39gNlcDFmIGpnTKg4OdQutfSqo+5oJFbXdBvv53D9fqmvpjZy/Azr7ifavBKUdEVBazufYxm2sfs1n/WIgRkWwUiSoUaXliR5GorsHeEBERkZyzmYUYEcmGGiLUqPhkr20bERER6Z+cs5mFGBHJhhoiVDI92RMRERkiOW
czJzoTkWwUiepKFyIiIqo51ZnNH3/8MQRBQEREhLQuPz8f4eHhcHV1hZ2dHfr27Yv09HSN9yUnJ6N79+6wsbGBm5s
"text/plain": [
"<Figure size 1200x1000 with 16 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Two-column grid with one cell per model. The row count is rounded up so an\n",
"# odd number of models still fits (rounding down would run out of axes).\n",
"n_cols = 2\n",
"n_rows = (len(class_models) + n_cols - 1) // n_cols\n",
"# squeeze=False keeps `ax` a 2-D array even when the grid has a single row.\n",
"_, ax = plt.subplots(\n",
"    n_rows, n_cols, figsize=(12, 10), sharex=False, sharey=False, squeeze=False\n",
")\n",
"for index, key in enumerate(class_models.keys()):\n",
"    c_matrix = class_models[key][\"Confusion_matrix\"]\n",
"    disp = ConfusionMatrixDisplay(\n",
"        confusion_matrix=c_matrix, display_labels=[\"no water\", \"water\"]\n",
"    ).plot(ax=ax.flat[index])\n",
"    disp.ax_.set_title(key)\n",
"\n",
"# Hide grid cells left without a model (only happens for an odd model count).\n",
"for unused_ax in ax.ravel()[len(class_models):]:\n",
"    unused_ax.set_visible(False)\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Точность, полнота, верность (аккуратность), F-мера"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_060dc_row0_col0 {\n",
" background-color: #73d056;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row0_col1, #T_060dc_row1_col1 {\n",
" background-color: #52c569;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row0_col2 {\n",
" background-color: #3dbc74;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row0_col3 {\n",
" background-color: #5cc863;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row0_col4, #T_060dc_row3_col4 {\n",
" background-color: #d9586a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row0_col5, #T_060dc_row0_col7, #T_060dc_row1_col4, #T_060dc_row1_col5, #T_060dc_row2_col4, #T_060dc_row2_col5, #T_060dc_row2_col6, #T_060dc_row3_col5, #T_060dc_row4_col5 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row0_col6, #T_060dc_row5_col7 {\n",
" background-color: #bb3488;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row1_col0 {\n",
" background-color: #95d840;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row1_col2 {\n",
" background-color: #a2da37;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row1_col3, #T_060dc_row2_col3, #T_060dc_row3_col1 {\n",
" background-color: #3fbc73;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row1_col6 {\n",
" background-color: #d5546e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row1_col7 {\n",
" background-color: #d14e72;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row2_col0, #T_060dc_row2_col2, #T_060dc_row4_col1, #T_060dc_row5_col2, #T_060dc_row6_col2, #T_060dc_row6_col3, #T_060dc_row7_col2 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row2_col1 {\n",
" background-color: #44bf70;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row2_col7 {\n",
" background-color: #cc4977;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row3_col0 {\n",
" background-color: #6ccd5a;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row3_col2 {\n",
" background-color: #31b57b;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row3_col3 {\n",
" background-color: #32b67a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row3_col6, #T_060dc_row6_col7 {\n",
" background-color: #b52f8c;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row3_col7 {\n",
" background-color: #c5407e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row4_col0 {\n",
" background-color: #98d83e;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row4_col2, #T_060dc_row4_col3, #T_060dc_row7_col0, #T_060dc_row7_col1 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row4_col4, #T_060dc_row5_col5 {\n",
" background-color: #d8576b;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row4_col6 {\n",
" background-color: #7501a8;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row4_col7 {\n",
" background-color: #6900a8;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row5_col0 {\n",
" background-color: #21a685;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row5_col1 {\n",
" background-color: #1fa287;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row5_col3 {\n",
" background-color: #93d741;\n",
" color: #000000;\n",
"}\n",
"#T_060dc_row5_col4, #T_060dc_row6_col5 {\n",
" background-color: #d7566c;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row5_col6 {\n",
" background-color: #a31e9a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row6_col0 {\n",
" background-color: #20a386;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row6_col1 {\n",
" background-color: #1fa088;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row6_col4 {\n",
" background-color: #d6556d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row6_col6 {\n",
" background-color: #a01a9c;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row7_col3 {\n",
" background-color: #20a486;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_060dc_row7_col4, #T_060dc_row7_col5, #T_060dc_row7_col6, #T_060dc_row7_col7 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_060dc\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_060dc_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
" <th id=\"T_060dc_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
" <th id=\"T_060dc_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
" <th id=\"T_060dc_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
" <th id=\"T_060dc_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
" <th id=\"T_060dc_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
" <th id=\"T_060dc_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
" <th id=\"T_060dc_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
" <td id=\"T_060dc_row0_col0\" class=\"data row0 col0\" >0.813725</td>\n",
" <td id=\"T_060dc_row0_col1\" class=\"data row0 col1\" >0.676471</td>\n",
" <td id=\"T_060dc_row0_col2\" class=\"data row0 col2\" >0.638462</td>\n",
" <td id=\"T_060dc_row0_col3\" class=\"data row0 col3\" >0.696970</td>\n",
" <td id=\"T_060dc_row0_col4\" class=\"data row0 col4\" >0.996183</td>\n",
" <td id=\"T_060dc_row0_col5\" class=\"data row0 col5\" >0.995142</td>\n",
" <td id=\"T_060dc_row0_col6\" class=\"data row0 col6\" >0.715517</td>\n",
" <td id=\"T_060dc_row0_col7\" class=\"data row0 col7\" >0.686567</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row1\" class=\"row_heading level0 row1\" >decision_tree</th>\n",
" <td id=\"T_060dc_row1_col0\" class=\"data row1 col0\" >0.934307</td>\n",
" <td id=\"T_060dc_row1_col1\" class=\"data row1 col1\" >0.678571</td>\n",
" <td id=\"T_060dc_row1_col2\" class=\"data row1 col2\" >0.984615</td>\n",
" <td id=\"T_060dc_row1_col3\" class=\"data row1 col3\" >0.575758</td>\n",
" <td id=\"T_060dc_row1_col4\" class=\"data row1 col4\" >0.999364</td>\n",
" <td id=\"T_060dc_row1_col5\" class=\"data row1 col5\" >0.994680</td>\n",
" <td id=\"T_060dc_row1_col6\" class=\"data row1 col6\" >0.958801</td>\n",
" <td id=\"T_060dc_row1_col7\" class=\"data row1 col7\" >0.622951</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row2\" class=\"row_heading level0 row2\" >gradient_boosting</th>\n",
" <td id=\"T_060dc_row2_col0\" class=\"data row2 col0\" >1.000000</td>\n",
" <td id=\"T_060dc_row2_col1\" class=\"data row2 col1\" >0.612903</td>\n",
" <td id=\"T_060dc_row2_col2\" class=\"data row2 col2\" >1.000000</td>\n",
" <td id=\"T_060dc_row2_col3\" class=\"data row2 col3\" >0.575758</td>\n",
" <td id=\"T_060dc_row2_col4\" class=\"data row2 col4\" >1.000000</td>\n",
" <td id=\"T_060dc_row2_col5\" class=\"data row2 col5\" >0.993986</td>\n",
" <td id=\"T_060dc_row2_col6\" class=\"data row2 col6\" >1.000000</td>\n",
" <td id=\"T_060dc_row2_col7\" class=\"data row2 col7\" >0.593750</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row3\" class=\"row_heading level0 row3\" >mlp</th>\n",
" <td id=\"T_060dc_row3_col0\" class=\"data row3 col0\" >0.789474</td>\n",
" <td id=\"T_060dc_row3_col1\" class=\"data row3 col1\" >0.586207</td>\n",
" <td id=\"T_060dc_row3_col2\" class=\"data row3 col2\" >0.576923</td>\n",
" <td id=\"T_060dc_row3_col3\" class=\"data row3 col3\" >0.515152</td>\n",
" <td id=\"T_060dc_row3_col4\" class=\"data row3 col4\" >0.995662</td>\n",
" <td id=\"T_060dc_row3_col5\" class=\"data row3 col5\" >0.993523</td>\n",
" <td id=\"T_060dc_row3_col6\" class=\"data row3 col6\" >0.666667</td>\n",
" <td id=\"T_060dc_row3_col7\" class=\"data row3 col7\" >0.548387</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row4\" class=\"row_heading level0 row4\" >knn</th>\n",
" <td id=\"T_060dc_row4_col0\" class=\"data row4 col0\" >0.950000</td>\n",
" <td id=\"T_060dc_row4_col1\" class=\"data row4 col1\" >1.000000</td>\n",
" <td id=\"T_060dc_row4_col2\" class=\"data row4 col2\" >0.146154</td>\n",
" <td id=\"T_060dc_row4_col3\" class=\"data row4 col3\" >0.060606</td>\n",
" <td id=\"T_060dc_row4_col4\" class=\"data row4 col4\" >0.993522</td>\n",
" <td id=\"T_060dc_row4_col5\" class=\"data row4 col5\" >0.992829</td>\n",
" <td id=\"T_060dc_row4_col6\" class=\"data row4 col6\" >0.253333</td>\n",
" <td id=\"T_060dc_row4_col7\" class=\"data row4 col7\" >0.114286</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row5\" class=\"row_heading level0 row5\" >random_forest</th>\n",
" <td id=\"T_060dc_row5_col0\" class=\"data row5 col0\" >0.372493</td>\n",
" <td id=\"T_060dc_row5_col1\" class=\"data row5 col1\" >0.333333</td>\n",
" <td id=\"T_060dc_row5_col2\" class=\"data row5 col2\" >1.000000</td>\n",
" <td id=\"T_060dc_row5_col3\" class=\"data row5 col3\" >0.878788</td>\n",
" <td id=\"T_060dc_row5_col4\" class=\"data row5 col4\" >0.987334</td>\n",
" <td id=\"T_060dc_row5_col5\" class=\"data row5 col5\" >0.985658</td>\n",
" <td id=\"T_060dc_row5_col6\" class=\"data row5 col6\" >0.542797</td>\n",
" <td id=\"T_060dc_row5_col7\" class=\"data row5 col7\" >0.483333</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row6\" class=\"row_heading level0 row6\" >ridge</th>\n",
" <td id=\"T_060dc_row6_col0\" class=\"data row6 col0\" >0.343915</td>\n",
" <td id=\"T_060dc_row6_col1\" class=\"data row6 col1\" >0.300971</td>\n",
" <td id=\"T_060dc_row6_col2\" class=\"data row6 col2\" >1.000000</td>\n",
" <td id=\"T_060dc_row6_col3\" class=\"data row6 col3\" >0.939394</td>\n",
" <td id=\"T_060dc_row6_col4\" class=\"data row6 col4\" >0.985656</td>\n",
" <td id=\"T_060dc_row6_col5\" class=\"data row6 col5\" >0.982882</td>\n",
" <td id=\"T_060dc_row6_col6\" class=\"data row6 col6\" >0.511811</td>\n",
" <td id=\"T_060dc_row6_col7\" class=\"data row6 col7\" >0.455882</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_060dc_level0_row7\" class=\"row_heading level0 row7\" >naive_bayes</th>\n",
" <td id=\"T_060dc_row7_col0\" class=\"data row7 col0\" >0.018619</td>\n",
" <td id=\"T_060dc_row7_col1\" class=\"data row7 col1\" >0.006916</td>\n",
" <td id=\"T_060dc_row7_col2\" class=\"data row7 col2\" >1.000000</td>\n",
" <td id=\"T_060dc_row7_col3\" class=\"data row7 col3\" >0.363636</td>\n",
" <td id=\"T_060dc_row7_col4\" class=\"data row7 col4\" >0.603702</td>\n",
" <td id=\"T_060dc_row7_col5\" class=\"data row7 col5\" >0.596576</td>\n",
" <td id=\"T_060dc_row7_col6\" class=\"data row7 col6\" >0.036558</td>\n",
" <td id=\"T_060dc_row7_col7\" class=\"data row7 col7\" >0.013575</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x21a8419a540>"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train/test precision, recall, accuracy and F1 for every entry of class_models,\n",
"# one row per model.\n",
"class_metrics = pd.DataFrame.from_dict(class_models, orient=\"index\").loc[\n",
"    :,\n",
"    [\n",
"        \"Precision_train\",\n",
"        \"Precision_test\",\n",
"        \"Recall_train\",\n",
"        \"Recall_test\",\n",
"        \"Accuracy_train\",\n",
"        \"Accuracy_test\",\n",
"        \"F1_train\",\n",
"        \"F1_test\",\n",
"    ],\n",
"]\n",
"\n",
"# Highest test accuracy first; plasma shades the accuracy/F1 columns and viridis\n",
"# shades the precision/recall columns.\n",
"(\n",
"    class_metrics.sort_values(by=\"Accuracy_test\", ascending=False)\n",
"    .style.background_gradient(\n",
"        subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
"        cmap=\"plasma\",\n",
"        low=0.3,\n",
"        high=1,\n",
"    )\n",
"    .background_gradient(\n",
"        subset=[\"Precision_train\", \"Precision_test\", \"Recall_train\", \"Recall_test\"],\n",
"        cmap=\"viridis\",\n",
"        low=1,\n",
"        high=0.3,\n",
"    )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ROC AUC (площадь под ROC-кривой), каппа Коэна, коэффициент корреляции Мэтьюса"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_8c989_row0_col0, #T_8c989_row0_col1, #T_8c989_row2_col0, #T_8c989_row3_col0, #T_8c989_row6_col0 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row0_col2, #T_8c989_row0_col3, #T_8c989_row0_col4, #T_8c989_row1_col2, #T_8c989_row2_col2, #T_8c989_row3_col2, #T_8c989_row4_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row1_col0, #T_8c989_row4_col0 {\n",
" background-color: #a0da39;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row1_col1 {\n",
" background-color: #4ec36b;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row1_col3 {\n",
" background-color: #b52f8c;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row1_col4 {\n",
" background-color: #c33d80;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row2_col1 {\n",
" background-color: #6ece58;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row2_col3, #T_8c989_row2_col4 {\n",
" background-color: #c6417d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row3_col1 {\n",
" background-color: #81d34d;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row3_col3, #T_8c989_row3_col4 {\n",
" background-color: #cc4977;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row4_col1 {\n",
" background-color: #56c667;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row4_col3 {\n",
" background-color: #bb3488;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row4_col4 {\n",
" background-color: #c43e7f;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row5_col0 {\n",
" background-color: #a5db36;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row5_col1 {\n",
" background-color: #21908d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row5_col2 {\n",
" background-color: #bc3587;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row5_col3 {\n",
" background-color: #6c00a8;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row5_col4 {\n",
" background-color: #8b0aa5;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row6_col1 {\n",
" background-color: #8ed645;\n",
" color: #000000;\n",
"}\n",
"#T_8c989_row6_col2 {\n",
" background-color: #ad2793;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row6_col3, #T_8c989_row6_col4 {\n",
" background-color: #d14e72;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row7_col0, #T_8c989_row7_col1 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_8c989_row7_col2, #T_8c989_row7_col3, #T_8c989_row7_col4 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_8c989\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_8c989_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
" <th id=\"T_8c989_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
" <th id=\"T_8c989_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
" <th id=\"T_8c989_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
" <th id=\"T_8c989_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
" <td id=\"T_8c989_row0_col0\" class=\"data row0 col0\" >0.995142</td>\n",
" <td id=\"T_8c989_row0_col1\" class=\"data row0 col1\" >0.686567</td>\n",
" <td id=\"T_8c989_row0_col2\" class=\"data row0 col2\" >0.996073</td>\n",
" <td id=\"T_8c989_row0_col3\" class=\"data row0 col3\" >0.684120</td>\n",
" <td id=\"T_8c989_row0_col4\" class=\"data row0 col4\" >0.684197</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row1\" class=\"row_heading level0 row1\" >ridge</th>\n",
" <td id=\"T_8c989_row1_col0\" class=\"data row1 col0\" >0.982882</td>\n",
" <td id=\"T_8c989_row1_col1\" class=\"data row1 col1\" >0.455882</td>\n",
" <td id=\"T_8c989_row1_col2\" class=\"data row1 col2\" >0.995416</td>\n",
" <td id=\"T_8c989_row1_col3\" class=\"data row1 col3\" >0.449517</td>\n",
" <td id=\"T_8c989_row1_col4\" class=\"data row1 col4\" >0.526537</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row2\" class=\"row_heading level0 row2\" >mlp</th>\n",
" <td id=\"T_8c989_row2_col0\" class=\"data row2 col0\" >0.993523</td>\n",
" <td id=\"T_8c989_row2_col1\" class=\"data row2 col1\" >0.548387</td>\n",
" <td id=\"T_8c989_row2_col2\" class=\"data row2 col2\" >0.994420</td>\n",
" <td id=\"T_8c989_row2_col3\" class=\"data row2 col3\" >0.545139</td>\n",
" <td id=\"T_8c989_row2_col4\" class=\"data row2 col4\" >0.546293</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row3\" class=\"row_heading level0 row3\" >gradient_boosting</th>\n",
" <td id=\"T_8c989_row3_col0\" class=\"data row3 col0\" >0.993986</td>\n",
" <td id=\"T_8c989_row3_col1\" class=\"data row3 col1\" >0.593750</td>\n",
" <td id=\"T_8c989_row3_col2\" class=\"data row3 col2\" >0.994137</td>\n",
" <td id=\"T_8c989_row3_col3\" class=\"data row3 col3\" >0.590723</td>\n",
" <td id=\"T_8c989_row3_col4\" class=\"data row3 col4\" >0.591016</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row4\" class=\"row_heading level0 row4\" >random_forest</th>\n",
" <td id=\"T_8c989_row4_col0\" class=\"data row4 col0\" >0.985658</td>\n",
" <td id=\"T_8c989_row4_col1\" class=\"data row4 col1\" >0.483333</td>\n",
" <td id=\"T_8c989_row4_col2\" class=\"data row4 col2\" >0.992880</td>\n",
" <td id=\"T_8c989_row4_col3\" class=\"data row4 col3\" >0.477550</td>\n",
" <td id=\"T_8c989_row4_col4\" class=\"data row4 col4\" >0.536289</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row5\" class=\"row_heading level0 row5\" >knn</th>\n",
" <td id=\"T_8c989_row5_col0\" class=\"data row5 col0\" >0.992829</td>\n",
" <td id=\"T_8c989_row5_col1\" class=\"data row5 col1\" >0.114286</td>\n",
" <td id=\"T_8c989_row5_col2\" class=\"data row5 col2\" >0.844971</td>\n",
" <td id=\"T_8c989_row5_col3\" class=\"data row5 col3\" >0.113512</td>\n",
" <td id=\"T_8c989_row5_col4\" class=\"data row5 col4\" >0.245298</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row6\" class=\"row_heading level0 row6\" >decision_tree</th>\n",
" <td id=\"T_8c989_row6_col0\" class=\"data row6 col0\" >0.994680</td>\n",
" <td id=\"T_8c989_row6_col1\" class=\"data row6 col1\" >0.622951</td>\n",
" <td id=\"T_8c989_row6_col2\" class=\"data row6 col2\" >0.786180</td>\n",
" <td id=\"T_8c989_row6_col3\" class=\"data row6 col3\" >0.620290</td>\n",
" <td id=\"T_8c989_row6_col4\" class=\"data row6 col4\" >0.622414</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_8c989_level0_row7\" class=\"row_heading level0 row7\" >naive_bayes</th>\n",
" <td id=\"T_8c989_row7_col0\" class=\"data row7 col0\" >0.596576</td>\n",
" <td id=\"T_8c989_row7_col1\" class=\"data row7 col1\" >0.013575</td>\n",
" <td id=\"T_8c989_row7_col2\" class=\"data row7 col2\" >0.481002</td>\n",
" <td id=\"T_8c989_row7_col3\" class=\"data row7 col3\" >-0.001429</td>\n",
" <td id=\"T_8c989_row7_col4\" class=\"data row7 col4\" >-0.006747</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x21a86eff920>"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test-set accuracy, F1, ROC AUC, Cohen's kappa and MCC for every entry of\n",
"# class_models, one row per model.\n",
"class_metrics = pd.DataFrame.from_dict(class_models, orient=\"index\").loc[\n",
"    :,\n",
"    [\n",
"        \"Accuracy_test\",\n",
"        \"F1_test\",\n",
"        \"ROC_AUC_test\",\n",
"        \"Cohen_kappa_test\",\n",
"        \"MCC_test\",\n",
"    ],\n",
"]\n",
"\n",
"# Highest test ROC AUC first; plasma shades ROC AUC/MCC/kappa and viridis shades\n",
"# accuracy/F1.\n",
"(\n",
"    class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False)\n",
"    .style.background_gradient(\n",
"        subset=[\"ROC_AUC_test\", \"MCC_test\", \"Cohen_kappa_test\"],\n",
"        cmap=\"plasma\",\n",
"        low=0.3,\n",
"        high=1,\n",
"    )\n",
"    .background_gradient(\n",
"        subset=[\"Accuracy_test\", \"F1_test\"],\n",
"        cmap=\"viridis\",\n",
"        low=1,\n",
"        high=0.3,\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'logistic'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Pick the model with the highest test-set MCC. The scores are read straight from\n",
"# class_models instead of class_metrics: earlier cells reassign class_metrics with\n",
"# different column subsets, so relying on it would make this cell depend on which\n",
"# of them ran last (KeyError on \"MCC_test\" after re-running the first table cell).\n",
"mcc_by_model = pd.DataFrame.from_dict(class_models, orient=\"index\")[\"MCC_test\"]\n",
"# astype(float) keeps idxmax usable even if the column was inferred as object dtype.\n",
"best_model = str(mcc_by_model.astype(float).idxmax())\n",
"\n",
"display(best_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Вывод данных с ошибкой предсказания для оценки"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"'Error items count: 21'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>Predicted</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>121039042</th>\n",
" <td>20150313T000000</td>\n",
" <td>0</td>\n",
" <td>425000.0</td>\n",
" <td>3</td>\n",
" <td>2.75</td>\n",
" <td>3610</td>\n",
" <td>107386</td>\n",
" <td>1.5</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>8</td>\n",
" <td>3130</td>\n",
" <td>480</td>\n",
" <td>1918</td>\n",
" <td>1962</td>\n",
" <td>98023</td>\n",
" <td>47.3351</td>\n",
" <td>-122.362</td>\n",
" <td>2630</td>\n",
" <td>42126</td>\n",
" </tr>\n",
" <tr>\n",
" <th>624069108</th>\n",
" <td>20140812T000000</td>\n",
" <td>0</td>\n",
" <td>3200000.0</td>\n",
" <td>4</td>\n",
" <td>3.25</td>\n",
" <td>7000</td>\n",
" <td>28206</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>12</td>\n",
" <td>3500</td>\n",
" <td>3500</td>\n",
" <td>1991</td>\n",
" <td>0</td>\n",
" <td>98075</td>\n",
" <td>47.5928</td>\n",
" <td>-122.086</td>\n",
" <td>4913</td>\n",
" <td>14663</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1025039086</th>\n",
" <td>20140916T000000</td>\n",
" <td>0</td>\n",
" <td>1875000.0</td>\n",
" <td>3</td>\n",
" <td>2.50</td>\n",
" <td>3280</td>\n",
" <td>29111</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>11</td>\n",
" <td>3280</td>\n",
" <td>0</td>\n",
" <td>1925</td>\n",
" <td>0</td>\n",
" <td>98199</td>\n",
" <td>47.6699</td>\n",
" <td>-122.416</td>\n",
" <td>3530</td>\n",
" <td>21074</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1732800780</th>\n",
" <td>20150212T000000</td>\n",
" <td>1</td>\n",
" <td>3065000.0</td>\n",
" <td>5</td>\n",
" <td>3.00</td>\n",
" <td>4150</td>\n",
" <td>7500</td>\n",
" <td>2.5</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>11</td>\n",
" <td>3510</td>\n",
" <td>640</td>\n",
" <td>1909</td>\n",
" <td>0</td>\n",
" <td>98119</td>\n",
" <td>47.6303</td>\n",
" <td>-122.362</td>\n",
" <td>2250</td>\n",
" <td>4050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2122039094</th>\n",
" <td>20141126T000000</td>\n",
" <td>0</td>\n",
" <td>705000.0</td>\n",
" <td>3</td>\n",
" <td>3.00</td>\n",
" <td>1970</td>\n",
" <td>20978</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>9</td>\n",
" <td>1770</td>\n",
" <td>200</td>\n",
" <td>1980</td>\n",
" <td>0</td>\n",
" <td>98070</td>\n",
" <td>47.3844</td>\n",
" <td>-122.438</td>\n",
" <td>2280</td>\n",
" <td>75396</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2923039243</th>\n",
" <td>20141113T000000</td>\n",
" <td>0</td>\n",
" <td>340000.0</td>\n",
" <td>4</td>\n",
" <td>1.00</td>\n",
" <td>1200</td>\n",
" <td>11834</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>1200</td>\n",
" <td>0</td>\n",
" <td>1972</td>\n",
" <td>0</td>\n",
" <td>98070</td>\n",
" <td>47.4557</td>\n",
" <td>-122.443</td>\n",
" <td>1670</td>\n",
" <td>47462</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3024059014</th>\n",
" <td>20150325T000000</td>\n",
" <td>0</td>\n",
" <td>1900000.0</td>\n",
" <td>4</td>\n",
" <td>2.25</td>\n",
" <td>3020</td>\n",
" <td>11489</td>\n",
" <td>1.5</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>10</td>\n",
" <td>2110</td>\n",
" <td>910</td>\n",
" <td>1916</td>\n",
" <td>1988</td>\n",
" <td>98040</td>\n",
" <td>47.5395</td>\n",
" <td>-122.210</td>\n",
" <td>3890</td>\n",
" <td>11489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3222049024</th>\n",
" <td>20140522T000000</td>\n",
" <td>1</td>\n",
" <td>361000.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1100</td>\n",
" <td>4046</td>\n",
" <td>1.5</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>1100</td>\n",
" <td>0</td>\n",
" <td>1922</td>\n",
" <td>0</td>\n",
" <td>98198</td>\n",
" <td>47.3440</td>\n",
" <td>-122.331</td>\n",
" <td>2550</td>\n",
" <td>7847</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3426049284</th>\n",
" <td>20140819T000000</td>\n",
" <td>0</td>\n",
" <td>2300000.0</td>\n",
" <td>4</td>\n",
" <td>3.25</td>\n",
" <td>4110</td>\n",
" <td>15929</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>12</td>\n",
" <td>2720</td>\n",
" <td>1390</td>\n",
" <td>2001</td>\n",
" <td>0</td>\n",
" <td>98115</td>\n",
" <td>47.6934</td>\n",
" <td>-122.271</td>\n",
" <td>2640</td>\n",
" <td>15929</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3741600020</th>\n",
" <td>20140915T000000</td>\n",
" <td>1</td>\n",
" <td>540000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2100</td>\n",
" <td>20018</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>8</td>\n",
" <td>1470</td>\n",
" <td>630</td>\n",
" <td>1948</td>\n",
" <td>0</td>\n",
" <td>98166</td>\n",
" <td>47.4544</td>\n",
" <td>-122.366</td>\n",
" <td>2410</td>\n",
" <td>17196</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3760500336</th>\n",
" <td>20141126T000000</td>\n",
" <td>1</td>\n",
" <td>2125000.0</td>\n",
" <td>4</td>\n",
" <td>2.75</td>\n",
" <td>3190</td>\n",
" <td>19513</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>10</td>\n",
" <td>3190</td>\n",
" <td>0</td>\n",
" <td>1982</td>\n",
" <td>0</td>\n",
" <td>98034</td>\n",
" <td>47.6991</td>\n",
" <td>-122.235</td>\n",
" <td>2750</td>\n",
" <td>13496</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3867400175</th>\n",
" <td>20150224T000000</td>\n",
" <td>1</td>\n",
" <td>850000.0</td>\n",
" <td>2</td>\n",
" <td>1.50</td>\n",
" <td>1800</td>\n",
" <td>4144</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>900</td>\n",
" <td>900</td>\n",
" <td>1962</td>\n",
" <td>0</td>\n",
" <td>98116</td>\n",
" <td>47.5934</td>\n",
" <td>-122.390</td>\n",
" <td>2090</td>\n",
" <td>4173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6329000050</th>\n",
" <td>20150310T000000</td>\n",
" <td>0</td>\n",
" <td>641500.0</td>\n",
" <td>1</td>\n",
" <td>1.00</td>\n",
" <td>1000</td>\n",
" <td>9084</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1000</td>\n",
" <td>0</td>\n",
" <td>1950</td>\n",
" <td>0</td>\n",
" <td>98146</td>\n",
" <td>47.5007</td>\n",
" <td>-122.382</td>\n",
" <td>1090</td>\n",
" <td>6536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6762700020</th>\n",
" <td>20141013T000000</td>\n",
" <td>1</td>\n",
" <td>7700000.0</td>\n",
" <td>6</td>\n",
" <td>8.00</td>\n",
" <td>12050</td>\n",
" <td>27600</td>\n",
" <td>2.5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>13</td>\n",
" <td>8570</td>\n",
" <td>3480</td>\n",
" <td>1910</td>\n",
" <td>1987</td>\n",
" <td>98102</td>\n",
" <td>47.6298</td>\n",
" <td>-122.323</td>\n",
" <td>3940</td>\n",
" <td>8800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7278100515</th>\n",
" <td>20140821T000000</td>\n",
" <td>0</td>\n",
" <td>1295000.0</td>\n",
" <td>2</td>\n",
" <td>2.50</td>\n",
" <td>2910</td>\n",
" <td>19449</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>9</td>\n",
" <td>1940</td>\n",
" <td>970</td>\n",
" <td>1985</td>\n",
" <td>0</td>\n",
" <td>98177</td>\n",
" <td>47.7729</td>\n",
" <td>-122.393</td>\n",
" <td>2540</td>\n",
" <td>23598</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7490000040</th>\n",
" <td>20140718T000000</td>\n",
" <td>1</td>\n",
" <td>2535000.0</td>\n",
" <td>5</td>\n",
" <td>3.25</td>\n",
" <td>3730</td>\n",
" <td>10626</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>10</td>\n",
" <td>3730</td>\n",
" <td>0</td>\n",
" <td>1963</td>\n",
" <td>0</td>\n",
" <td>98004</td>\n",
" <td>47.6240</td>\n",
" <td>-122.221</td>\n",
" <td>4180</td>\n",
" <td>19110</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7631200292</th>\n",
" <td>20140626T000000</td>\n",
" <td>1</td>\n",
" <td>669000.0</td>\n",
" <td>2</td>\n",
" <td>1.75</td>\n",
" <td>1950</td>\n",
" <td>10766</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>1160</td>\n",
" <td>790</td>\n",
" <td>1952</td>\n",
" <td>0</td>\n",
" <td>98166</td>\n",
" <td>47.4504</td>\n",
" <td>-122.377</td>\n",
" <td>1780</td>\n",
" <td>11721</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7636800041</th>\n",
" <td>20140625T000000</td>\n",
" <td>0</td>\n",
" <td>995000.0</td>\n",
" <td>3</td>\n",
" <td>4.50</td>\n",
" <td>4380</td>\n",
" <td>47044</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>9</td>\n",
" <td>3720</td>\n",
" <td>660</td>\n",
" <td>1968</td>\n",
" <td>1990</td>\n",
" <td>98166</td>\n",
" <td>47.4734</td>\n",
" <td>-122.365</td>\n",
" <td>2460</td>\n",
" <td>18512</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8907500070</th>\n",
" <td>20150413T000000</td>\n",
" <td>1</td>\n",
" <td>5350000.0</td>\n",
" <td>5</td>\n",
" <td>5.00</td>\n",
" <td>8000</td>\n",
" <td>23985</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>12</td>\n",
" <td>6720</td>\n",
" <td>1280</td>\n",
" <td>2009</td>\n",
" <td>0</td>\n",
" <td>98004</td>\n",
" <td>47.6232</td>\n",
" <td>-122.220</td>\n",
" <td>4600</td>\n",
" <td>21750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8964800890</th>\n",
" <td>20150109T000000</td>\n",
" <td>1</td>\n",
" <td>3200000.0</td>\n",
" <td>3</td>\n",
" <td>3.25</td>\n",
" <td>4560</td>\n",
" <td>13363</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>11</td>\n",
" <td>2760</td>\n",
" <td>1800</td>\n",
" <td>1995</td>\n",
" <td>0</td>\n",
" <td>98004</td>\n",
" <td>47.6205</td>\n",
" <td>-122.214</td>\n",
" <td>4060</td>\n",
" <td>13362</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9208900037</th>\n",
" <td>20140919T000000</td>\n",
" <td>1</td>\n",
" <td>6885000.0</td>\n",
" <td>6</td>\n",
" <td>7.75</td>\n",
" <td>9890</td>\n",
" <td>31374</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>13</td>\n",
" <td>8860</td>\n",
" <td>1030</td>\n",
" <td>2001</td>\n",
" <td>0</td>\n",
" <td>98039</td>\n",
" <td>47.6305</td>\n",
" <td>-122.240</td>\n",
" <td>4540</td>\n",
" <td>42730</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>21 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" date Predicted price bedrooms bathrooms \\\n",
"id \n",
"121039042 20150313T000000 0 425000.0 3 2.75 \n",
"624069108 20140812T000000 0 3200000.0 4 3.25 \n",
"1025039086 20140916T000000 0 1875000.0 3 2.50 \n",
"1732800780 20150212T000000 1 3065000.0 5 3.00 \n",
"2122039094 20141126T000000 0 705000.0 3 3.00 \n",
"2923039243 20141113T000000 0 340000.0 4 1.00 \n",
"3024059014 20150325T000000 0 1900000.0 4 2.25 \n",
"3222049024 20140522T000000 1 361000.0 3 1.00 \n",
"3426049284 20140819T000000 0 2300000.0 4 3.25 \n",
"3741600020 20140915T000000 1 540000.0 3 2.25 \n",
"3760500336 20141126T000000 1 2125000.0 4 2.75 \n",
"3867400175 20150224T000000 1 850000.0 2 1.50 \n",
"6329000050 20150310T000000 0 641500.0 1 1.00 \n",
"6762700020 20141013T000000 1 7700000.0 6 8.00 \n",
"7278100515 20140821T000000 0 1295000.0 2 2.50 \n",
"7490000040 20140718T000000 1 2535000.0 5 3.25 \n",
"7631200292 20140626T000000 1 669000.0 2 1.75 \n",
"7636800041 20140625T000000 0 995000.0 3 4.50 \n",
"8907500070 20150413T000000 1 5350000.0 5 5.00 \n",
"8964800890 20150109T000000 1 3200000.0 3 3.25 \n",
"9208900037 20140919T000000 1 6885000.0 6 7.75 \n",
"\n",
" sqft_living sqft_lot floors waterfront view ... grade \\\n",
"id ... \n",
"121039042 3610 107386 1.5 1 3 ... 8 \n",
"624069108 7000 28206 1.0 1 4 ... 12 \n",
"1025039086 3280 29111 2.0 1 3 ... 11 \n",
"1732800780 4150 7500 2.5 0 4 ... 11 \n",
"2122039094 1970 20978 2.0 1 3 ... 9 \n",
"2923039243 1200 11834 1.0 1 3 ... 6 \n",
"3024059014 3020 11489 1.5 1 3 ... 10 \n",
"3222049024 1100 4046 1.5 0 4 ... 6 \n",
"3426049284 4110 15929 2.0 1 4 ... 12 \n",
"3741600020 2100 20018 1.0 0 4 ... 8 \n",
"3760500336 3190 19513 2.0 0 4 ... 10 \n",
"3867400175 1800 4144 1.0 0 4 ... 7 \n",
"6329000050 1000 9084 1.0 1 3 ... 7 \n",
"6762700020 12050 27600 2.5 0 3 ... 13 \n",
"7278100515 2910 19449 2.0 1 4 ... 9 \n",
"7490000040 3730 10626 1.0 0 4 ... 10 \n",
"7631200292 1950 10766 1.0 0 3 ... 6 \n",
"7636800041 4380 47044 2.0 1 3 ... 9 \n",
"8907500070 8000 23985 2.0 0 4 ... 12 \n",
"8964800890 4560 13363 1.0 0 4 ... 11 \n",
"9208900037 9890 31374 2.0 0 4 ... 13 \n",
"\n",
" sqft_above sqft_basement yr_built yr_renovated zipcode \\\n",
"id \n",
"121039042 3130 480 1918 1962 98023 \n",
"624069108 3500 3500 1991 0 98075 \n",
"1025039086 3280 0 1925 0 98199 \n",
"1732800780 3510 640 1909 0 98119 \n",
"2122039094 1770 200 1980 0 98070 \n",
"2923039243 1200 0 1972 0 98070 \n",
"3024059014 2110 910 1916 1988 98040 \n",
"3222049024 1100 0 1922 0 98198 \n",
"3426049284 2720 1390 2001 0 98115 \n",
"3741600020 1470 630 1948 0 98166 \n",
"3760500336 3190 0 1982 0 98034 \n",
"3867400175 900 900 1962 0 98116 \n",
"6329000050 1000 0 1950 0 98146 \n",
"6762700020 8570 3480 1910 1987 98102 \n",
"7278100515 1940 970 1985 0 98177 \n",
"7490000040 3730 0 1963 0 98004 \n",
"7631200292 1160 790 1952 0 98166 \n",
"7636800041 3720 660 1968 1990 98166 \n",
"8907500070 6720 1280 2009 0 98004 \n",
"8964800890 2760 1800 1995 0 98004 \n",
"9208900037 8860 1030 2001 0 98039 \n",
"\n",
" lat long sqft_living15 sqft_lot15 \n",
"id \n",
"121039042 47.3351 -122.362 2630 42126 \n",
"624069108 47.5928 -122.086 4913 14663 \n",
"1025039086 47.6699 -122.416 3530 21074 \n",
"1732800780 47.6303 -122.362 2250 4050 \n",
"2122039094 47.3844 -122.438 2280 75396 \n",
"2923039243 47.4557 -122.443 1670 47462 \n",
"3024059014 47.5395 -122.210 3890 11489 \n",
"3222049024 47.3440 -122.331 2550 7847 \n",
"3426049284 47.6934 -122.271 2640 15929 \n",
"3741600020 47.4544 -122.366 2410 17196 \n",
"3760500336 47.6991 -122.235 2750 13496 \n",
"3867400175 47.5934 -122.390 2090 4173 \n",
"6329000050 47.5007 -122.382 1090 6536 \n",
"6762700020 47.6298 -122.323 3940 8800 \n",
"7278100515 47.7729 -122.393 2540 23598 \n",
"7490000040 47.6240 -122.221 4180 19110 \n",
"7631200292 47.4504 -122.377 1780 11721 \n",
"7636800041 47.4734 -122.365 2460 18512 \n",
"8907500070 47.6232 -122.220 4600 21750 \n",
"8964800890 47.6205 -122.214 4060 13362 \n",
"9208900037 47.6305 -122.240 4540 42730 \n",
"\n",
"[21 rows x 21 columns]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run the test features through the fitted preprocessing pipeline and label the\n",
"# result with the pipeline's output feature names.\n",
"preprocessing_result = pipeline_end.transform(X_test)\n",
"preprocessed_df = pd.DataFrame(\n",
"    data=preprocessing_result,\n",
"    columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"# Stored predictions of the selected model; compared against y_test below.\n",
"y_pred = class_models[best_model][\"preds\"]\n",
"\n",
"# Index labels of the rows whose prediction differs from the true waterfront value.\n",
"error_index = y_test.loc[y_test[\"waterfront\"] != y_pred].index.tolist()\n",
"display(f\"Error items count: {len(error_index)}\")\n",
"\n",
"# Misclassified rows of X_test with the predicted label inserted as the second column.\n",
"error_df = X_test.loc[error_index].copy()\n",
"error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
"error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
"error_df.sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Пример использования обученной модели (конвейера) для предсказания"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>624069108</th>\n",
" <td>20140812T000000</td>\n",
" <td>3200000.0</td>\n",
" <td>4</td>\n",
" <td>3.25</td>\n",
" <td>7000</td>\n",
" <td>28206</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>12</td>\n",
" <td>3500</td>\n",
" <td>3500</td>\n",
" <td>1991</td>\n",
" <td>0</td>\n",
" <td>98075</td>\n",
" <td>47.5928</td>\n",
" <td>-122.086</td>\n",
" <td>4913</td>\n",
" <td>14663</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
"624069108 20140812T000000 3200000.0 4 3.25 7000 28206 \n",
"\n",
" floors waterfront view condition grade sqft_above sqft_basement \\\n",
"624069108 1.0 1 4 4 12 3500 3500 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"624069108 1991 0 98075 47.5928 -122.086 4913 \n",
"\n",
" sqft_lot15 \n",
"624069108 14663 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Region_north</th>\n",
" <th>House_age</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>view</th>\n",
" <th>condition</th>\n",
" <th>...</th>\n",
" <th>date_20150506T000000</th>\n",
" <th>date_20150507T000000</th>\n",
" <th>date_20150508T000000</th>\n",
" <th>date_20150509T000000</th>\n",
" <th>date_20150510T000000</th>\n",
" <th>date_20150511T000000</th>\n",
" <th>date_20150512T000000</th>\n",
" <th>date_20150513T000000</th>\n",
" <th>date_20150514T000000</th>\n",
" <th>date_20150515T000000</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>624069108</th>\n",
" <td>1.0</td>\n",
" <td>33.0</td>\n",
" <td>7.494206</td>\n",
" <td>0.6818</td>\n",
" <td>1.479217</td>\n",
" <td>5.372072</td>\n",
" <td>0.29821</td>\n",
" <td>-0.918509</td>\n",
" <td>4.922704</td>\n",
" <td>0.909775</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 384 columns</p>\n",
"</div>"
],
"text/plain": [
" Region_north House_age price bedrooms bathrooms \\\n",
"624069108 1.0 33.0 7.494206 0.6818 1.479217 \n",
"\n",
" sqft_living sqft_lot floors view condition ... \\\n",
"624069108 5.372072 0.29821 -0.918509 4.922704 0.909775 ... \n",
"\n",
" date_20150506T000000 date_20150507T000000 date_20150508T000000 \\\n",
"624069108 0.0 0.0 0.0 \n",
"\n",
" date_20150509T000000 date_20150510T000000 date_20150511T000000 \\\n",
"624069108 0.0 0.0 0.0 \n",
"\n",
" date_20150512T000000 date_20150513T000000 date_20150514T000000 \\\n",
"624069108 0.0 0.0 0.0 \n",
"\n",
" date_20150515T000000 \n",
"624069108 0.0 \n",
"\n",
"[1 rows x 384 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'predicted: 0 (proba: [0.8437713 0.1562287])'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'real: 1'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Run the fitted pipeline of the best model on a single row of the test split.\n",
"model = class_models[best_model][\"pipeline\"]\n",
"\n",
"example_id = 624069108\n",
"# Select with a list of labels to get a one-row DataFrame that keeps each\n",
"# column's dtype. Going through a row Series and transposing it back turns a\n",
"# mixed-type row into object-dtype columns.\n",
"test = X_test.loc[[example_id]]\n",
"test_preprocessed = preprocessed_df.loc[[example_id]]\n",
"display(test)\n",
"display(test_preprocessed)\n",
"result_proba = model.predict_proba(test)[0]\n",
"result = model.predict(test)[0]\n",
"real = int(y_test.loc[example_id].values[0])\n",
"display(f\"predicted: {result} (proba: {result_proba})\")\n",
"display(f\"real: {real}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Подбор гиперпараметров методом поиска по сетке\n",
"\n",
"https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv\n",
"\n",
"https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"optimized_model_type = \"random_forest\"\n",
"\n",
"random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
"\n",
"# The \"model__\" prefix routes each parameter to the pipeline step named \"model\".\n",
"# \"log_loss\" is left out of the criteria: scikit-learn documents it as the same\n",
"# Shannon information gain as \"entropy\", so it would only repeat those fits.\n",
"param_grid = {\n",
"    \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
"    \"model__max_features\": [\"sqrt\", \"log2\", 2],\n",
"    \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
"    \"model__criterion\": [\"gini\", \"entropy\"],\n",
"}\n",
"\n",
"# Exhaustive cross-validated search over the grid; n_jobs=-1 runs fits in parallel.\n",
"gs_optimizer = GridSearchCV(\n",
"    estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
")\n",
"gs_optimizer.fit(X_train, y_train.values.ravel())\n",
"gs_optimizer.best_params_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Обучение модели с новыми гиперпараметрами"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"# Random forest with the new hyperparameter values.\n",
"optimized_model = ensemble.RandomForestClassifier(\n",
"    n_estimators=30,\n",
"    max_features=\"sqrt\",\n",
"    max_depth=7,\n",
"    criterion=\"gini\",\n",
"    random_state=random_state,\n",
")\n",
"\n",
"# Fit preprocessing and model as a single pipeline and keep its predictions.\n",
"result = {\n",
"    \"pipeline\": Pipeline(\n",
"        [(\"pipeline\", pipeline_end), (\"model\", optimized_model)]\n",
"    ).fit(X_train, y_train.values.ravel()),\n",
"}\n",
"result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
"# Second predict_proba column, thresholded at 0.5 to get the test labels.\n",
"result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
"result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
"\n",
"# Quality metrics on the train and test splits.\n",
"result.update(\n",
"    {\n",
"        \"Precision_train\": metrics.precision_score(y_train, result[\"train_preds\"]),\n",
"        \"Precision_test\": metrics.precision_score(y_test, result[\"preds\"]),\n",
"        \"Recall_train\": metrics.recall_score(y_train, result[\"train_preds\"]),\n",
"        \"Recall_test\": metrics.recall_score(y_test, result[\"preds\"]),\n",
"        \"Accuracy_train\": metrics.accuracy_score(y_train, result[\"train_preds\"]),\n",
"        \"Accuracy_test\": metrics.accuracy_score(y_test, result[\"preds\"]),\n",
"        \"ROC_AUC_test\": metrics.roc_auc_score(y_test, result[\"probs\"]),\n",
"        \"F1_train\": metrics.f1_score(y_train, result[\"train_preds\"]),\n",
"        \"F1_test\": metrics.f1_score(y_test, result[\"preds\"]),\n",
"        \"MCC_test\": metrics.matthews_corrcoef(y_test, result[\"preds\"]),\n",
"        \"Cohen_kappa_test\": metrics.cohen_kappa_score(y_test, result[\"preds\"]),\n",
"        \"Confusion_matrix\": metrics.confusion_matrix(y_test, result[\"preds\"]),\n",
"    }\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование данных для оценки старой и новой версии модели"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"# Two-row comparison table: the previously trained model and the re-trained one.\n",
"# Its columns are the keys of `result`.\n",
"optimized_metrics = pd.DataFrame(columns=list(result))\n",
"optimized_metrics.loc[0] = pd.Series(class_models[optimized_model_type])\n",
"optimized_metrics.loc[1] = pd.Series(result)\n",
"# Label the rows and use the labels as the index.\n",
"optimized_metrics = optimized_metrics.assign(Name=[\"Old\", \"New\"]).set_index(\"Name\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка параметров старой и новой модели"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_c81c1_row0_col0, #T_c81c1_row0_col2, #T_c81c1_row0_col3, #T_c81c1_row1_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_c81c1_row0_col1, #T_c81c1_row1_col0, #T_c81c1_row1_col2, #T_c81c1_row1_col3 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_c81c1_row0_col4, #T_c81c1_row0_col6, #T_c81c1_row0_col7 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_c81c1_row0_col5, #T_c81c1_row1_col5 {\n",
" background-color: #0d0887;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_c81c1_row1_col4, #T_c81c1_row1_col6, #T_c81c1_row1_col7 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_c81c1\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_c81c1_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
" <th id=\"T_c81c1_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
" <th id=\"T_c81c1_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
" <th id=\"T_c81c1_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
" <th id=\"T_c81c1_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
" <th id=\"T_c81c1_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
" <th id=\"T_c81c1_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
" <th id=\"T_c81c1_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
" </tr>\n",
" <tr>\n",
" <th class=\"index_name level0\" >Name</th>\n",
" <th class=\"blank col0\" >&nbsp;</th>\n",
" <th class=\"blank col1\" >&nbsp;</th>\n",
" <th class=\"blank col2\" >&nbsp;</th>\n",
" <th class=\"blank col3\" >&nbsp;</th>\n",
" <th class=\"blank col4\" >&nbsp;</th>\n",
" <th class=\"blank col5\" >&nbsp;</th>\n",
" <th class=\"blank col6\" >&nbsp;</th>\n",
" <th class=\"blank col7\" >&nbsp;</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_c81c1_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
" <td id=\"T_c81c1_row0_col0\" class=\"data row0 col0\" >0.894340</td>\n",
" <td id=\"T_c81c1_row0_col1\" class=\"data row0 col1\" >0.794118</td>\n",
" <td id=\"T_c81c1_row0_col2\" class=\"data row0 col2\" >0.868132</td>\n",
" <td id=\"T_c81c1_row0_col3\" class=\"data row0 col3\" >0.782609</td>\n",
" <td id=\"T_c81c1_row0_col4\" class=\"data row0 col4\" >0.910112</td>\n",
" <td id=\"T_c81c1_row0_col5\" class=\"data row0 col5\" >0.837989</td>\n",
" <td id=\"T_c81c1_row0_col6\" class=\"data row0 col6\" >0.881041</td>\n",
" <td id=\"T_c81c1_row0_col7\" class=\"data row0 col7\" >0.788321</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_c81c1_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
" <td id=\"T_c81c1_row1_col0\" class=\"data row1 col0\" >0.867220</td>\n",
" <td id=\"T_c81c1_row1_col1\" class=\"data row1 col1\" >0.822581</td>\n",
" <td id=\"T_c81c1_row1_col2\" class=\"data row1 col2\" >0.765568</td>\n",
" <td id=\"T_c81c1_row1_col3\" class=\"data row1 col3\" >0.739130</td>\n",
" <td id=\"T_c81c1_row1_col4\" class=\"data row1 col4\" >0.865169</td>\n",
" <td id=\"T_c81c1_row1_col5\" class=\"data row1 col5\" >0.837989</td>\n",
" <td id=\"T_c81c1_row1_col6\" class=\"data row1 col6\" >0.813230</td>\n",
" <td id=\"T_c81c1_row1_col7\" class=\"data row1 col7\" >0.778626</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x1f1f1135d00>"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train/test metrics of both model versions; accuracy and F1 columns are shaded\n",
"# with the \"plasma\" colormap, precision and recall columns with \"viridis\".\n",
"(\n",
"    optimized_metrics.loc[\n",
"        :,\n",
"        [\n",
"            \"Precision_train\",\n",
"            \"Precision_test\",\n",
"            \"Recall_train\",\n",
"            \"Recall_test\",\n",
"            \"Accuracy_train\",\n",
"            \"Accuracy_test\",\n",
"            \"F1_train\",\n",
"            \"F1_test\",\n",
"        ],\n",
"    ]\n",
"    .style.background_gradient(\n",
"        subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
"        cmap=\"plasma\",\n",
"        low=0.3,\n",
"        high=1,\n",
"    )\n",
"    .background_gradient(\n",
"        subset=[\n",
"            \"Precision_train\",\n",
"            \"Precision_test\",\n",
"            \"Recall_train\",\n",
"            \"Recall_test\",\n",
"        ],\n",
"        cmap=\"viridis\",\n",
"        low=1,\n",
"        high=0.3,\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_fbb13_row0_col0, #T_fbb13_row1_col0 {\n",
" background-color: #440154;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_fbb13_row0_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_fbb13_row0_col2, #T_fbb13_row1_col3, #T_fbb13_row1_col4 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_fbb13_row0_col3, #T_fbb13_row0_col4, #T_fbb13_row1_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_fbb13_row1_col1 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_fbb13\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_fbb13_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
" <th id=\"T_fbb13_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
" <th id=\"T_fbb13_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
" <th id=\"T_fbb13_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
" <th id=\"T_fbb13_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
" </tr>\n",
" <tr>\n",
" <th class=\"index_name level0\" >Name</th>\n",
" <th class=\"blank col0\" >&nbsp;</th>\n",
" <th class=\"blank col1\" >&nbsp;</th>\n",
" <th class=\"blank col2\" >&nbsp;</th>\n",
" <th class=\"blank col3\" >&nbsp;</th>\n",
" <th class=\"blank col4\" >&nbsp;</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_fbb13_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
" <td id=\"T_fbb13_row0_col0\" class=\"data row0 col0\" >0.837989</td>\n",
" <td id=\"T_fbb13_row0_col1\" class=\"data row0 col1\" >0.788321</td>\n",
" <td id=\"T_fbb13_row0_col2\" class=\"data row0 col2\" >0.858893</td>\n",
" <td id=\"T_fbb13_row0_col3\" class=\"data row0 col3\" >0.657111</td>\n",
" <td id=\"T_fbb13_row0_col4\" class=\"data row0 col4\" >0.657157</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_fbb13_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
" <td id=\"T_fbb13_row1_col0\" class=\"data row1 col0\" >0.837989</td>\n",
" <td id=\"T_fbb13_row1_col1\" class=\"data row1 col1\" >0.778626</td>\n",
" <td id=\"T_fbb13_row1_col2\" class=\"data row1 col2\" >0.859750</td>\n",
" <td id=\"T_fbb13_row1_col3\" class=\"data row1 col3\" >0.651447</td>\n",
" <td id=\"T_fbb13_row1_col4\" class=\"data row1 col4\" >0.653765</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x1f1f11345c0>"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test-split quality metrics of both model versions; ROC AUC, MCC and Cohen's\n",
"# kappa are shaded with \"plasma\", accuracy and F1 with \"viridis\".\n",
"(\n",
"    optimized_metrics.loc[\n",
"        :,\n",
"        [\n",
"            \"Accuracy_test\",\n",
"            \"F1_test\",\n",
"            \"ROC_AUC_test\",\n",
"            \"Cohen_kappa_test\",\n",
"            \"MCC_test\",\n",
"        ],\n",
"    ]\n",
"    .style.background_gradient(\n",
"        subset=[\n",
"            \"ROC_AUC_test\",\n",
"            \"MCC_test\",\n",
"            \"Cohen_kappa_test\",\n",
"        ],\n",
"        cmap=\"plasma\",\n",
"        low=0.3,\n",
"        high=1,\n",
"    )\n",
"    .background_gradient(\n",
"        subset=[\n",
"            \"Accuracy_test\",\n",
"            \"F1_test\",\n",
"        ],\n",
"        cmap=\"viridis\",\n",
"        low=1,\n",
"        high=0.3,\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA20AAAGjCAYAAAC/j/0nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABXeklEQVR4nO3deXQUZfr28atDyAJJdwAhCwQIsiTIJqAYcVAwLC4jSAaVyYyAoK8KyKIy4sgqCjIq/FAEZ0QWBRFEEEVgEAUBASUKoiICsgSyoGISFrOQ7vcPhpY2LGlSSXeqvp9z6syku7r66RBz5a7nrqdsLpfLJQAAAACAXwrw9QAAAAAAABdG0QYAAAAAfoyiDQAAAAD8GEUbAAAAAPgxijYAAAAA8GMUbQAAAADgxyjaAAAAAMCPBfp6AACA8pWXl6eCggLDjhcUFKSQkBDDjgcAgDeskGsUbQBgIXl5eYqrF6bMo0WGHTMqKkr79+/3u4ADAJifVXKNog0ALKSgoECZR4t0MLW+7OGl75DPPe5UvTYHVFBQ4FfhBgCwBqvkGkUbAFhQWLhNYeG2Uh/HqdIfAwCA0jJ7rlG0AYAFFbmcKnIZcxwAAHzN7LnG6pEAAAAA4MeYaQMAC3LKJadKf0rSiGMAAFBaZs81ijYAsCCnnDKiAcSYowAAUDpmzzXaIwEAAADAjzHTBgAWVORyqchV+hYQI44BAEBpmT3XKNoAwILM3vsPALAWs+ca7ZEAAAAA4MeYaQMAC3LKpSITn5EEAFiL2XONog0AAABAhWb29kiKNgCwILOHGwAAZkLRBgAWZPZVtgAA1mL2XGMhEgAAAADwY8y0AYAFOf+3GXEcAAB8zey5RtEGABZUZNAqW0YcAwCA0jJ7rtEeCQAAAAB+jJk2ALCgIteZzYjjAADga2bPNYo2ALAgs/f+AwCsxey5RnskAAAAAPgxZtoAwIKcsqlINkOOAwCAr5k91yjaAMCCnK4zmxHHAQDA18yea7RHAgAAAIAfY6YNACyoyKA2EiOOAQBAaZk915hpAwCUm+PHj2vo0KGqV6+eQkNDdf311+uLL75wP+9yuTR69GhFR0crNDRUSUlJ2rNnjw9HDACA71G0AYAFnT0jacTmjQEDBmjNmjV64403tHPnTnXp0kVJSUk6cuSIJGny5MmaNm2aZs6cqa1bt6pq1arq2rWr8vLyyuLbAAAwCV/lWnmhaAMAC3K6bIZtkpSbm+ux5efnF3vP3377TUuWLNHkyZPVoUMHNWzYUGPHjlXDhg01Y8YMuVwuTZ06VU899ZS6d++uFi1aaN68eUpPT9eyZcvK+TsEAKhIjM41f0PRBgAotdjYWDkcDvc2ceLEYvucPn1aRUVFCgkJ8Xg8NDRUGzdu1P79+5WZmamkpCT3cw6HQ+3atdPmzZvL/DMAAOCvWIgEACzI6Au209LSZLfb3Y8HBwcX2zc8PFyJiYl6+umnlZCQoMjISL311lvavHmzGjZsqMzMTElSZGSkx+siIyPdzwEAcD5mX4iEog0ALKhIASoyoNmi6H//a7fbPYq2C3njjTd03333qXbt2qpUqZJat26t3r17KzU1tdRjAQBYl9G55m9ojwQAlJsrr7xS69ev14kTJ5SWlqbPP/9chYWFatCggaKioiRJWVlZHq/JyspyPwcAgBVRtAGABbkMuljbdZkXbFetWlXR0dH69ddftXr1anXv3l1xcXGKiorS2rVr3fvl5uZq69atSkxMNOqjAwBMyNe5VtZojwQAC/JV7//q1avlcrnUpEkT7d27V48//rji4+PVr18/2Ww2DR06VBMmTFCjRo0UFxenUaNGKSYmRj169Cj1WAEA5sU1bQAAGCQnJ0cjR47U4cOHVb16dSUnJ+uZZ55R5cqVJUkjRozQyZMn9cADDyg7O1s33HCDVq1aVWzFSQAArMTmcrlcvh4EAKB85ObmyuFwaOXXcaoaXvoO+ZPHnbqlxX
7l5OSUaCESAACMZJVc45o2AAAAABWaUzY5FWDA5l175PHjxzV06FDVq1dPoaGhuv766/XFF1+4n3e5XBo9erSio6MVGhqqpKQk7dmzx+vPR9EGABbkq3ADAMBMBgwYoDVr1uiNN97Qzp071aVLFyUlJenIkSOSpMmTJ2vatGmaOXOmtm7dqqpVq6pr167Ky8vz6n0o2gDAgs5esG3EBgCAr/ki13777TctWbJEkydPVocOHdSwYUONHTtWDRs21IwZM+RyuTR16lQ99dRT6t69u1q0aKF58+YpPT1dy5Yt8+rzUbQBAAAAwDlyc3M9tvz8/GL7nD59WkVFRcUWywoNDdXGjRu1f/9+ZWZmKikpyf2cw+FQu3bttHnzZq/GQ9EGABZU5AowbAMAwNeMzrXY2Fg5HA73NnHixGLvGR4ersTERD399NNKT09XUVGR3nzzTW3evFkZGRnKzMyUJEVGRnq8LjIy0v1cSbHkPwBY0Jlr2krf2sg1bQAAf2B0rqWlpXmsHhkcHHze/d944w3dd999ql27tipVqqTWrVurd+/eSk1NLfVYzsUpUgAAAAA4h91u99guVLRdeeWVWr9+vU6cOKG0tDR9/vnnKiwsVIMGDRQVFSVJysrK8nhNVlaW+7mSomgDAAtyKkBFBmxOYgQA4Ad8nWtVq1ZVdHS0fv31V61evVrdu3dXXFycoqKitHbtWvd+ubm52rp1qxITE706Pu2RAGBBRl2PVuRyGTAaAABKx1e5tnr1arlcLjVp0kR79+7V448/rvj4ePXr1082m01Dhw7VhAkT1KhRI8XFxWnUqFGKiYlRjx49vHofijYAAAAAuAw5OTkaOXKkDh8+rOrVqys5OVnPPPOMKleuLEkaMWKETp48qQceeEDZ2dm64YYbtGrVqmIrTl6KzeXiNCkAWEVubq4cDocWbG+mKuGVSn28U8eL9NdW3ygnJ8fjgm0AAMqDVXKNmTYAsKAil01FrtKvsmXEMQAAKC2z5xpXkAMAAACAH2OmDQAs6OwqWaU/Dh32AADfM3uuMdMGAAAAAH6MmTYAsCCnK0BOA5ZGdrKWFQDAD5g91yjaAMCCzN5GAgCwFrPnGu2RAAAAAODHmGkDAAtyyphljZ2lHwoAAKVm9lyjaAMAC3IqQE4Dmi2MOAYAAKVl9lzzz1EBAAAAACQx0wYAllTkClCRAatsGXEMAABKy+y5RtEGABbklE1OGdH7X/pjAABQWmbPNf8sJQEAAAAAkphpAwBLMnsbCQDAWsyeaxRtAAAAACo0426uTdEGAPATZg83AADMhKLNR5xOp9LT0xUeHi6bzT8veATgX1wul44fP66YmBgFBJSuWHK6bHIacRNSA44BcyDXAHiLXCs5ijYfSU9PV2xsrK+HAaACSktLU506dXw9DMADuQbgcpFrl0bR5iPh4eGSpINf1pc9jPYieLqzcXNfDwF+6LQKtVEfun9/lIbToPZIJ+2R+B9yDRdDruF8yLWSo2jzkbOtI/awANnD/fOHA74TaKvs6yHAH7nO/I8RrWdOV4CcBqyQZcQxYA7kGi6GXMN5kWsl5p+jAgAAAABIYqYNACypSDYVqfRnNo04BgAApWX2XKNoAwALMnsbCQDAWsyea/45KgAAAACAJGbaAMCSimRMC0hR6YcCAECpmT3XKNoAwILM3kYCALAWs+eaf44KAAAAACCJog0ALKnIFWDY5tX7FhVp1KhRiouLU2hoqK688ko9/fTTcrlc7n1cLpdGjx6t6OhohYaGKikpSXv27DH6WwAAMBFf5Vp58c9RAQBM6bnnntOMGTP08ssva9euXXruuec0efJkvfTSS+59Jk+erGnTpmnmzJnaunWrqlatqq5duyovL8+HIwcAwHe4pg0ALMglm5wGXLDt+t8xcnNzPR4PDg5WcHBwsf0/++wzde/eXbfddpskqX79+nrrrbf0+eefnzmey6WpU6fqqaeeUvfu3SVJ8+bNU2RkpJYtW6
Z77rmn1GMGAJiP0bnmb5hpAwALMrqNJDY2Vg6Hw71NnDjxvO97/fXXa+3atfrhhx8kSTt27NDGjRt1yy23SJL279+
"text/plain": [
"<Figure size 1000x400 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One confusion matrix per row of the comparison table, side by side.\n",
"_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False)\n",
"\n",
"for axis, (model_name, model_metrics) in zip(ax.flat, optimized_metrics.iterrows()):\n",
"    ConfusionMatrixDisplay(\n",
"        confusion_matrix=model_metrics[\"Confusion_matrix\"],\n",
"        display_labels=[\"no water\", \"water\"],\n",
"    ).plot(ax=axis)\n",
"    # Title each panel with its row label so the two matrices can be told apart.\n",
"    axis.set_title(model_name)\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}