4257 lines
308 KiB
Plaintext
4257 lines
308 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Загрузка набора данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>sqft_above</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7129300520</th>\n",
|
|||
|
" <td>20141013T000000</td>\n",
|
|||
|
" <td>221900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98178</td>\n",
|
|||
|
" <td>47.5112</td>\n",
|
|||
|
" <td>-122.257</td>\n",
|
|||
|
" <td>1340</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6414100192</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>538000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2570</td>\n",
|
|||
|
" <td>7242</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>2170</td>\n",
|
|||
|
" <td>400</td>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>98125</td>\n",
|
|||
|
" <td>47.7210</td>\n",
|
|||
|
" <td>-122.319</td>\n",
|
|||
|
" <td>1690</td>\n",
|
|||
|
" <td>7639</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5631500400</th>\n",
|
|||
|
" <td>20150225T000000</td>\n",
|
|||
|
" <td>180000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>10000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98028</td>\n",
|
|||
|
" <td>47.7379</td>\n",
|
|||
|
" <td>-122.233</td>\n",
|
|||
|
" <td>2720</td>\n",
|
|||
|
" <td>8062</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2487200875</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>604000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1050</td>\n",
|
|||
|
" <td>910</td>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98136</td>\n",
|
|||
|
" <td>47.5208</td>\n",
|
|||
|
" <td>-122.393</td>\n",
|
|||
|
" <td>1360</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1954400510</th>\n",
|
|||
|
" <td>20150218T000000</td>\n",
|
|||
|
" <td>510000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>8080</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98074</td>\n",
|
|||
|
" <td>47.6168</td>\n",
|
|||
|
" <td>-122.045</td>\n",
|
|||
|
" <td>1800</td>\n",
|
|||
|
" <td>7503</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>263000018</th>\n",
|
|||
|
" <td>20140521T000000</td>\n",
|
|||
|
" <td>360000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1131</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98103</td>\n",
|
|||
|
" <td>47.6993</td>\n",
|
|||
|
" <td>-122.346</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1509</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6600060120</th>\n",
|
|||
|
" <td>20150223T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>5813</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98146</td>\n",
|
|||
|
" <td>47.5107</td>\n",
|
|||
|
" <td>-122.362</td>\n",
|
|||
|
" <td>1830</td>\n",
|
|||
|
" <td>7200</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1523300141</th>\n",
|
|||
|
" <td>20140623T000000</td>\n",
|
|||
|
" <td>402101.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5944</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>2007</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>291310100</th>\n",
|
|||
|
" <td>20150116T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>2388</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2004</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98027</td>\n",
|
|||
|
" <td>47.5345</td>\n",
|
|||
|
" <td>-122.069</td>\n",
|
|||
|
" <td>1410</td>\n",
|
|||
|
" <td>1287</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1523300157</th>\n",
|
|||
|
" <td>20141015T000000</td>\n",
|
|||
|
" <td>325000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1076</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2008</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5941</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1357</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21613 rows × 20 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price bedrooms bathrooms sqft_living \\\n",
|
|||
|
"id \n",
|
|||
|
"7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
|
|||
|
"6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
|
|||
|
"5631500400 20150225T000000 180000.0 2 1.00 770 \n",
|
|||
|
"2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
|
|||
|
"1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"263000018 20140521T000000 360000.0 3 2.50 1530 \n",
|
|||
|
"6600060120 20150223T000000 400000.0 4 2.50 2310 \n",
|
|||
|
"1523300141 20140623T000000 402101.0 2 0.75 1020 \n",
|
|||
|
"291310100 20150116T000000 400000.0 3 2.50 1600 \n",
|
|||
|
"1523300157 20141015T000000 325000.0 2 0.75 1020 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
|
|||
|
"id \n",
|
|||
|
"7129300520 5650 1.0 0 0 3 7 1180 \n",
|
|||
|
"6414100192 7242 2.0 0 0 3 7 2170 \n",
|
|||
|
"5631500400 10000 1.0 0 0 3 6 770 \n",
|
|||
|
"2487200875 5000 1.0 0 0 5 7 1050 \n",
|
|||
|
"1954400510 8080 1.0 0 0 3 8 1680 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"263000018 1131 3.0 0 0 3 8 1530 \n",
|
|||
|
"6600060120 5813 2.0 0 0 3 8 2310 \n",
|
|||
|
"1523300141 1350 2.0 0 0 3 7 1020 \n",
|
|||
|
"291310100 2388 2.0 0 0 3 8 1600 \n",
|
|||
|
"1523300157 1076 2.0 0 0 3 7 1020 \n",
|
|||
|
"\n",
|
|||
|
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
|||
|
"id \n",
|
|||
|
"7129300520 0 1955 0 98178 47.5112 -122.257 \n",
|
|||
|
"6414100192 400 1951 1991 98125 47.7210 -122.319 \n",
|
|||
|
"5631500400 0 1933 0 98028 47.7379 -122.233 \n",
|
|||
|
"2487200875 910 1965 0 98136 47.5208 -122.393 \n",
|
|||
|
"1954400510 0 1987 0 98074 47.6168 -122.045 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"263000018 0 2009 0 98103 47.6993 -122.346 \n",
|
|||
|
"6600060120 0 2014 0 98146 47.5107 -122.362 \n",
|
|||
|
"1523300141 0 2009 0 98144 47.5944 -122.299 \n",
|
|||
|
"291310100 0 2004 0 98027 47.5345 -122.069 \n",
|
|||
|
"1523300157 0 2008 0 98144 47.5941 -122.299 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living15 sqft_lot15 \n",
|
|||
|
"id \n",
|
|||
|
"7129300520 1340 5650 \n",
|
|||
|
"6414100192 1690 7639 \n",
|
|||
|
"5631500400 2720 8062 \n",
|
|||
|
"2487200875 1360 5000 \n",
|
|||
|
"1954400510 1800 7503 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"263000018 1530 1509 \n",
|
|||
|
"6600060120 1830 7200 \n",
|
|||
|
"1523300141 1020 2007 \n",
|
|||
|
"291310100 1410 1287 \n",
|
|||
|
"1523300157 1020 1357 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 20 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"from sklearn import set_config\n",
|
|||
|
"\n",
|
|||
|
"set_config(transform_output=\"pandas\")\n",
|
|||
|
"\n",
|
|||
|
"random_state=9\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"data/kc_house_data.csv\", index_col=\"id\")\n",
|
|||
|
"\n",
|
|||
|
"df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Разделение набора данных на обучающую и тестовую выборки (80/20) для задачи классификации\n",
|
|||
|
"\n",
|
|||
|
"Целевой признак — `waterfront`"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'X_train'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>sqft_above</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3046200125</th>\n",
|
|||
|
" <td>20150406T000000</td>\n",
|
|||
|
" <td>202000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>740</td>\n",
|
|||
|
" <td>6550</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>740</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1946</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98168</td>\n",
|
|||
|
" <td>47.4807</td>\n",
|
|||
|
" <td>-122.332</td>\n",
|
|||
|
" <td>1080</td>\n",
|
|||
|
" <td>8515</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1853000030</th>\n",
|
|||
|
" <td>20150416T000000</td>\n",
|
|||
|
" <td>775000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>3550</td>\n",
|
|||
|
" <td>32807</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>3550</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1989</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98077</td>\n",
|
|||
|
" <td>47.7292</td>\n",
|
|||
|
" <td>-122.082</td>\n",
|
|||
|
" <td>3270</td>\n",
|
|||
|
" <td>35001</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1825079005</th>\n",
|
|||
|
" <td>20140609T000000</td>\n",
|
|||
|
" <td>739000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2800</td>\n",
|
|||
|
" <td>246114</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>2800</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1999</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98014</td>\n",
|
|||
|
" <td>47.6586</td>\n",
|
|||
|
" <td>-121.962</td>\n",
|
|||
|
" <td>2750</td>\n",
|
|||
|
" <td>60351</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2523039315</th>\n",
|
|||
|
" <td>20141022T000000</td>\n",
|
|||
|
" <td>481000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>2580</td>\n",
|
|||
|
" <td>15653</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>2580</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1990</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98166</td>\n",
|
|||
|
" <td>47.4561</td>\n",
|
|||
|
" <td>-122.361</td>\n",
|
|||
|
" <td>1920</td>\n",
|
|||
|
" <td>9840</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6623400246</th>\n",
|
|||
|
" <td>20140523T000000</td>\n",
|
|||
|
" <td>200000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>11507</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1966</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98055</td>\n",
|
|||
|
" <td>47.4269</td>\n",
|
|||
|
" <td>-122.197</td>\n",
|
|||
|
" <td>1320</td>\n",
|
|||
|
" <td>25675</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2523069134</th>\n",
|
|||
|
" <td>20150406T000000</td>\n",
|
|||
|
" <td>495000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2480</td>\n",
|
|||
|
" <td>91911</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1470</td>\n",
|
|||
|
" <td>1010</td>\n",
|
|||
|
" <td>1973</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98027</td>\n",
|
|||
|
" <td>47.4579</td>\n",
|
|||
|
" <td>-121.981</td>\n",
|
|||
|
" <td>2540</td>\n",
|
|||
|
" <td>91911</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1931300412</th>\n",
|
|||
|
" <td>20150416T000000</td>\n",
|
|||
|
" <td>475000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>1190</td>\n",
|
|||
|
" <td>1200</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1190</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2008</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98103</td>\n",
|
|||
|
" <td>47.6542</td>\n",
|
|||
|
" <td>-122.346</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>1224</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4331000400</th>\n",
|
|||
|
" <td>20150220T000000</td>\n",
|
|||
|
" <td>252000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.50</td>\n",
|
|||
|
" <td>1150</td>\n",
|
|||
|
" <td>13200</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1150</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1956</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98166</td>\n",
|
|||
|
" <td>47.4752</td>\n",
|
|||
|
" <td>-122.345</td>\n",
|
|||
|
" <td>1220</td>\n",
|
|||
|
" <td>13066</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9212900180</th>\n",
|
|||
|
" <td>20140625T000000</td>\n",
|
|||
|
" <td>760000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2760</td>\n",
|
|||
|
" <td>6000</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>2230</td>\n",
|
|||
|
" <td>530</td>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98115</td>\n",
|
|||
|
" <td>47.6877</td>\n",
|
|||
|
" <td>-122.295</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>6000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7000100775</th>\n",
|
|||
|
" <td>20140721T000000</td>\n",
|
|||
|
" <td>625000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1730</td>\n",
|
|||
|
" <td>12219</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1730</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1986</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98004</td>\n",
|
|||
|
" <td>47.5825</td>\n",
|
|||
|
" <td>-122.189</td>\n",
|
|||
|
" <td>2470</td>\n",
|
|||
|
" <td>13594</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>17290 rows × 20 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price bedrooms bathrooms sqft_living \\\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 20150406T000000 202000.0 2 1.00 740 \n",
|
|||
|
"1853000030 20150416T000000 775000.0 3 2.50 3550 \n",
|
|||
|
"1825079005 20140609T000000 739000.0 4 2.50 2800 \n",
|
|||
|
"2523039315 20141022T000000 481000.0 3 2.00 2580 \n",
|
|||
|
"6623400246 20140523T000000 200000.0 4 1.00 1350 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"2523069134 20150406T000000 495000.0 4 2.50 2480 \n",
|
|||
|
"1931300412 20150416T000000 475000.0 3 2.25 1190 \n",
|
|||
|
"4331000400 20150220T000000 252000.0 3 1.50 1150 \n",
|
|||
|
"9212900180 20140625T000000 760000.0 4 2.50 2760 \n",
|
|||
|
"7000100775 20140721T000000 625000.0 3 2.00 1730 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 6550 1.0 0 0 4 5 740 \n",
|
|||
|
"1853000030 32807 2.0 0 0 3 9 3550 \n",
|
|||
|
"1825079005 246114 2.0 0 0 3 9 2800 \n",
|
|||
|
"2523039315 15653 1.5 0 0 3 9 2580 \n",
|
|||
|
"6623400246 11507 1.0 0 0 3 7 1350 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"2523069134 91911 1.0 0 2 4 7 1470 \n",
|
|||
|
"1931300412 1200 3.0 0 0 3 8 1190 \n",
|
|||
|
"4331000400 13200 1.0 0 0 3 7 1150 \n",
|
|||
|
"9212900180 6000 2.0 0 0 5 7 2230 \n",
|
|||
|
"7000100775 12219 1.0 0 0 4 7 1730 \n",
|
|||
|
"\n",
|
|||
|
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 0 1946 0 98168 47.4807 -122.332 \n",
|
|||
|
"1853000030 0 1989 0 98077 47.7292 -122.082 \n",
|
|||
|
"1825079005 0 1999 0 98014 47.6586 -121.962 \n",
|
|||
|
"2523039315 0 1990 0 98166 47.4561 -122.361 \n",
|
|||
|
"6623400246 0 1966 0 98055 47.4269 -122.197 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"2523069134 1010 1973 0 98027 47.4579 -121.981 \n",
|
|||
|
"1931300412 0 2008 0 98103 47.6542 -122.346 \n",
|
|||
|
"4331000400 0 1956 0 98166 47.4752 -122.345 \n",
|
|||
|
"9212900180 530 1942 0 98115 47.6877 -122.295 \n",
|
|||
|
"7000100775 0 1986 0 98004 47.5825 -122.189 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living15 sqft_lot15 \n",
|
|||
|
"id \n",
|
|||
|
"3046200125 1080 8515 \n",
|
|||
|
"1853000030 3270 35001 \n",
|
|||
|
"1825079005 2750 60351 \n",
|
|||
|
"2523039315 1920 9840 \n",
|
|||
|
"6623400246 1320 25675 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"2523069134 2540 91911 \n",
|
|||
|
"1931300412 1180 1224 \n",
|
|||
|
"4331000400 1220 13066 \n",
|
|||
|
"9212900180 1600 6000 \n",
|
|||
|
"7000100775 2470 13594 \n",
|
|||
|
"\n",
|
|||
|
"[17290 rows x 20 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'y_train'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3046200125</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1853000030</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1825079005</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2523039315</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6623400246</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2523069134</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1931300412</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4331000400</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9212900180</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7000100775</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>17290 rows × 1 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" waterfront\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 0\n",
|
|||
|
"1853000030 0\n",
|
|||
|
"1825079005 0\n",
|
|||
|
"2523039315 0\n",
|
|||
|
"6623400246 0\n",
|
|||
|
"... ...\n",
|
|||
|
"2523069134 0\n",
|
|||
|
"1931300412 0\n",
|
|||
|
"4331000400 0\n",
|
|||
|
"9212900180 0\n",
|
|||
|
"7000100775 0\n",
|
|||
|
"\n",
|
|||
|
"[17290 rows x 1 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'X_test'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>sqft_above</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1775950100</th>\n",
|
|||
|
" <td>20150113T000000</td>\n",
|
|||
|
" <td>357823.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.50</td>\n",
|
|||
|
" <td>1240</td>\n",
|
|||
|
" <td>9196</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1240</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1968</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98072</td>\n",
|
|||
|
" <td>47.7562</td>\n",
|
|||
|
" <td>-122.094</td>\n",
|
|||
|
" <td>1690</td>\n",
|
|||
|
" <td>10800</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3550800040</th>\n",
|
|||
|
" <td>20141114T000000</td>\n",
|
|||
|
" <td>223000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>940</td>\n",
|
|||
|
" <td>7980</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>940</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1961</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98146</td>\n",
|
|||
|
" <td>47.5107</td>\n",
|
|||
|
" <td>-122.345</td>\n",
|
|||
|
" <td>1050</td>\n",
|
|||
|
" <td>7980</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1454600256</th>\n",
|
|||
|
" <td>20141013T000000</td>\n",
|
|||
|
" <td>710000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2570</td>\n",
|
|||
|
" <td>9600</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1620</td>\n",
|
|||
|
" <td>950</td>\n",
|
|||
|
" <td>1956</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98125</td>\n",
|
|||
|
" <td>47.7216</td>\n",
|
|||
|
" <td>-122.282</td>\n",
|
|||
|
" <td>2680</td>\n",
|
|||
|
" <td>9900</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1467400095</th>\n",
|
|||
|
" <td>20150224T000000</td>\n",
|
|||
|
" <td>545000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1.75</td>\n",
|
|||
|
" <td>2040</td>\n",
|
|||
|
" <td>53578</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1160</td>\n",
|
|||
|
" <td>880</td>\n",
|
|||
|
" <td>1959</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98038</td>\n",
|
|||
|
" <td>47.3844</td>\n",
|
|||
|
" <td>-122.000</td>\n",
|
|||
|
" <td>2040</td>\n",
|
|||
|
" <td>53578</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>624069003</th>\n",
|
|||
|
" <td>20150102T000000</td>\n",
|
|||
|
" <td>829000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.75</td>\n",
|
|||
|
" <td>2970</td>\n",
|
|||
|
" <td>59677</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1610</td>\n",
|
|||
|
" <td>1360</td>\n",
|
|||
|
" <td>1973</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98075</td>\n",
|
|||
|
" <td>47.5953</td>\n",
|
|||
|
" <td>-122.080</td>\n",
|
|||
|
" <td>2930</td>\n",
|
|||
|
" <td>42489</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3500100189</th>\n",
|
|||
|
" <td>20140630T000000</td>\n",
|
|||
|
" <td>300000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>960</td>\n",
|
|||
|
" <td>8153</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>960</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1947</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98155</td>\n",
|
|||
|
" <td>47.7341</td>\n",
|
|||
|
" <td>-122.300</td>\n",
|
|||
|
" <td>1160</td>\n",
|
|||
|
" <td>8199</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>952001495</th>\n",
|
|||
|
" <td>20150306T000000</td>\n",
|
|||
|
" <td>588000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1.75</td>\n",
|
|||
|
" <td>2170</td>\n",
|
|||
|
" <td>5750</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1370</td>\n",
|
|||
|
" <td>800</td>\n",
|
|||
|
" <td>1975</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98116</td>\n",
|
|||
|
" <td>47.5668</td>\n",
|
|||
|
" <td>-122.383</td>\n",
|
|||
|
" <td>1450</td>\n",
|
|||
|
" <td>5750</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6072300800</th>\n",
|
|||
|
" <td>20150505T000000</td>\n",
|
|||
|
" <td>595000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1.75</td>\n",
|
|||
|
" <td>2510</td>\n",
|
|||
|
" <td>8989</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>830</td>\n",
|
|||
|
" <td>1964</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98006</td>\n",
|
|||
|
" <td>47.5569</td>\n",
|
|||
|
" <td>-122.172</td>\n",
|
|||
|
" <td>2510</td>\n",
|
|||
|
" <td>8931</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2944010240</th>\n",
|
|||
|
" <td>20140908T000000</td>\n",
|
|||
|
" <td>988000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>4040</td>\n",
|
|||
|
" <td>19700</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>4040</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98052</td>\n",
|
|||
|
" <td>47.7205</td>\n",
|
|||
|
" <td>-122.127</td>\n",
|
|||
|
" <td>3930</td>\n",
|
|||
|
" <td>21887</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7893802670</th>\n",
|
|||
|
" <td>20150424T000000</td>\n",
|
|||
|
" <td>279900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>3.25</td>\n",
|
|||
|
" <td>2240</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>1540</td>\n",
|
|||
|
" <td>700</td>\n",
|
|||
|
" <td>1989</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98198</td>\n",
|
|||
|
" <td>47.4114</td>\n",
|
|||
|
" <td>-122.334</td>\n",
|
|||
|
" <td>1800</td>\n",
|
|||
|
" <td>7500</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>4323 rows × 20 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price bedrooms bathrooms sqft_living \\\n",
|
|||
|
"id \n",
|
|||
|
"1775950100 20150113T000000 357823.0 3 1.50 1240 \n",
|
|||
|
"3550800040 20141114T000000 223000.0 3 1.00 940 \n",
|
|||
|
"1454600256 20141013T000000 710000.0 5 2.50 2570 \n",
|
|||
|
"1467400095 20150224T000000 545000.0 4 1.75 2040 \n",
|
|||
|
"624069003 20150102T000000 829000.0 4 2.75 2970 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"3500100189 20140630T000000 300000.0 2 1.00 960 \n",
|
|||
|
"952001495 20150306T000000 588000.0 4 1.75 2170 \n",
|
|||
|
"6072300800 20150505T000000 595000.0 4 1.75 2510 \n",
|
|||
|
"2944010240 20140908T000000 988000.0 4 3.00 4040 \n",
|
|||
|
"7893802670 20150424T000000 279900.0 3 3.25 2240 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
|
|||
|
"id \n",
|
|||
|
"1775950100 9196 1.0 0 0 3 8 1240 \n",
|
|||
|
"3550800040 7980 1.0 0 0 3 6 940 \n",
|
|||
|
"1454600256 9600 1.0 0 2 3 8 1620 \n",
|
|||
|
"1467400095 53578 1.0 0 0 5 7 1160 \n",
|
|||
|
"624069003 59677 1.0 0 2 4 8 1610 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"3500100189 8153 1.0 0 0 3 6 960 \n",
|
|||
|
"952001495 5750 1.0 0 2 3 7 1370 \n",
|
|||
|
"6072300800 8989 1.0 0 0 4 8 1680 \n",
|
|||
|
"2944010240 19700 2.0 0 0 3 11 4040 \n",
|
|||
|
"7893802670 5000 2.0 0 0 3 9 1540 \n",
|
|||
|
"\n",
|
|||
|
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
|||
|
"id \n",
|
|||
|
"1775950100 0 1968 0 98072 47.7562 -122.094 \n",
|
|||
|
"3550800040 0 1961 0 98146 47.5107 -122.345 \n",
|
|||
|
"1454600256 950 1956 0 98125 47.7216 -122.282 \n",
|
|||
|
"1467400095 880 1959 0 98038 47.3844 -122.000 \n",
|
|||
|
"624069003 1360 1973 0 98075 47.5953 -122.080 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"3500100189 0 1947 0 98155 47.7341 -122.300 \n",
|
|||
|
"952001495 800 1975 0 98116 47.5668 -122.383 \n",
|
|||
|
"6072300800 830 1964 0 98006 47.5569 -122.172 \n",
|
|||
|
"2944010240 0 1987 0 98052 47.7205 -122.127 \n",
|
|||
|
"7893802670 700 1989 0 98198 47.4114 -122.334 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living15 sqft_lot15 \n",
|
|||
|
"id \n",
|
|||
|
"1775950100 1690 10800 \n",
|
|||
|
"3550800040 1050 7980 \n",
|
|||
|
"1454600256 2680 9900 \n",
|
|||
|
"1467400095 2040 53578 \n",
|
|||
|
"624069003 2930 42489 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"3500100189 1160 8199 \n",
|
|||
|
"952001495 1450 5750 \n",
|
|||
|
"6072300800 2510 8931 \n",
|
|||
|
"2944010240 3930 21887 \n",
|
|||
|
"7893802670 1800 7500 \n",
|
|||
|
"\n",
|
|||
|
"[4323 rows x 20 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'y_test'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1775950100</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3550800040</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1454600256</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1467400095</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>624069003</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3500100189</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>952001495</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6072300800</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2944010240</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7893802670</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>4323 rows × 1 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" waterfront\n",
|
|||
|
"id \n",
|
|||
|
"1775950100 0\n",
|
|||
|
"3550800040 0\n",
|
|||
|
"1454600256 0\n",
|
|||
|
"1467400095 0\n",
|
|||
|
"624069003 0\n",
|
|||
|
"... ...\n",
|
|||
|
"3500100189 0\n",
|
|||
|
"952001495 0\n",
|
|||
|
"6072300800 0\n",
|
|||
|
"2944010240 0\n",
|
|||
|
"7893802670 0\n",
|
|||
|
"\n",
|
|||
|
"[4323 rows x 1 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from utils import split_stratified_into_train_val_test\n",
|
|||
|
"\n",
|
|||
|
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
|||
|
" df,\n",
|
|||
|
" stratify_colname=\"waterfront\",\n",
|
|||
|
" frac_train=0.80,\n",
|
|||
|
" frac_val=0,\n",
|
|||
|
" frac_test=0.20,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"display(\"X_train\", X_train)\n",
|
|||
|
"display(\"y_train\", y_train)\n",
|
|||
|
"\n",
|
|||
|
"display(\"X_test\", X_test)\n",
|
|||
|
"display(\"y_test\", y_test)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Формирование конвейера для классификации данных\n",
|
|||
|
"\n",
|
|||
|
"preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
|
|||
|
"\n",
|
|||
|
"preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
|
|||
|
"\n",
|
|||
|
"features_preprocessing -- трансформер для предобработки признаков\n",
|
|||
|
"\n",
|
|||
|
"features_engineering -- трансформер для конструирования признаков\n",
|
|||
|
"\n",
|
|||
|
"drop_columns -- трансформер для удаления колонок\n",
|
|||
|
"\n",
|
|||
|
"features_postprocessing -- трансформер для унитарного кодирования новых признаков\n",
|
|||
|
"\n",
|
|||
|
"pipeline_end -- основной конвейер предобработки данных и конструирования признаков\n",
|
|||
|
"\n",
|
|||
|
"Конвейер выполняется последовательно.\n",
|
|||
|
"\n",
|
|||
|
"Трансформер выполняет параллельно для указанного набора колонок.\n",
|
|||
|
"\n",
|
|||
|
"Документация: \n",
|
|||
|
"\n",
|
|||
|
"https://scikit-learn.org/1.5/api/sklearn.pipeline.html\n",
|
|||
|
"\n",
|
|||
|
"https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.discriminant_analysis import StandardScaler\n",
|
|||
|
"from sklearn.impute import SimpleImputer\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"\n",
|
|||
|
"from custom_transformers import HouseFeatures\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"columns_to_drop = [\"waterfront\", \"yr_built\", \"zipcode\"]\n",
|
|||
|
"num_columns = [\n",
|
|||
|
" column\n",
|
|||
|
" for column in df.columns\n",
|
|||
|
" if column not in columns_to_drop and df[column].dtype != \"object\"\n",
|
|||
|
"]\n",
|
|||
|
"cat_columns = [\n",
|
|||
|
" column\n",
|
|||
|
" for column in df.columns\n",
|
|||
|
" if column not in columns_to_drop and df[column].dtype == \"object\"\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"num_imputer = SimpleImputer(strategy=\"median\")\n",
|
|||
|
"num_scaler = StandardScaler()\n",
|
|||
|
"preprocessing_num = Pipeline(\n",
|
|||
|
" [\n",
|
|||
|
" (\"imputer\", num_imputer),\n",
|
|||
|
" (\"scaler\", num_scaler),\n",
|
|||
|
" ]\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=-1)\n",
|
|||
|
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
|
|||
|
"preprocessing_cat = Pipeline(\n",
|
|||
|
" [\n",
|
|||
|
" (\"imputer\", cat_imputer),\n",
|
|||
|
" (\"encoder\", cat_encoder),\n",
|
|||
|
" ]\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"features_preprocessing = ColumnTransformer(\n",
|
|||
|
" verbose_feature_names_out=False,\n",
|
|||
|
" transformers=[\n",
|
|||
|
" (\"prepocessing_num\", preprocessing_num, num_columns),\n",
|
|||
|
" (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
|
|||
|
" (\"prepocessing_features\", cat_imputer, [\"yr_built\", \"zipcode\"]),\n",
|
|||
|
" ],\n",
|
|||
|
" remainder=\"passthrough\",\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"features_engineering = ColumnTransformer(\n",
|
|||
|
" verbose_feature_names_out=False,\n",
|
|||
|
" transformers=[\n",
|
|||
|
" (\"add_features\", HouseFeatures(), [\"yr_built\", \"zipcode\"]),\n",
|
|||
|
" ],\n",
|
|||
|
" remainder=\"passthrough\",\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"drop_columns = ColumnTransformer(\n",
|
|||
|
" verbose_feature_names_out=False,\n",
|
|||
|
" transformers=[\n",
|
|||
|
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
|
|||
|
" ],\n",
|
|||
|
" remainder=\"passthrough\",\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"features_postprocessing = ColumnTransformer(\n",
|
|||
|
" verbose_feature_names_out=False,\n",
|
|||
|
" transformers=[\n",
|
|||
|
" (\"prepocessing_cat\", preprocessing_cat, [\"Region\"]),\n",
|
|||
|
" ],\n",
|
|||
|
" remainder=\"passthrough\",\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"pipeline_end = Pipeline(\n",
|
|||
|
" [\n",
|
|||
|
" (\"features_preprocessing\", features_preprocessing),\n",
|
|||
|
" (\"features_engineering\", features_engineering),\n",
|
|||
|
" (\"drop_columns\", drop_columns),\n",
|
|||
|
" (\"features_postprocessing\", features_postprocessing),\n",
|
|||
|
" ]\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Демонстрация работы конвейера для предобработки данных при классификации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Region_north</th>\n",
|
|||
|
" <th>House_age</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>date_20150506T000000</th>\n",
|
|||
|
" <th>date_20150507T000000</th>\n",
|
|||
|
" <th>date_20150508T000000</th>\n",
|
|||
|
" <th>date_20150509T000000</th>\n",
|
|||
|
" <th>date_20150510T000000</th>\n",
|
|||
|
" <th>date_20150511T000000</th>\n",
|
|||
|
" <th>date_20150512T000000</th>\n",
|
|||
|
" <th>date_20150513T000000</th>\n",
|
|||
|
" <th>date_20150514T000000</th>\n",
|
|||
|
" <th>date_20150515T000000</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3046200125</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>78</td>\n",
|
|||
|
" <td>-0.945119</td>\n",
|
|||
|
" <td>-1.468373</td>\n",
|
|||
|
" <td>-1.448400</td>\n",
|
|||
|
" <td>-1.462069</td>\n",
|
|||
|
" <td>-0.205788</td>\n",
|
|||
|
" <td>-0.918509</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>0.909775</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1853000030</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>35</td>\n",
|
|||
|
" <td>0.667867</td>\n",
|
|||
|
" <td>-0.393286</td>\n",
|
|||
|
" <td>0.503345</td>\n",
|
|||
|
" <td>1.605653</td>\n",
|
|||
|
" <td>0.405288</td>\n",
|
|||
|
" <td>0.935992</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>-0.628763</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1825079005</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>25</td>\n",
|
|||
|
" <td>0.566528</td>\n",
|
|||
|
" <td>0.681800</td>\n",
|
|||
|
" <td>0.503345</td>\n",
|
|||
|
" <td>0.786866</td>\n",
|
|||
|
" <td>5.369556</td>\n",
|
|||
|
" <td>0.935992</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>-0.628763</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2523039315</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>34</td>\n",
|
|||
|
" <td>-0.159739</td>\n",
|
|||
|
" <td>-0.393286</td>\n",
|
|||
|
" <td>-0.147237</td>\n",
|
|||
|
" <td>0.546688</td>\n",
|
|||
|
" <td>0.006065</td>\n",
|
|||
|
" <td>0.008742</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>-0.628763</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6623400246</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>58</td>\n",
|
|||
|
" <td>-0.950749</td>\n",
|
|||
|
" <td>0.681800</td>\n",
|
|||
|
" <td>-1.448400</td>\n",
|
|||
|
" <td>-0.796122</td>\n",
|
|||
|
" <td>-0.090424</td>\n",
|
|||
|
" <td>-0.918509</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>-0.628763</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2523069134</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>51</td>\n",
|
|||
|
" <td>-0.120329</td>\n",
|
|||
|
" <td>0.681800</td>\n",
|
|||
|
" <td>0.503345</td>\n",
|
|||
|
" <td>0.437517</td>\n",
|
|||
|
" <td>1.780808</td>\n",
|
|||
|
" <td>-0.918509</td>\n",
|
|||
|
" <td>2.308411</td>\n",
|
|||
|
" <td>0.909775</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1931300412</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>16</td>\n",
|
|||
|
" <td>-0.176628</td>\n",
|
|||
|
" <td>-0.393286</td>\n",
|
|||
|
" <td>0.178054</td>\n",
|
|||
|
" <td>-0.970797</td>\n",
|
|||
|
" <td>-0.330298</td>\n",
|
|||
|
" <td>2.790494</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>-0.628763</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4331000400</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>68</td>\n",
|
|||
|
" <td>-0.804370</td>\n",
|
|||
|
" <td>-0.393286</td>\n",
|
|||
|
" <td>-0.797819</td>\n",
|
|||
|
" <td>-1.014465</td>\n",
|
|||
|
" <td>-0.051023</td>\n",
|
|||
|
" <td>-0.918509</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>-0.628763</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9212900180</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>82</td>\n",
|
|||
|
" <td>0.625642</td>\n",
|
|||
|
" <td>0.681800</td>\n",
|
|||
|
" <td>0.503345</td>\n",
|
|||
|
" <td>0.743197</td>\n",
|
|||
|
" <td>-0.218588</td>\n",
|
|||
|
" <td>0.935992</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>2.448313</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7000100775</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>38</td>\n",
|
|||
|
" <td>0.245619</td>\n",
|
|||
|
" <td>-0.393286</td>\n",
|
|||
|
" <td>-0.147237</td>\n",
|
|||
|
" <td>-0.381270</td>\n",
|
|||
|
" <td>-0.073854</td>\n",
|
|||
|
" <td>-0.918509</td>\n",
|
|||
|
" <td>-0.305883</td>\n",
|
|||
|
" <td>0.909775</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>17290 rows × 384 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Region_north House_age price bedrooms bathrooms \\\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 0.0 78 -0.945119 -1.468373 -1.448400 \n",
|
|||
|
"1853000030 1.0 35 0.667867 -0.393286 0.503345 \n",
|
|||
|
"1825079005 1.0 25 0.566528 0.681800 0.503345 \n",
|
|||
|
"2523039315 0.0 34 -0.159739 -0.393286 -0.147237 \n",
|
|||
|
"6623400246 1.0 58 -0.950749 0.681800 -1.448400 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"2523069134 1.0 51 -0.120329 0.681800 0.503345 \n",
|
|||
|
"1931300412 1.0 16 -0.176628 -0.393286 0.178054 \n",
|
|||
|
"4331000400 0.0 68 -0.804370 -0.393286 -0.797819 \n",
|
|||
|
"9212900180 1.0 82 0.625642 0.681800 0.503345 \n",
|
|||
|
"7000100775 1.0 38 0.245619 -0.393286 -0.147237 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living sqft_lot floors view condition ... \\\n",
|
|||
|
"id ... \n",
|
|||
|
"3046200125 -1.462069 -0.205788 -0.918509 -0.305883 0.909775 ... \n",
|
|||
|
"1853000030 1.605653 0.405288 0.935992 -0.305883 -0.628763 ... \n",
|
|||
|
"1825079005 0.786866 5.369556 0.935992 -0.305883 -0.628763 ... \n",
|
|||
|
"2523039315 0.546688 0.006065 0.008742 -0.305883 -0.628763 ... \n",
|
|||
|
"6623400246 -0.796122 -0.090424 -0.918509 -0.305883 -0.628763 ... \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"2523069134 0.437517 1.780808 -0.918509 2.308411 0.909775 ... \n",
|
|||
|
"1931300412 -0.970797 -0.330298 2.790494 -0.305883 -0.628763 ... \n",
|
|||
|
"4331000400 -1.014465 -0.051023 -0.918509 -0.305883 -0.628763 ... \n",
|
|||
|
"9212900180 0.743197 -0.218588 0.935992 -0.305883 2.448313 ... \n",
|
|||
|
"7000100775 -0.381270 -0.073854 -0.918509 -0.305883 0.909775 ... \n",
|
|||
|
"\n",
|
|||
|
" date_20150506T000000 date_20150507T000000 date_20150508T000000 \\\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 0.0 0.0 0.0 \n",
|
|||
|
"1853000030 0.0 0.0 0.0 \n",
|
|||
|
"1825079005 0.0 0.0 0.0 \n",
|
|||
|
"2523039315 0.0 0.0 0.0 \n",
|
|||
|
"6623400246 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"2523069134 0.0 0.0 0.0 \n",
|
|||
|
"1931300412 0.0 0.0 0.0 \n",
|
|||
|
"4331000400 0.0 0.0 0.0 \n",
|
|||
|
"9212900180 0.0 0.0 0.0 \n",
|
|||
|
"7000100775 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" date_20150509T000000 date_20150510T000000 date_20150511T000000 \\\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 0.0 0.0 0.0 \n",
|
|||
|
"1853000030 0.0 0.0 0.0 \n",
|
|||
|
"1825079005 0.0 0.0 0.0 \n",
|
|||
|
"2523039315 0.0 0.0 0.0 \n",
|
|||
|
"6623400246 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"2523069134 0.0 0.0 0.0 \n",
|
|||
|
"1931300412 0.0 0.0 0.0 \n",
|
|||
|
"4331000400 0.0 0.0 0.0 \n",
|
|||
|
"9212900180 0.0 0.0 0.0 \n",
|
|||
|
"7000100775 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" date_20150512T000000 date_20150513T000000 date_20150514T000000 \\\n",
|
|||
|
"id \n",
|
|||
|
"3046200125 0.0 0.0 0.0 \n",
|
|||
|
"1853000030 0.0 0.0 0.0 \n",
|
|||
|
"1825079005 0.0 0.0 0.0 \n",
|
|||
|
"2523039315 0.0 0.0 0.0 \n",
|
|||
|
"6623400246 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"2523069134 0.0 0.0 0.0 \n",
|
|||
|
"1931300412 0.0 0.0 0.0 \n",
|
|||
|
"4331000400 0.0 0.0 0.0 \n",
|
|||
|
"9212900180 0.0 0.0 0.0 \n",
|
|||
|
"7000100775 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" date_20150515T000000 \n",
|
|||
|
"id \n",
|
|||
|
"3046200125 0.0 \n",
|
|||
|
"1853000030 0.0 \n",
|
|||
|
"1825079005 0.0 \n",
|
|||
|
"2523039315 0.0 \n",
|
|||
|
"6623400246 0.0 \n",
|
|||
|
"... ... \n",
|
|||
|
"2523069134 0.0 \n",
|
|||
|
"1931300412 0.0 \n",
|
|||
|
"4331000400 0.0 \n",
|
|||
|
"9212900180 0.0 \n",
|
|||
|
"7000100775 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[17290 rows x 384 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
|
|||
|
"preprocessed_df = pd.DataFrame(\n",
|
|||
|
" preprocessing_result,\n",
|
|||
|
" columns=pipeline_end.get_feature_names_out(),\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"preprocessed_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Формирование набора моделей для классификации\n",
|
|||
|
"\n",
|
|||
|
"logistic -- логистическая регрессия\n",
|
|||
|
"\n",
|
|||
|
"ridge -- гребневая регрессия\n",
|
|||
|
"\n",
|
|||
|
"decision_tree -- дерево решений\n",
|
|||
|
"\n",
|
|||
|
"knn -- k-ближайших соседей\n",
|
|||
|
"\n",
|
|||
|
"naive_bayes -- наивный Байесовский классификатор\n",
|
|||
|
"\n",
|
|||
|
"gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
|
|||
|
"\n",
|
|||
|
"random_forest -- метод случайного леса (набор деревьев решений)\n",
|
|||
|
"\n",
|
|||
|
"mlp -- многослойный персептрон (нейронная сеть)\n",
|
|||
|
"\n",
|
|||
|
"Документация: https://scikit-learn.org/1.5/supervised_learning.html"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
|
|||
|
"\n",
|
|||
|
"class_models = {\n",
|
|||
|
" \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
|
|||
|
" # \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n",
|
|||
|
" \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n",
|
|||
|
" \"decision_tree\": {\n",
|
|||
|
" \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
|
|||
|
" },\n",
|
|||
|
" \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
|
|||
|
" \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
|
|||
|
" \"gradient_boosting\": {\n",
|
|||
|
" \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n",
|
|||
|
" },\n",
|
|||
|
" \"random_forest\": {\n",
|
|||
|
" \"model\": ensemble.RandomForestClassifier(\n",
|
|||
|
" max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
" },\n",
|
|||
|
" \"mlp\": {\n",
|
|||
|
" \"model\": neural_network.MLPClassifier(\n",
|
|||
|
" hidden_layer_sizes=(7,),\n",
|
|||
|
" max_iter=500,\n",
|
|||
|
" early_stopping=True,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
" },\n",
|
|||
|
"}"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Обучение моделей на обучающем наборе данных и оценка на тестовом"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: logistic\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
|
|||
|
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
|
|||
|
"\n",
|
|||
|
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
|
|||
|
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
|
|||
|
"Please also refer to the documentation for alternative solver options:\n",
|
|||
|
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
|
|||
|
" n_iter_i = _check_optimize_result(\n",
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: ridge\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
|
|||
|
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
|
|||
|
"\n",
|
|||
|
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
|
|||
|
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
|
|||
|
"Please also refer to the documentation for alternative solver options:\n",
|
|||
|
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
|
|||
|
" n_iter_i = _check_optimize_result(\n",
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: decision_tree\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: knn\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: naive_bayes\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: gradient_boosting\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: random_forest\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: mlp\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"from sklearn import metrics\n",
|
|||
|
"\n",
|
|||
|
"for model_name, model_entry in class_models.items():\n",
|
|||
|
"    print(f\"Model: {model_name}\")\n",
|
|||
|
"    model = model_entry[\"model\"]\n",
|
|||
|
"\n",
|
|||
|
"    # Chain the shared preprocessing with this entry's estimator and fit on the training split.\n",
|
|||
|
"    model_pipeline = Pipeline(\n",
|
|||
|
"        [(\"pipeline\", pipeline_end), (\"model\", model)]\n",
|
|||
|
"    ).fit(X_train, y_train.values.ravel())\n",
|
|||
|
"\n",
|
|||
|
"    y_train_predict = model_pipeline.predict(X_train)\n",
|
|||
|
"    # Test labels are derived from column 1 of predict_proba, thresholded at 0.5.\n",
|
|||
|
"    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
|
|||
|
"    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
|
|||
|
"\n",
|
|||
|
"    # Store the fitted pipeline and the raw test outputs next to the estimator.\n",
|
|||
|
"    model_entry[\"pipeline\"] = model_pipeline\n",
|
|||
|
"    model_entry[\"probs\"] = y_test_probs\n",
|
|||
|
"    model_entry[\"preds\"] = y_test_predict\n",
|
|||
|
"\n",
|
|||
|
"    model_entry[\"Precision_train\"] = metrics.precision_score(y_train, y_train_predict)\n",
|
|||
|
"    model_entry[\"Precision_test\"] = metrics.precision_score(y_test, y_test_predict)\n",
|
|||
|
"    model_entry[\"Recall_train\"] = metrics.recall_score(y_train, y_train_predict)\n",
|
|||
|
"    model_entry[\"Recall_test\"] = metrics.recall_score(y_test, y_test_predict)\n",
|
|||
|
"    model_entry[\"Accuracy_train\"] = metrics.accuracy_score(y_train, y_train_predict)\n",
|
|||
|
"    model_entry[\"Accuracy_test\"] = metrics.accuracy_score(y_test, y_test_predict)\n",
|
|||
|
"    model_entry[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, y_test_probs)\n",
|
|||
|
"    model_entry[\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
|
|||
|
"    model_entry[\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
|
|||
|
"    model_entry[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, y_test_predict)\n",
|
|||
|
"    model_entry[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, y_test_predict)\n",
|
|||
|
"    model_entry[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, y_test_predict)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Сводная таблица оценок качества для использованных моделей классификации\n",
|
|||
|
"\n",
|
|||
|
"Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Матрица неточностей"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2IAAAQ9CAYAAAA70P4+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeVxU5f4H8M8ZYNh3BSQQUdwQ3KhfUSpoJpqapl6z3FO7Gi5grjc3NLX0uuZWmqJXy6ysm0sqLpCmmRvmgrihUAioCCiyzpzfH1xOTsDA6AAzcz7v1+u8rnPOM2eeOVfPp+8zzzlHEEVRBBEREREREdUYRW13gIiIiIiISG5YiBEREREREdUwFmJEREREREQ1jIUYERERERFRDWMhRkREREREVMNYiBEREREREdUwFmJEREREREQ1jIUYERERERFRDWMhRkREREREVMNYiJHBi46OhiAIuHXrVrXs/9atWxAEAdHR0XrZX2xsLARBQGxsrF72R0REZCrmzJkDQRCq1FYQBMyZM6d6O0RUi1iIET2lNWvW6K14IyIiIiJ5Ma/tDhDVNh8fH+Tl5cHCwkKn961ZswZ16tTBsGHDNNZ36NABeXl5UCqVeuwlERGR8ZsxYwamTZtW290gMggsxEj2BEGAlZWV3vanUCj0uj8iIiJTkJubC1tbW5ib8z8/iQBOTSQjtWbNGrRo0QKWlpbw9PREeHg4srKyyrRbvXo1GjZsCGtra/zf//0fjh49itDQUISGhkptyrtGLC0tDcOHD4eXlxcsLS1Rr1499OrVS7pOrUGDBrh06RLi4uIgCAIEQZD2WdE1YidPnsTrr78OZ2dn2NraomXLllixYoV+DwwREZEBKL0W7PLly3jnnXfg7OyMdu3alXuNWEFBASIjI1G3bl3Y29vjjTfewB9//FHufmNjY/H888/DysoKjRo1wmeffVbhdWdbt25FUFAQrK2t4eLiggEDBiAlJaVavi/R0+CQBBmdOXPmICoqCp07d8aYMWOQmJiItWvX4tSpU/jll1+kKYZr167F2LFj0b59e0RGRuLWrVvo3bs3nJ2d4eXlpfUz+vbti0uXLmHcuHFo0KABMjIyEBMTg+TkZDRo0ADLly/HuHHjYGdnhw8//BAA4O7uXuH+YmJi0KNHD9SrVw8TJkyAh4cHEhISsHv3bkyYMEF/B4eIiMiA/OMf/0Djxo2xYMECiKKIjIyMMm1GjhyJrVu34p133sHLL7+Mw4cPo3v37mXanTt3Dl27dkW9evUQFRUFlUqFuXPnom7dumXazp8/HzNnzkT//v0xcuRI3L17F59++ik6dOiAc+fOwcnJqTq+LpFuRCIDt2nTJhGAmJSUJGZkZIhKpVLs0qWLqFKppDarVq0SAYgbN24URVEUCwoKRFdXV/GFF14Qi4qKpHbR0dEiADEkJERal5SUJAIQN23aJIqiKD548EAEIC5evFhrv1q0aKGxn1JHjhwRAYhHjhwRRVEUi4uLRV9fX9HHx0d88OCBRlu1Wl31A0FERGQkZs+eLQIQ33777XLXl4qPjxcBiO+//75Gu3feeUcEIM6ePVta17NnT9HGxkb8888/pXXXrl0Tzc3NNfZ569Yt0czMTJw/f77GPi9cuCCam5uXWU9UWzg1kYzKwYMHUVhYiIiICCgUf/31HTVqFBwcHLBnzx4AwOnTp3H//n2MGjVKYy76wIED4ezsrPUzrK2toVQqERsbiwcPHjxzn8+dO4ekpCRERESUGYGr6i18iYiIjNHo0aO1bt+7dy8AYPz48RrrIyIiNF6rVCocPHgQvXv3hqenp7Tez88P3bp102i7c+dOqNVq9O/fH/fu3ZMWDw8PNG7cGEeOHHmGb0SkP5yaSEbl9u3bAICmTZtqrFcqlWjYsKG0vfR//fz8NNqZm5ujQYMGWj/D0tISn3zyCT744AO4u7vjpZdeQo8ePTBkyBB4eHjo3OcbN24AAAICAn
R+LxERkTHz9fXVuv327dtQKBRo1KiRxvq/53xGRgby8vLK5DpQNuuvXbsGURTRuHHjcj9T17skE1UXFmJE5YiIiEDPnj3xww8/YP/+/Zg5cyYWLlyIw4cPo02bNrXdPSIiIqNgbW1d45+pVqshCAJ++uknmJmZldluZ2dX430iKg+nJpJR8fHxAQAkJiZqrC8sLERSUpK0vfR/r1+/rtGuuLhYuvNhZRo1aoQPPvgABw4cwMWLF1FYWIglS5ZI26s6rbB0lO/ixYtVak9ERCQXPj4+UKvV0uyRUn/PeTc3N1hZWZXJdaBs1jdq1AiiKMLX1xedO3cus7z00kv6/yJET4GFGBmVzp07Q6lUYuXKlRBFUVr/xRdfIDs7W7rL0vPPPw9XV1esX78excXFUrtt27ZVet3X48ePkZ+fr7GuUaNGsLe3R0FBgbTO1ta23Fvm/13btm3h6+uL5cuXl2n/5HcgIiKSm9Lru1auXKmxfvny5RqvzczM0LlzZ/zwww9ITU2V1l+/fh0//fSTRts+ffrAzMwMUVFRZXJWFEXcv39fj9+A6OlxaiIZlbp162L69OmIiopC165d8cYbbyAxMRFr1qzBCy+8gEGDBgEouWZszpw5GDduHDp16oT+/fvj1q1biI6ORqNGjbT+mnX16lW8+uqr6N+/P/z9/WFubo7vv/8e6enpGDBggNQuKCgIa9euxUcffQQ/Pz+4ubmhU6dOZfanUCiwdu1a9OzZE61bt8bw4cNRr149XLlyBZcuXcL+/fv1f6CIiIiMQOvWrfH2229jzZo1yM7Oxssvv4xDhw6V+8vXnDlzcODAAbzyyisYM2YMVCoVVq1ahYCAAMTHx0vtGjVqhI8++gjTp0+XHl1jb2+PpKQkfP/993jvvfcwadKkGvyWROVjIUZGZ86cOahbty5WrVqFyMhIuLi44L333sOCBQs0LsAdO3YsRFHEkiVLMGnSJLRq1Qo//vgjxo8fDysrqwr37+3tjbfffhuHDh3Cf/7zH5ibm6NZs2bYsWMH+vbtK7WbNWsWbt++jUWLFuHhw4cICQkptxADgLCwMBw5cgRRUVFYsmQJ1Go1GjVqhFGjRunvwBARERmhjRs3om7duti2bRt++OEHdOrUCXv27IG3t7dGu6CgIPz000+YNGkSZs6cCW9vb8ydOxcJCQm4cuWKRttp06ahSZMmWLZsGaKiogCU5HuXLl3wxhtv1Nh3I9JGEDk3imRErVajbt266NOnD9avX1/b3SEiIqJn1Lt3b1y6dAnXrl2r7a4Q6YTXiJHJys/PLzM3fMuWLcjMzERoaGjtdIqIiIieWl5ensbra9euYe/evcx1Mkr8RYxMVmxsLCIjI/GPf/wDrq6uOHv2LL744gs0b94cZ86cgVKprO0uEhERkQ7q1auHYcOGSc8OXbt2LQoKCnDu3LkKnxtGZKh4jRiZrAYNGsDb2xsrV65EZmYmXFxcMGTIEHz88ccswoiIiIxQ165d8dVXXyEtLQ2WlpYIDg7GggULWISRUeIvYkRERERERDWM14gRERERERHVMBZiRERERERENYzXiBkYtVqN1NRU2Nvba33oMJEpEkURDx8+hKenJxQK/Y4T5efno7CwsNJ2SqVS63PmiEh+mM0kZ8zm6sNCzMCkpqaWeYAhkdykpKTAy8tLb/vLz8+Hr48d0jJUlbb18PBAUlKSSZ7wiejpMJuJmM3VgYWYgbG3twcA3D7bAA52nDlaG95sEljbXZCtYhThGPZK/w70pbCwEGkZKlw/7Q0H+4r/XeU8VMPv+RQUFhaa3MmeiJ4es7n29WkRVNtdkK1isQhHi39gNlcDFmIGpnTKg4OdQutfSqo+5oJFbXdBvv53D9fqmvpjZy/Azr7ifavBKUdEVBazufYxm2sfs1n/WIgRkWwUiSoUaXliR5GorsHeEBERkZyzmYUYEcmGGiLUqPhkr20bERER6Z+cs5mFGBHJhhoiVDI92RMRERkiOW
czJzoTkWwUiepKFyIiIqo51ZnNH3/8MQRBQEREhLQuPz8f4eHhcHV1hZ2dHfr27Yv09HSN9yUnJ6N79+6wsbGBm5s
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x1000 with 16 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.metrics import ConfusionMatrixDisplay\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Two panels per row; round the row count up so an odd number of models still fits.\n",
|
|||
|
"n_rows = max(1, (len(class_models) + 1) // 2)\n",
|
|||
|
"_, ax = plt.subplots(n_rows, 2, figsize=(12, 10), sharex=False, sharey=False)\n",
|
|||
|
"for index, key in enumerate(class_models.keys()):\n",
|
|||
|
"    c_matrix = class_models[key][\"Confusion_matrix\"]\n",
|
|||
|
"    disp = ConfusionMatrixDisplay(\n",
|
|||
|
"        confusion_matrix=c_matrix, display_labels=[\"no water\", \"water\"]\n",
|
|||
|
"    ).plot(ax=ax.flat[index])\n",
|
|||
|
"    disp.ax_.set_title(key)\n",
|
|||
|
"\n",
|
|||
|
"# Hide any grid cell that was left without a model.\n",
|
|||
|
"for unused_ax in ax.flat[len(class_models) :]:\n",
|
|||
|
"    unused_ax.set_visible(False)\n",
|
|||
|
"\n",
|
|||
|
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Точность, полнота, верность (аккуратность), F-мера"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_060dc_row0_col0 {\n",
|
|||
|
" background-color: #73d056;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row0_col1, #T_060dc_row1_col1 {\n",
|
|||
|
" background-color: #52c569;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row0_col2 {\n",
|
|||
|
" background-color: #3dbc74;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row0_col3 {\n",
|
|||
|
" background-color: #5cc863;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row0_col4, #T_060dc_row3_col4 {\n",
|
|||
|
" background-color: #d9586a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row0_col5, #T_060dc_row0_col7, #T_060dc_row1_col4, #T_060dc_row1_col5, #T_060dc_row2_col4, #T_060dc_row2_col5, #T_060dc_row2_col6, #T_060dc_row3_col5, #T_060dc_row4_col5 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row0_col6, #T_060dc_row5_col7 {\n",
|
|||
|
" background-color: #bb3488;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row1_col0 {\n",
|
|||
|
" background-color: #95d840;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row1_col2 {\n",
|
|||
|
" background-color: #a2da37;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row1_col3, #T_060dc_row2_col3, #T_060dc_row3_col1 {\n",
|
|||
|
" background-color: #3fbc73;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row1_col6 {\n",
|
|||
|
" background-color: #d5546e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row1_col7 {\n",
|
|||
|
" background-color: #d14e72;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row2_col0, #T_060dc_row2_col2, #T_060dc_row4_col1, #T_060dc_row5_col2, #T_060dc_row6_col2, #T_060dc_row6_col3, #T_060dc_row7_col2 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row2_col1 {\n",
|
|||
|
" background-color: #44bf70;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row2_col7 {\n",
|
|||
|
" background-color: #cc4977;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row3_col0 {\n",
|
|||
|
" background-color: #6ccd5a;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row3_col2 {\n",
|
|||
|
" background-color: #31b57b;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row3_col3 {\n",
|
|||
|
" background-color: #32b67a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row3_col6, #T_060dc_row6_col7 {\n",
|
|||
|
" background-color: #b52f8c;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row3_col7 {\n",
|
|||
|
" background-color: #c5407e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row4_col0 {\n",
|
|||
|
" background-color: #98d83e;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row4_col2, #T_060dc_row4_col3, #T_060dc_row7_col0, #T_060dc_row7_col1 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row4_col4, #T_060dc_row5_col5 {\n",
|
|||
|
" background-color: #d8576b;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row4_col6 {\n",
|
|||
|
" background-color: #7501a8;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row4_col7 {\n",
|
|||
|
" background-color: #6900a8;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row5_col0 {\n",
|
|||
|
" background-color: #21a685;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row5_col1 {\n",
|
|||
|
" background-color: #1fa287;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row5_col3 {\n",
|
|||
|
" background-color: #93d741;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row5_col4, #T_060dc_row6_col5 {\n",
|
|||
|
" background-color: #d7566c;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row5_col6 {\n",
|
|||
|
" background-color: #a31e9a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row6_col0 {\n",
|
|||
|
" background-color: #20a386;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row6_col1 {\n",
|
|||
|
" background-color: #1fa088;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row6_col4 {\n",
|
|||
|
" background-color: #d6556d;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row6_col6 {\n",
|
|||
|
" background-color: #a01a9c;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row7_col3 {\n",
|
|||
|
" background-color: #20a486;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_060dc_row7_col4, #T_060dc_row7_col5, #T_060dc_row7_col6, #T_060dc_row7_col7 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_060dc\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
|
|||
|
" <th id=\"T_060dc_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
|
|||
|
" <td id=\"T_060dc_row0_col0\" class=\"data row0 col0\" >0.813725</td>\n",
|
|||
|
" <td id=\"T_060dc_row0_col1\" class=\"data row0 col1\" >0.676471</td>\n",
|
|||
|
" <td id=\"T_060dc_row0_col2\" class=\"data row0 col2\" >0.638462</td>\n",
|
|||
|
" <td id=\"T_060dc_row0_col3\" class=\"data row0 col3\" >0.696970</td>\n",
|
|||
|
" <td id=\"T_060dc_row0_col4\" class=\"data row0 col4\" >0.996183</td>\n",
|
|||
|
" <td id=\"T_060dc_row0_col5\" class=\"data row0 col5\" >0.995142</td>\n",
|
|||
|
" <td id=\"T_060dc_row0_col6\" class=\"data row0 col6\" >0.715517</td>\n",
|
|||
|
" <td id=\"T_060dc_row0_col7\" class=\"data row0 col7\" >0.686567</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row1\" class=\"row_heading level0 row1\" >decision_tree</th>\n",
|
|||
|
" <td id=\"T_060dc_row1_col0\" class=\"data row1 col0\" >0.934307</td>\n",
|
|||
|
" <td id=\"T_060dc_row1_col1\" class=\"data row1 col1\" >0.678571</td>\n",
|
|||
|
" <td id=\"T_060dc_row1_col2\" class=\"data row1 col2\" >0.984615</td>\n",
|
|||
|
" <td id=\"T_060dc_row1_col3\" class=\"data row1 col3\" >0.575758</td>\n",
|
|||
|
" <td id=\"T_060dc_row1_col4\" class=\"data row1 col4\" >0.999364</td>\n",
|
|||
|
" <td id=\"T_060dc_row1_col5\" class=\"data row1 col5\" >0.994680</td>\n",
|
|||
|
" <td id=\"T_060dc_row1_col6\" class=\"data row1 col6\" >0.958801</td>\n",
|
|||
|
" <td id=\"T_060dc_row1_col7\" class=\"data row1 col7\" >0.622951</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row2\" class=\"row_heading level0 row2\" >gradient_boosting</th>\n",
|
|||
|
" <td id=\"T_060dc_row2_col0\" class=\"data row2 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row2_col1\" class=\"data row2 col1\" >0.612903</td>\n",
|
|||
|
" <td id=\"T_060dc_row2_col2\" class=\"data row2 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row2_col3\" class=\"data row2 col3\" >0.575758</td>\n",
|
|||
|
" <td id=\"T_060dc_row2_col4\" class=\"data row2 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row2_col5\" class=\"data row2 col5\" >0.993986</td>\n",
|
|||
|
" <td id=\"T_060dc_row2_col6\" class=\"data row2 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row2_col7\" class=\"data row2 col7\" >0.593750</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row3\" class=\"row_heading level0 row3\" >mlp</th>\n",
|
|||
|
" <td id=\"T_060dc_row3_col0\" class=\"data row3 col0\" >0.789474</td>\n",
|
|||
|
" <td id=\"T_060dc_row3_col1\" class=\"data row3 col1\" >0.586207</td>\n",
|
|||
|
" <td id=\"T_060dc_row3_col2\" class=\"data row3 col2\" >0.576923</td>\n",
|
|||
|
" <td id=\"T_060dc_row3_col3\" class=\"data row3 col3\" >0.515152</td>\n",
|
|||
|
" <td id=\"T_060dc_row3_col4\" class=\"data row3 col4\" >0.995662</td>\n",
|
|||
|
" <td id=\"T_060dc_row3_col5\" class=\"data row3 col5\" >0.993523</td>\n",
|
|||
|
" <td id=\"T_060dc_row3_col6\" class=\"data row3 col6\" >0.666667</td>\n",
|
|||
|
" <td id=\"T_060dc_row3_col7\" class=\"data row3 col7\" >0.548387</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row4\" class=\"row_heading level0 row4\" >knn</th>\n",
|
|||
|
" <td id=\"T_060dc_row4_col0\" class=\"data row4 col0\" >0.950000</td>\n",
|
|||
|
" <td id=\"T_060dc_row4_col1\" class=\"data row4 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row4_col2\" class=\"data row4 col2\" >0.146154</td>\n",
|
|||
|
" <td id=\"T_060dc_row4_col3\" class=\"data row4 col3\" >0.060606</td>\n",
|
|||
|
" <td id=\"T_060dc_row4_col4\" class=\"data row4 col4\" >0.993522</td>\n",
|
|||
|
" <td id=\"T_060dc_row4_col5\" class=\"data row4 col5\" >0.992829</td>\n",
|
|||
|
" <td id=\"T_060dc_row4_col6\" class=\"data row4 col6\" >0.253333</td>\n",
|
|||
|
" <td id=\"T_060dc_row4_col7\" class=\"data row4 col7\" >0.114286</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row5\" class=\"row_heading level0 row5\" >random_forest</th>\n",
|
|||
|
" <td id=\"T_060dc_row5_col0\" class=\"data row5 col0\" >0.372493</td>\n",
|
|||
|
" <td id=\"T_060dc_row5_col1\" class=\"data row5 col1\" >0.333333</td>\n",
|
|||
|
" <td id=\"T_060dc_row5_col2\" class=\"data row5 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row5_col3\" class=\"data row5 col3\" >0.878788</td>\n",
|
|||
|
" <td id=\"T_060dc_row5_col4\" class=\"data row5 col4\" >0.987334</td>\n",
|
|||
|
" <td id=\"T_060dc_row5_col5\" class=\"data row5 col5\" >0.985658</td>\n",
|
|||
|
" <td id=\"T_060dc_row5_col6\" class=\"data row5 col6\" >0.542797</td>\n",
|
|||
|
" <td id=\"T_060dc_row5_col7\" class=\"data row5 col7\" >0.483333</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row6\" class=\"row_heading level0 row6\" >ridge</th>\n",
|
|||
|
" <td id=\"T_060dc_row6_col0\" class=\"data row6 col0\" >0.343915</td>\n",
|
|||
|
" <td id=\"T_060dc_row6_col1\" class=\"data row6 col1\" >0.300971</td>\n",
|
|||
|
" <td id=\"T_060dc_row6_col2\" class=\"data row6 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row6_col3\" class=\"data row6 col3\" >0.939394</td>\n",
|
|||
|
" <td id=\"T_060dc_row6_col4\" class=\"data row6 col4\" >0.985656</td>\n",
|
|||
|
" <td id=\"T_060dc_row6_col5\" class=\"data row6 col5\" >0.982882</td>\n",
|
|||
|
" <td id=\"T_060dc_row6_col6\" class=\"data row6 col6\" >0.511811</td>\n",
|
|||
|
" <td id=\"T_060dc_row6_col7\" class=\"data row6 col7\" >0.455882</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_060dc_level0_row7\" class=\"row_heading level0 row7\" >naive_bayes</th>\n",
|
|||
|
" <td id=\"T_060dc_row7_col0\" class=\"data row7 col0\" >0.018619</td>\n",
|
|||
|
" <td id=\"T_060dc_row7_col1\" class=\"data row7 col1\" >0.006916</td>\n",
|
|||
|
" <td id=\"T_060dc_row7_col2\" class=\"data row7 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_060dc_row7_col3\" class=\"data row7 col3\" >0.363636</td>\n",
|
|||
|
" <td id=\"T_060dc_row7_col4\" class=\"data row7 col4\" >0.603702</td>\n",
|
|||
|
" <td id=\"T_060dc_row7_col5\" class=\"data row7 col5\" >0.596576</td>\n",
|
|||
|
" <td id=\"T_060dc_row7_col6\" class=\"data row7 col6\" >0.036558</td>\n",
|
|||
|
" <td id=\"T_060dc_row7_col7\" class=\"data row7 col7\" >0.013575</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x21a8419a540>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"precision_recall_columns = [\"Precision_train\", \"Precision_test\", \"Recall_train\", \"Recall_test\"]\n",
|
|||
|
"accuracy_f1_columns = [\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"]\n",
|
|||
|
"\n",
|
|||
|
"# One row per model, restricted to the precision/recall/accuracy/F1 columns.\n",
|
|||
|
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
|
|||
|
"    precision_recall_columns + accuracy_f1_columns\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Rows are ordered by test accuracy; each column group gets its own colour map.\n",
|
|||
|
"metrics_by_accuracy = class_metrics.sort_values(by=\"Accuracy_test\", ascending=False)\n",
|
|||
|
"(\n",
|
|||
|
"    metrics_by_accuracy.style.background_gradient(\n",
|
|||
|
"        cmap=\"plasma\", low=0.3, high=1, subset=accuracy_f1_columns\n",
|
|||
|
"    ).background_gradient(\n",
|
|||
|
"        cmap=\"viridis\", low=1, high=0.3, subset=precision_recall_columns\n",
|
|||
|
"    )\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"ROC AUC (площадь под ROC-кривой), каппа Коэна, коэффициент корреляции Мэтьюса"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_8c989_row0_col0, #T_8c989_row0_col1, #T_8c989_row2_col0, #T_8c989_row3_col0, #T_8c989_row6_col0 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row0_col2, #T_8c989_row0_col3, #T_8c989_row0_col4, #T_8c989_row1_col2, #T_8c989_row2_col2, #T_8c989_row3_col2, #T_8c989_row4_col2 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row1_col0, #T_8c989_row4_col0 {\n",
|
|||
|
" background-color: #a0da39;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row1_col1 {\n",
|
|||
|
" background-color: #4ec36b;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row1_col3 {\n",
|
|||
|
" background-color: #b52f8c;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row1_col4 {\n",
|
|||
|
" background-color: #c33d80;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row2_col1 {\n",
|
|||
|
" background-color: #6ece58;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row2_col3, #T_8c989_row2_col4 {\n",
|
|||
|
" background-color: #c6417d;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row3_col1 {\n",
|
|||
|
" background-color: #81d34d;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row3_col3, #T_8c989_row3_col4 {\n",
|
|||
|
" background-color: #cc4977;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row4_col1 {\n",
|
|||
|
" background-color: #56c667;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row4_col3 {\n",
|
|||
|
" background-color: #bb3488;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row4_col4 {\n",
|
|||
|
" background-color: #c43e7f;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row5_col0 {\n",
|
|||
|
" background-color: #a5db36;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row5_col1 {\n",
|
|||
|
" background-color: #21908d;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row5_col2 {\n",
|
|||
|
" background-color: #bc3587;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row5_col3 {\n",
|
|||
|
" background-color: #6c00a8;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row5_col4 {\n",
|
|||
|
" background-color: #8b0aa5;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row6_col1 {\n",
|
|||
|
" background-color: #8ed645;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row6_col2 {\n",
|
|||
|
" background-color: #ad2793;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row6_col3, #T_8c989_row6_col4 {\n",
|
|||
|
" background-color: #d14e72;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row7_col0, #T_8c989_row7_col1 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_8c989_row7_col2, #T_8c989_row7_col3, #T_8c989_row7_col4 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_8c989\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_8c989_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_8c989_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
|
|||
|
" <th id=\"T_8c989_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
|
|||
|
" <th id=\"T_8c989_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
|
|||
|
" <th id=\"T_8c989_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
|
|||
|
" <td id=\"T_8c989_row0_col0\" class=\"data row0 col0\" >0.995142</td>\n",
|
|||
|
" <td id=\"T_8c989_row0_col1\" class=\"data row0 col1\" >0.686567</td>\n",
|
|||
|
" <td id=\"T_8c989_row0_col2\" class=\"data row0 col2\" >0.996073</td>\n",
|
|||
|
" <td id=\"T_8c989_row0_col3\" class=\"data row0 col3\" >0.684120</td>\n",
|
|||
|
" <td id=\"T_8c989_row0_col4\" class=\"data row0 col4\" >0.684197</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row1\" class=\"row_heading level0 row1\" >ridge</th>\n",
|
|||
|
" <td id=\"T_8c989_row1_col0\" class=\"data row1 col0\" >0.982882</td>\n",
|
|||
|
" <td id=\"T_8c989_row1_col1\" class=\"data row1 col1\" >0.455882</td>\n",
|
|||
|
" <td id=\"T_8c989_row1_col2\" class=\"data row1 col2\" >0.995416</td>\n",
|
|||
|
" <td id=\"T_8c989_row1_col3\" class=\"data row1 col3\" >0.449517</td>\n",
|
|||
|
" <td id=\"T_8c989_row1_col4\" class=\"data row1 col4\" >0.526537</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row2\" class=\"row_heading level0 row2\" >mlp</th>\n",
|
|||
|
" <td id=\"T_8c989_row2_col0\" class=\"data row2 col0\" >0.993523</td>\n",
|
|||
|
" <td id=\"T_8c989_row2_col1\" class=\"data row2 col1\" >0.548387</td>\n",
|
|||
|
" <td id=\"T_8c989_row2_col2\" class=\"data row2 col2\" >0.994420</td>\n",
|
|||
|
" <td id=\"T_8c989_row2_col3\" class=\"data row2 col3\" >0.545139</td>\n",
|
|||
|
" <td id=\"T_8c989_row2_col4\" class=\"data row2 col4\" >0.546293</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row3\" class=\"row_heading level0 row3\" >gradient_boosting</th>\n",
|
|||
|
" <td id=\"T_8c989_row3_col0\" class=\"data row3 col0\" >0.993986</td>\n",
|
|||
|
" <td id=\"T_8c989_row3_col1\" class=\"data row3 col1\" >0.593750</td>\n",
|
|||
|
" <td id=\"T_8c989_row3_col2\" class=\"data row3 col2\" >0.994137</td>\n",
|
|||
|
" <td id=\"T_8c989_row3_col3\" class=\"data row3 col3\" >0.590723</td>\n",
|
|||
|
" <td id=\"T_8c989_row3_col4\" class=\"data row3 col4\" >0.591016</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row4\" class=\"row_heading level0 row4\" >random_forest</th>\n",
|
|||
|
" <td id=\"T_8c989_row4_col0\" class=\"data row4 col0\" >0.985658</td>\n",
|
|||
|
" <td id=\"T_8c989_row4_col1\" class=\"data row4 col1\" >0.483333</td>\n",
|
|||
|
" <td id=\"T_8c989_row4_col2\" class=\"data row4 col2\" >0.992880</td>\n",
|
|||
|
" <td id=\"T_8c989_row4_col3\" class=\"data row4 col3\" >0.477550</td>\n",
|
|||
|
" <td id=\"T_8c989_row4_col4\" class=\"data row4 col4\" >0.536289</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row5\" class=\"row_heading level0 row5\" >knn</th>\n",
|
|||
|
" <td id=\"T_8c989_row5_col0\" class=\"data row5 col0\" >0.992829</td>\n",
|
|||
|
" <td id=\"T_8c989_row5_col1\" class=\"data row5 col1\" >0.114286</td>\n",
|
|||
|
" <td id=\"T_8c989_row5_col2\" class=\"data row5 col2\" >0.844971</td>\n",
|
|||
|
" <td id=\"T_8c989_row5_col3\" class=\"data row5 col3\" >0.113512</td>\n",
|
|||
|
" <td id=\"T_8c989_row5_col4\" class=\"data row5 col4\" >0.245298</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row6\" class=\"row_heading level0 row6\" >decision_tree</th>\n",
|
|||
|
" <td id=\"T_8c989_row6_col0\" class=\"data row6 col0\" >0.994680</td>\n",
|
|||
|
" <td id=\"T_8c989_row6_col1\" class=\"data row6 col1\" >0.622951</td>\n",
|
|||
|
" <td id=\"T_8c989_row6_col2\" class=\"data row6 col2\" >0.786180</td>\n",
|
|||
|
" <td id=\"T_8c989_row6_col3\" class=\"data row6 col3\" >0.620290</td>\n",
|
|||
|
" <td id=\"T_8c989_row6_col4\" class=\"data row6 col4\" >0.622414</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_8c989_level0_row7\" class=\"row_heading level0 row7\" >naive_bayes</th>\n",
|
|||
|
" <td id=\"T_8c989_row7_col0\" class=\"data row7 col0\" >0.596576</td>\n",
|
|||
|
" <td id=\"T_8c989_row7_col1\" class=\"data row7 col1\" >0.013575</td>\n",
|
|||
|
" <td id=\"T_8c989_row7_col2\" class=\"data row7 col2\" >0.481002</td>\n",
|
|||
|
" <td id=\"T_8c989_row7_col3\" class=\"data row7 col3\" >-0.001429</td>\n",
|
|||
|
" <td id=\"T_8c989_row7_col4\" class=\"data row7 col4\" >-0.006747</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x21a86eff920>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
|
|||
|
" [\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" ]\n",
|
|||
|
"]\n",
|
|||
|
"class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
|
|||
|
" cmap=\"plasma\",\n",
|
|||
|
" low=0.3,\n",
|
|||
|
" high=1,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" ],\n",
|
|||
|
").background_gradient(\n",
|
|||
|
" cmap=\"viridis\",\n",
|
|||
|
" low=1,\n",
|
|||
|
" high=0.3,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" ],\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'logistic'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
|
|||
|
"\n",
|
|||
|
"display(best_model)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Вывод данных с ошибкой предсказания для оценки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\ogoro\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'Error items count: 21'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>Predicted</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>sqft_above</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>121039042</th>\n",
|
|||
|
" <td>20150313T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>425000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.75</td>\n",
|
|||
|
" <td>3610</td>\n",
|
|||
|
" <td>107386</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>3130</td>\n",
|
|||
|
" <td>480</td>\n",
|
|||
|
" <td>1918</td>\n",
|
|||
|
" <td>1962</td>\n",
|
|||
|
" <td>98023</td>\n",
|
|||
|
" <td>47.3351</td>\n",
|
|||
|
" <td>-122.362</td>\n",
|
|||
|
" <td>2630</td>\n",
|
|||
|
" <td>42126</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>624069108</th>\n",
|
|||
|
" <td>20140812T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3200000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.25</td>\n",
|
|||
|
" <td>7000</td>\n",
|
|||
|
" <td>28206</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" <td>3500</td>\n",
|
|||
|
" <td>3500</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98075</td>\n",
|
|||
|
" <td>47.5928</td>\n",
|
|||
|
" <td>-122.086</td>\n",
|
|||
|
" <td>4913</td>\n",
|
|||
|
" <td>14663</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1025039086</th>\n",
|
|||
|
" <td>20140916T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1875000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>3280</td>\n",
|
|||
|
" <td>29111</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>3280</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1925</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98199</td>\n",
|
|||
|
" <td>47.6699</td>\n",
|
|||
|
" <td>-122.416</td>\n",
|
|||
|
" <td>3530</td>\n",
|
|||
|
" <td>21074</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1732800780</th>\n",
|
|||
|
" <td>20150212T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3065000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>4150</td>\n",
|
|||
|
" <td>7500</td>\n",
|
|||
|
" <td>2.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>3510</td>\n",
|
|||
|
" <td>640</td>\n",
|
|||
|
" <td>1909</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98119</td>\n",
|
|||
|
" <td>47.6303</td>\n",
|
|||
|
" <td>-122.362</td>\n",
|
|||
|
" <td>2250</td>\n",
|
|||
|
" <td>4050</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2122039094</th>\n",
|
|||
|
" <td>20141126T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>705000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>1970</td>\n",
|
|||
|
" <td>20978</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>1770</td>\n",
|
|||
|
" <td>200</td>\n",
|
|||
|
" <td>1980</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98070</td>\n",
|
|||
|
" <td>47.3844</td>\n",
|
|||
|
" <td>-122.438</td>\n",
|
|||
|
" <td>2280</td>\n",
|
|||
|
" <td>75396</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2923039243</th>\n",
|
|||
|
" <td>20141113T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>340000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1200</td>\n",
|
|||
|
" <td>11834</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>1200</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1972</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98070</td>\n",
|
|||
|
" <td>47.4557</td>\n",
|
|||
|
" <td>-122.443</td>\n",
|
|||
|
" <td>1670</td>\n",
|
|||
|
" <td>47462</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3024059014</th>\n",
|
|||
|
" <td>20150325T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1900000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>3020</td>\n",
|
|||
|
" <td>11489</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>2110</td>\n",
|
|||
|
" <td>910</td>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>1988</td>\n",
|
|||
|
" <td>98040</td>\n",
|
|||
|
" <td>47.5395</td>\n",
|
|||
|
" <td>-122.210</td>\n",
|
|||
|
" <td>3890</td>\n",
|
|||
|
" <td>11489</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3222049024</th>\n",
|
|||
|
" <td>20140522T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>361000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1100</td>\n",
|
|||
|
" <td>4046</td>\n",
|
|||
|
" <td>1.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>1100</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1922</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98198</td>\n",
|
|||
|
" <td>47.3440</td>\n",
|
|||
|
" <td>-122.331</td>\n",
|
|||
|
" <td>2550</td>\n",
|
|||
|
" <td>7847</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3426049284</th>\n",
|
|||
|
" <td>20140819T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2300000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.25</td>\n",
|
|||
|
" <td>4110</td>\n",
|
|||
|
" <td>15929</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" <td>2720</td>\n",
|
|||
|
" <td>1390</td>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98115</td>\n",
|
|||
|
" <td>47.6934</td>\n",
|
|||
|
" <td>-122.271</td>\n",
|
|||
|
" <td>2640</td>\n",
|
|||
|
" <td>15929</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3741600020</th>\n",
|
|||
|
" <td>20140915T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>540000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2100</td>\n",
|
|||
|
" <td>20018</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1470</td>\n",
|
|||
|
" <td>630</td>\n",
|
|||
|
" <td>1948</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98166</td>\n",
|
|||
|
" <td>47.4544</td>\n",
|
|||
|
" <td>-122.366</td>\n",
|
|||
|
" <td>2410</td>\n",
|
|||
|
" <td>17196</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3760500336</th>\n",
|
|||
|
" <td>20141126T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2125000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.75</td>\n",
|
|||
|
" <td>3190</td>\n",
|
|||
|
" <td>19513</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>3190</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1982</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98034</td>\n",
|
|||
|
" <td>47.6991</td>\n",
|
|||
|
" <td>-122.235</td>\n",
|
|||
|
" <td>2750</td>\n",
|
|||
|
" <td>13496</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3867400175</th>\n",
|
|||
|
" <td>20150224T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>850000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.50</td>\n",
|
|||
|
" <td>1800</td>\n",
|
|||
|
" <td>4144</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>900</td>\n",
|
|||
|
" <td>900</td>\n",
|
|||
|
" <td>1962</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98116</td>\n",
|
|||
|
" <td>47.5934</td>\n",
|
|||
|
" <td>-122.390</td>\n",
|
|||
|
" <td>2090</td>\n",
|
|||
|
" <td>4173</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6329000050</th>\n",
|
|||
|
" <td>20150310T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>641500.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1000</td>\n",
|
|||
|
" <td>9084</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1950</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98146</td>\n",
|
|||
|
" <td>47.5007</td>\n",
|
|||
|
" <td>-122.382</td>\n",
|
|||
|
" <td>1090</td>\n",
|
|||
|
" <td>6536</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6762700020</th>\n",
|
|||
|
" <td>20141013T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>7700000.0</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>8.00</td>\n",
|
|||
|
" <td>12050</td>\n",
|
|||
|
" <td>27600</td>\n",
|
|||
|
" <td>2.5</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>13</td>\n",
|
|||
|
" <td>8570</td>\n",
|
|||
|
" <td>3480</td>\n",
|
|||
|
" <td>1910</td>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>98102</td>\n",
|
|||
|
" <td>47.6298</td>\n",
|
|||
|
" <td>-122.323</td>\n",
|
|||
|
" <td>3940</td>\n",
|
|||
|
" <td>8800</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7278100515</th>\n",
|
|||
|
" <td>20140821T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1295000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2910</td>\n",
|
|||
|
" <td>19449</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>1940</td>\n",
|
|||
|
" <td>970</td>\n",
|
|||
|
" <td>1985</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98177</td>\n",
|
|||
|
" <td>47.7729</td>\n",
|
|||
|
" <td>-122.393</td>\n",
|
|||
|
" <td>2540</td>\n",
|
|||
|
" <td>23598</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7490000040</th>\n",
|
|||
|
" <td>20140718T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2535000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>3.25</td>\n",
|
|||
|
" <td>3730</td>\n",
|
|||
|
" <td>10626</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>3730</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98004</td>\n",
|
|||
|
" <td>47.6240</td>\n",
|
|||
|
" <td>-122.221</td>\n",
|
|||
|
" <td>4180</td>\n",
|
|||
|
" <td>19110</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7631200292</th>\n",
|
|||
|
" <td>20140626T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>669000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.75</td>\n",
|
|||
|
" <td>1950</td>\n",
|
|||
|
" <td>10766</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>1160</td>\n",
|
|||
|
" <td>790</td>\n",
|
|||
|
" <td>1952</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98166</td>\n",
|
|||
|
" <td>47.4504</td>\n",
|
|||
|
" <td>-122.377</td>\n",
|
|||
|
" <td>1780</td>\n",
|
|||
|
" <td>11721</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7636800041</th>\n",
|
|||
|
" <td>20140625T000000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>995000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>4.50</td>\n",
|
|||
|
" <td>4380</td>\n",
|
|||
|
" <td>47044</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>3720</td>\n",
|
|||
|
" <td>660</td>\n",
|
|||
|
" <td>1968</td>\n",
|
|||
|
" <td>1990</td>\n",
|
|||
|
" <td>98166</td>\n",
|
|||
|
" <td>47.4734</td>\n",
|
|||
|
" <td>-122.365</td>\n",
|
|||
|
" <td>2460</td>\n",
|
|||
|
" <td>18512</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8907500070</th>\n",
|
|||
|
" <td>20150413T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>5350000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>5.00</td>\n",
|
|||
|
" <td>8000</td>\n",
|
|||
|
" <td>23985</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" <td>6720</td>\n",
|
|||
|
" <td>1280</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98004</td>\n",
|
|||
|
" <td>47.6232</td>\n",
|
|||
|
" <td>-122.220</td>\n",
|
|||
|
" <td>4600</td>\n",
|
|||
|
" <td>21750</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8964800890</th>\n",
|
|||
|
" <td>20150109T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3200000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>3.25</td>\n",
|
|||
|
" <td>4560</td>\n",
|
|||
|
" <td>13363</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>2760</td>\n",
|
|||
|
" <td>1800</td>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98004</td>\n",
|
|||
|
" <td>47.6205</td>\n",
|
|||
|
" <td>-122.214</td>\n",
|
|||
|
" <td>4060</td>\n",
|
|||
|
" <td>13362</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9208900037</th>\n",
|
|||
|
" <td>20140919T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>6885000.0</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>7.75</td>\n",
|
|||
|
" <td>9890</td>\n",
|
|||
|
" <td>31374</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>13</td>\n",
|
|||
|
" <td>8860</td>\n",
|
|||
|
" <td>1030</td>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98039</td>\n",
|
|||
|
" <td>47.6305</td>\n",
|
|||
|
" <td>-122.240</td>\n",
|
|||
|
" <td>4540</td>\n",
|
|||
|
" <td>42730</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date Predicted price bedrooms bathrooms \\\n",
|
|||
|
"id \n",
|
|||
|
"121039042 20150313T000000 0 425000.0 3 2.75 \n",
|
|||
|
"624069108 20140812T000000 0 3200000.0 4 3.25 \n",
|
|||
|
"1025039086 20140916T000000 0 1875000.0 3 2.50 \n",
|
|||
|
"1732800780 20150212T000000 1 3065000.0 5 3.00 \n",
|
|||
|
"2122039094 20141126T000000 0 705000.0 3 3.00 \n",
|
|||
|
"2923039243 20141113T000000 0 340000.0 4 1.00 \n",
|
|||
|
"3024059014 20150325T000000 0 1900000.0 4 2.25 \n",
|
|||
|
"3222049024 20140522T000000 1 361000.0 3 1.00 \n",
|
|||
|
"3426049284 20140819T000000 0 2300000.0 4 3.25 \n",
|
|||
|
"3741600020 20140915T000000 1 540000.0 3 2.25 \n",
|
|||
|
"3760500336 20141126T000000 1 2125000.0 4 2.75 \n",
|
|||
|
"3867400175 20150224T000000 1 850000.0 2 1.50 \n",
|
|||
|
"6329000050 20150310T000000 0 641500.0 1 1.00 \n",
|
|||
|
"6762700020 20141013T000000 1 7700000.0 6 8.00 \n",
|
|||
|
"7278100515 20140821T000000 0 1295000.0 2 2.50 \n",
|
|||
|
"7490000040 20140718T000000 1 2535000.0 5 3.25 \n",
|
|||
|
"7631200292 20140626T000000 1 669000.0 2 1.75 \n",
|
|||
|
"7636800041 20140625T000000 0 995000.0 3 4.50 \n",
|
|||
|
"8907500070 20150413T000000 1 5350000.0 5 5.00 \n",
|
|||
|
"8964800890 20150109T000000 1 3200000.0 3 3.25 \n",
|
|||
|
"9208900037 20140919T000000 1 6885000.0 6 7.75 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living sqft_lot floors waterfront view ... grade \\\n",
|
|||
|
"id ... \n",
|
|||
|
"121039042 3610 107386 1.5 1 3 ... 8 \n",
|
|||
|
"624069108 7000 28206 1.0 1 4 ... 12 \n",
|
|||
|
"1025039086 3280 29111 2.0 1 3 ... 11 \n",
|
|||
|
"1732800780 4150 7500 2.5 0 4 ... 11 \n",
|
|||
|
"2122039094 1970 20978 2.0 1 3 ... 9 \n",
|
|||
|
"2923039243 1200 11834 1.0 1 3 ... 6 \n",
|
|||
|
"3024059014 3020 11489 1.5 1 3 ... 10 \n",
|
|||
|
"3222049024 1100 4046 1.5 0 4 ... 6 \n",
|
|||
|
"3426049284 4110 15929 2.0 1 4 ... 12 \n",
|
|||
|
"3741600020 2100 20018 1.0 0 4 ... 8 \n",
|
|||
|
"3760500336 3190 19513 2.0 0 4 ... 10 \n",
|
|||
|
"3867400175 1800 4144 1.0 0 4 ... 7 \n",
|
|||
|
"6329000050 1000 9084 1.0 1 3 ... 7 \n",
|
|||
|
"6762700020 12050 27600 2.5 0 3 ... 13 \n",
|
|||
|
"7278100515 2910 19449 2.0 1 4 ... 9 \n",
|
|||
|
"7490000040 3730 10626 1.0 0 4 ... 10 \n",
|
|||
|
"7631200292 1950 10766 1.0 0 3 ... 6 \n",
|
|||
|
"7636800041 4380 47044 2.0 1 3 ... 9 \n",
|
|||
|
"8907500070 8000 23985 2.0 0 4 ... 12 \n",
|
|||
|
"8964800890 4560 13363 1.0 0 4 ... 11 \n",
|
|||
|
"9208900037 9890 31374 2.0 0 4 ... 13 \n",
|
|||
|
"\n",
|
|||
|
" sqft_above sqft_basement yr_built yr_renovated zipcode \\\n",
|
|||
|
"id \n",
|
|||
|
"121039042 3130 480 1918 1962 98023 \n",
|
|||
|
"624069108 3500 3500 1991 0 98075 \n",
|
|||
|
"1025039086 3280 0 1925 0 98199 \n",
|
|||
|
"1732800780 3510 640 1909 0 98119 \n",
|
|||
|
"2122039094 1770 200 1980 0 98070 \n",
|
|||
|
"2923039243 1200 0 1972 0 98070 \n",
|
|||
|
"3024059014 2110 910 1916 1988 98040 \n",
|
|||
|
"3222049024 1100 0 1922 0 98198 \n",
|
|||
|
"3426049284 2720 1390 2001 0 98115 \n",
|
|||
|
"3741600020 1470 630 1948 0 98166 \n",
|
|||
|
"3760500336 3190 0 1982 0 98034 \n",
|
|||
|
"3867400175 900 900 1962 0 98116 \n",
|
|||
|
"6329000050 1000 0 1950 0 98146 \n",
|
|||
|
"6762700020 8570 3480 1910 1987 98102 \n",
|
|||
|
"7278100515 1940 970 1985 0 98177 \n",
|
|||
|
"7490000040 3730 0 1963 0 98004 \n",
|
|||
|
"7631200292 1160 790 1952 0 98166 \n",
|
|||
|
"7636800041 3720 660 1968 1990 98166 \n",
|
|||
|
"8907500070 6720 1280 2009 0 98004 \n",
|
|||
|
"8964800890 2760 1800 1995 0 98004 \n",
|
|||
|
"9208900037 8860 1030 2001 0 98039 \n",
|
|||
|
"\n",
|
|||
|
" lat long sqft_living15 sqft_lot15 \n",
|
|||
|
"id \n",
|
|||
|
"121039042 47.3351 -122.362 2630 42126 \n",
|
|||
|
"624069108 47.5928 -122.086 4913 14663 \n",
|
|||
|
"1025039086 47.6699 -122.416 3530 21074 \n",
|
|||
|
"1732800780 47.6303 -122.362 2250 4050 \n",
|
|||
|
"2122039094 47.3844 -122.438 2280 75396 \n",
|
|||
|
"2923039243 47.4557 -122.443 1670 47462 \n",
|
|||
|
"3024059014 47.5395 -122.210 3890 11489 \n",
|
|||
|
"3222049024 47.3440 -122.331 2550 7847 \n",
|
|||
|
"3426049284 47.6934 -122.271 2640 15929 \n",
|
|||
|
"3741600020 47.4544 -122.366 2410 17196 \n",
|
|||
|
"3760500336 47.6991 -122.235 2750 13496 \n",
|
|||
|
"3867400175 47.5934 -122.390 2090 4173 \n",
|
|||
|
"6329000050 47.5007 -122.382 1090 6536 \n",
|
|||
|
"6762700020 47.6298 -122.323 3940 8800 \n",
|
|||
|
"7278100515 47.7729 -122.393 2540 23598 \n",
|
|||
|
"7490000040 47.6240 -122.221 4180 19110 \n",
|
|||
|
"7631200292 47.4504 -122.377 1780 11721 \n",
|
|||
|
"7636800041 47.4734 -122.365 2460 18512 \n",
|
|||
|
"8907500070 47.6232 -122.220 4600 21750 \n",
|
|||
|
"8964800890 47.6205 -122.214 4060 13362 \n",
|
|||
|
"9208900037 47.6305 -122.240 4540 42730 \n",
|
|||
|
"\n",
|
|||
|
"[21 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"preprocessing_result = pipeline_end.transform(X_test)\n",
|
|||
|
"preprocessed_df = pd.DataFrame(\n",
|
|||
|
" preprocessing_result,\n",
|
|||
|
" columns=pipeline_end.get_feature_names_out(),\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"y_pred = class_models[best_model][\"preds\"]\n",
|
|||
|
"\n",
|
|||
|
"error_index = y_test[y_test[\"waterfront\"] != y_pred].index.tolist()\n",
|
|||
|
"display(f\"Error items count: {len(error_index)}\")\n",
|
|||
|
"\n",
|
|||
|
"error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
|
|||
|
"error_df = X_test.loc[error_index].copy()\n",
|
|||
|
"error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
|
|||
|
"error_df.sort_index()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Пример использования обученной модели (конвейера) для предсказания"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>sqft_above</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>624069108</th>\n",
|
|||
|
" <td>20140812T000000</td>\n",
|
|||
|
" <td>3200000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.25</td>\n",
|
|||
|
" <td>7000</td>\n",
|
|||
|
" <td>28206</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" <td>3500</td>\n",
|
|||
|
" <td>3500</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98075</td>\n",
|
|||
|
" <td>47.5928</td>\n",
|
|||
|
" <td>-122.086</td>\n",
|
|||
|
" <td>4913</td>\n",
|
|||
|
" <td>14663</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
|
|||
|
"624069108 20140812T000000 3200000.0 4 3.25 7000 28206 \n",
|
|||
|
"\n",
|
|||
|
" floors waterfront view condition grade sqft_above sqft_basement \\\n",
|
|||
|
"624069108 1.0 1 4 4 12 3500 3500 \n",
|
|||
|
"\n",
|
|||
|
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
|||
|
"624069108 1991 0 98075 47.5928 -122.086 4913 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot15 \n",
|
|||
|
"624069108 14663 "
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Region_north</th>\n",
|
|||
|
" <th>House_age</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>date_20150506T000000</th>\n",
|
|||
|
" <th>date_20150507T000000</th>\n",
|
|||
|
" <th>date_20150508T000000</th>\n",
|
|||
|
" <th>date_20150509T000000</th>\n",
|
|||
|
" <th>date_20150510T000000</th>\n",
|
|||
|
" <th>date_20150511T000000</th>\n",
|
|||
|
" <th>date_20150512T000000</th>\n",
|
|||
|
" <th>date_20150513T000000</th>\n",
|
|||
|
" <th>date_20150514T000000</th>\n",
|
|||
|
" <th>date_20150515T000000</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>624069108</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>33.0</td>\n",
|
|||
|
" <td>7.494206</td>\n",
|
|||
|
" <td>0.6818</td>\n",
|
|||
|
" <td>1.479217</td>\n",
|
|||
|
" <td>5.372072</td>\n",
|
|||
|
" <td>0.29821</td>\n",
|
|||
|
" <td>-0.918509</td>\n",
|
|||
|
" <td>4.922704</td>\n",
|
|||
|
" <td>0.909775</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>1 rows × 384 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Region_north House_age price bedrooms bathrooms \\\n",
|
|||
|
"624069108 1.0 33.0 7.494206 0.6818 1.479217 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living sqft_lot floors view condition ... \\\n",
|
|||
|
"624069108 5.372072 0.29821 -0.918509 4.922704 0.909775 ... \n",
|
|||
|
"\n",
|
|||
|
" date_20150506T000000 date_20150507T000000 date_20150508T000000 \\\n",
|
|||
|
"624069108 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" date_20150509T000000 date_20150510T000000 date_20150511T000000 \\\n",
|
|||
|
"624069108 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" date_20150512T000000 date_20150513T000000 date_20150514T000000 \\\n",
|
|||
|
"624069108 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" date_20150515T000000 \n",
|
|||
|
"624069108 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[1 rows x 384 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'predicted: 0 (proba: [0.8437713 0.1562287])'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'real: 1'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"model = class_models[best_model][\"pipeline\"]\n",
|
|||
|
"\n",
|
|||
|
"example_id = 624069108\n",
|
|||
|
"test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
|
|||
|
"test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
|
|||
|
"display(test)\n",
|
|||
|
"display(test_preprocessed)\n",
|
|||
|
"result_proba = model.predict_proba(test)[0]\n",
|
|||
|
"result = model.predict(test)[0]\n",
|
|||
|
"real = int(y_test.loc[example_id].values[0])\n",
|
|||
|
"display(f\"predicted: {result} (proba: {result_proba})\")\n",
|
|||
|
"display(f\"real: {real}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Подбор гиперпараметров методом поиска по сетке\n",
|
|||
|
"\n",
|
|||
|
"https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv\n",
|
|||
|
"\n",
|
|||
|
"https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import GridSearchCV\n",
|
|||
|
"\n",
|
|||
|
"optimized_model_type = \"random_forest\"\n",
|
|||
|
"\n",
|
|||
|
"random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
|
|||
|
"\n",
|
|||
|
"param_grid = {\n",
|
|||
|
" \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
|
|||
|
" \"model__max_features\": [\"sqrt\", \"log2\", 2],\n",
|
|||
|
" \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
|
|||
|
" \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"gs_optimizer = GridSearchCV(\n",
|
|||
|
" estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
|
|||
|
")\n",
|
|||
|
"gs_optimizer.fit(X_train, y_train.values.ravel())\n",
|
|||
|
"gs_optimizer.best_params_"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Обучение модели с новыми гиперпараметрами"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 90,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"optimized_model = ensemble.RandomForestClassifier(\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" criterion=\"gini\",\n",
|
|||
|
" max_depth=7,\n",
|
|||
|
" max_features=\"sqrt\",\n",
|
|||
|
" n_estimators=30,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"result = {}\n",
|
|||
|
"\n",
|
|||
|
"result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n",
|
|||
|
"result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
|
|||
|
"result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
|
|||
|
"result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
|
|||
|
"\n",
|
|||
|
"result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
|
|||
|
"result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Формирование данных для оценки старой и новой версии модели"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 98,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
|
|||
|
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
|
|||
|
" data=class_models[optimized_model_type]\n",
|
|||
|
")\n",
|
|||
|
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
|
|||
|
" data=result\n",
|
|||
|
")\n",
|
|||
|
"optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
|
|||
|
"optimized_metrics = optimized_metrics.set_index(\"Name\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оценка метрик старой и новой модели"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 99,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_c81c1_row0_col0, #T_c81c1_row0_col2, #T_c81c1_row0_col3, #T_c81c1_row1_col1 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_c81c1_row0_col1, #T_c81c1_row1_col0, #T_c81c1_row1_col2, #T_c81c1_row1_col3 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_c81c1_row0_col4, #T_c81c1_row0_col6, #T_c81c1_row0_col7 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_c81c1_row0_col5, #T_c81c1_row1_col5 {\n",
|
|||
|
" background-color: #0d0887;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_c81c1_row1_col4, #T_c81c1_row1_col6, #T_c81c1_row1_col7 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_c81c1\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
|
|||
|
" <th id=\"T_c81c1_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"index_name level0\" >Name</th>\n",
|
|||
|
" <th class=\"blank col0\" > </th>\n",
|
|||
|
" <th class=\"blank col1\" > </th>\n",
|
|||
|
" <th class=\"blank col2\" > </th>\n",
|
|||
|
" <th class=\"blank col3\" > </th>\n",
|
|||
|
" <th class=\"blank col4\" > </th>\n",
|
|||
|
" <th class=\"blank col5\" > </th>\n",
|
|||
|
" <th class=\"blank col6\" > </th>\n",
|
|||
|
" <th class=\"blank col7\" > </th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_c81c1_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col0\" class=\"data row0 col0\" >0.894340</td>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col1\" class=\"data row0 col1\" >0.794118</td>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col2\" class=\"data row0 col2\" >0.868132</td>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col3\" class=\"data row0 col3\" >0.782609</td>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col4\" class=\"data row0 col4\" >0.910112</td>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col5\" class=\"data row0 col5\" >0.837989</td>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col6\" class=\"data row0 col6\" >0.881041</td>\n",
|
|||
|
" <td id=\"T_c81c1_row0_col7\" class=\"data row0 col7\" >0.788321</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_c81c1_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col0\" class=\"data row1 col0\" >0.867220</td>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col1\" class=\"data row1 col1\" >0.822581</td>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col2\" class=\"data row1 col2\" >0.765568</td>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col3\" class=\"data row1 col3\" >0.739130</td>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col4\" class=\"data row1 col4\" >0.865169</td>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col5\" class=\"data row1 col5\" >0.837989</td>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col6\" class=\"data row1 col6\" >0.813230</td>\n",
|
|||
|
" <td id=\"T_c81c1_row1_col7\" class=\"data row1 col7\" >0.778626</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x1f1f1135d00>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 99,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"optimized_metrics[\n",
|
|||
|
" [\n",
|
|||
|
" \"Precision_train\",\n",
|
|||
|
" \"Precision_test\",\n",
|
|||
|
" \"Recall_train\",\n",
|
|||
|
" \"Recall_test\",\n",
|
|||
|
" \"Accuracy_train\",\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_train\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" ]\n",
|
|||
|
"].style.background_gradient(\n",
|
|||
|
" cmap=\"plasma\",\n",
|
|||
|
" low=0.3,\n",
|
|||
|
" high=1,\n",
|
|||
|
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
|
|||
|
").background_gradient(\n",
|
|||
|
" cmap=\"viridis\",\n",
|
|||
|
" low=1,\n",
|
|||
|
" high=0.3,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"Precision_train\",\n",
|
|||
|
" \"Precision_test\",\n",
|
|||
|
" \"Recall_train\",\n",
|
|||
|
" \"Recall_test\",\n",
|
|||
|
" ],\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 100,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_fbb13_row0_col0, #T_fbb13_row1_col0 {\n",
|
|||
|
" background-color: #440154;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_fbb13_row0_col1 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_fbb13_row0_col2, #T_fbb13_row1_col3, #T_fbb13_row1_col4 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_fbb13_row0_col3, #T_fbb13_row0_col4, #T_fbb13_row1_col2 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_fbb13_row1_col1 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_fbb13\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_fbb13_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_fbb13_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
|
|||
|
" <th id=\"T_fbb13_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
|
|||
|
" <th id=\"T_fbb13_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
|
|||
|
" <th id=\"T_fbb13_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"index_name level0\" >Name</th>\n",
|
|||
|
" <th class=\"blank col0\" > </th>\n",
|
|||
|
" <th class=\"blank col1\" > </th>\n",
|
|||
|
" <th class=\"blank col2\" > </th>\n",
|
|||
|
" <th class=\"blank col3\" > </th>\n",
|
|||
|
" <th class=\"blank col4\" > </th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_fbb13_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
|
|||
|
" <td id=\"T_fbb13_row0_col0\" class=\"data row0 col0\" >0.837989</td>\n",
|
|||
|
" <td id=\"T_fbb13_row0_col1\" class=\"data row0 col1\" >0.788321</td>\n",
|
|||
|
" <td id=\"T_fbb13_row0_col2\" class=\"data row0 col2\" >0.858893</td>\n",
|
|||
|
" <td id=\"T_fbb13_row0_col3\" class=\"data row0 col3\" >0.657111</td>\n",
|
|||
|
" <td id=\"T_fbb13_row0_col4\" class=\"data row0 col4\" >0.657157</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_fbb13_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
|
|||
|
" <td id=\"T_fbb13_row1_col0\" class=\"data row1 col0\" >0.837989</td>\n",
|
|||
|
" <td id=\"T_fbb13_row1_col1\" class=\"data row1 col1\" >0.778626</td>\n",
|
|||
|
" <td id=\"T_fbb13_row1_col2\" class=\"data row1 col2\" >0.859750</td>\n",
|
|||
|
" <td id=\"T_fbb13_row1_col3\" class=\"data row1 col3\" >0.651447</td>\n",
|
|||
|
" <td id=\"T_fbb13_row1_col4\" class=\"data row1 col4\" >0.653765</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x1f1f11345c0>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 100,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"optimized_metrics[\n",
|
|||
|
" [\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" ]\n",
|
|||
|
"].style.background_gradient(\n",
|
|||
|
" cmap=\"plasma\",\n",
|
|||
|
" low=0.3,\n",
|
|||
|
" high=1,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" ],\n",
|
|||
|
").background_gradient(\n",
|
|||
|
" cmap=\"viridis\",\n",
|
|||
|
" low=1,\n",
|
|||
|
" high=0.3,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" ],\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA20AAAGjCAYAAAC/j/0nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABXeklEQVR4nO3deXQUZfr28atDyAJJdwAhCwQIsiTIJqAYcVAwLC4jSAaVyYyAoK8KyKIy4sgqCjIq/FAEZ0QWBRFEEEVgEAUBASUKoiICsgSyoGISFrOQ7vcPhpY2LGlSSXeqvp9z6syku7r66RBz5a7nrqdsLpfLJQAAAACAXwrw9QAAAAAAABdG0QYAAAAAfoyiDQAAAAD8GEUbAAAAAPgxijYAAAAA8GMUbQAAAADgxyjaAAAAAMCPBfp6AACA8pWXl6eCggLDjhcUFKSQkBDDjgcAgDeskGsUbQBgIXl5eYqrF6bMo0WGHTMqKkr79+/3u4ADAJifVXKNog0ALKSgoECZR4t0MLW+7OGl75DPPe5UvTYHVFBQ4FfhBgCwBqvkGkUbAFhQWLhNYeG2Uh/HqdIfAwCA0jJ7rlG0AYAFFbmcKnIZcxwAAHzN7LnG6pEAAAAA4MeYaQMAC3LKJadKf0rSiGMAAFBaZs81ijYAsCCnnDKiAcSYowAAUDpmzzXaIwEAAADAjzHTBgAWVORyqchV+hYQI44BAEBpmT3XKNoAwILM3vsPALAWs+ca7ZEAAAAA4MeYaQMAC3LKpSITn5EEAFiL2XONog0AAABAhWb29kiKNgCwILOHGwAAZkLRBgAWZPZVtgAA1mL2XGMhEgAAAADwY8y0AYAFOf+3GXEcAAB8zey5RtEGABZUZNAqW0YcAwCA0jJ7rtEeCQAAAAB+jJk2ALCgIteZzYjjAADga2bPNYo2ALAgs/f+AwCsxey5RnskAAAAAPgxZtoAwIKcsqlINkOOAwCAr5k91yjaAMCCnK4zmxHHAQDA18yea7RHAgAAAIAfY6YNACyoyKA2EiOOAQBAaZk915hpAwCUm+PHj2vo0KGqV6+eQkNDdf311+uLL75wP+9yuTR69GhFR0crNDRUSUlJ2rNnjw9HDACA71G0AYAFnT0jacTmjQEDBmjNmjV64403tHPnTnXp0kVJSUk6cuSIJGny5MmaNm2aZs6cqa1bt6pq1arq2rWr8vLyyuLbAAAwCV/lWnmhaAMAC3K6bIZtkpSbm+ux5efnF3vP3377TUuWLNHkyZPVoUMHNWzYUGPHjlXDhg01Y8YMuVwuTZ06VU899ZS6d++uFi1aaN68eUpPT9eyZcvK+TsEAKhIjM41f0PRBgAotdjYWDkcDvc2ceLEYvucPn1aRUVFCgkJ8Xg8NDRUGzdu1P79+5WZmamkpCT3cw6HQ+3atdPmzZvL/DMAAOCvWIgEACzI6Au209LSZLfb3Y8HBwcX2zc8PFyJiYl6+umnlZCQoMjISL311lvavHmzGjZsqMzMTElSZGSkx+siIyPdzwEAcD5mX4iEog0ALKhIASoyoNmi6H//a7fbPYq2C3njjTd03333qXbt2qpUqZJat26t3r17KzU1tdRjAQBYl9G55m9ojwQAlJsrr7xS69ev14kTJ5SWlqbPP/9chYWFatCggaKioiRJWVlZHq/JyspyPwcAgBVRtAGABbkMuljbdZkXbFetWlXR0dH69ddftXr1anXv3l1xcXGKiorS2rVr3fvl5uZq69atSkxMNOqjAwBMyNe5VtZojwQAC/JV7//q1avlcrnUpEkT7d27V48//rji4+PVr18/2Ww2DR06VBMmTFCjRo0UFxenUaNGKSYmRj169Cj1WAEA5sU1bQAAGCQnJ0cjR47U4cOHVb16dSUnJ+uZZ55R5cqVJUkjRozQyZMn9cADDyg7O1s33HCDVq1aVWzFSQAArMTmcrlcvh4EAKB85ObmyuFwaOXXcaoaXvoO+ZPHnbqlxX
7l5OSUaCESAACMZJVc45o2AAAAABWaUzY5FWDA5l175PHjxzV06FDVq1dPoaGhuv766/XFF1+4n3e5XBo9erSio6MVGhqqpKQk7dmzx+vPR9EGABbkq3ADAMBMBgwYoDVr1uiNN97Qzp071aVLFyUlJenIkSOSpMmTJ2vatGmaOXOmtm7dqqpVq6pr167Ky8vz6n0o2gDAgs5esG3EBgCAr/ki13777TctWbJEkydPVocOHdSwYUONHTtWDRs21IwZM+RyuTR16lQ99dRT6t69u1q0aKF58+YpPT1dy5Yt8+rzUbQBAAAAwDlyc3M9tvz8/GL7nD59WkVFRcUWywoNDdXGjRu1f/9+ZWZmKikpyf2cw+FQu3bttHnzZq/GQ9EGABZU5AowbAMAwNeMzrXY2Fg5HA73NnHixGLvGR4ersTERD399NNKT09XUVGR3nzzTW3evFkZGRnKzMyUJEVGRnq8LjIy0v1cSbHkPwBY0Jlr2krf2sg1bQAAf2B0rqWlpXmsHhkcHHze/d944w3dd999ql27tipVqqTWrVurd+/eSk1NLfVYzsUpUgAAAAA4h91u99guVLRdeeWVWr9+vU6cOKG0tDR9/vnnKiwsVIMGDRQVFSVJysrK8nhNVlaW+7mSomgDAAtyKkBFBmxOYgQA4Ad8nWtVq1ZVdHS0fv31V61evVrdu3dXXFycoqKitHbtWvd+ubm52rp1qxITE706Pu2RAGBBRl2PVuRyGTAaAABKx1e5tnr1arlcLjVp0kR79+7V448/rvj4ePXr1082m01Dhw7VhAkT1KhRI8XFxWnUqFGKiYlRjx49vHofijYAAAAAuAw5OTkaOXKkDh8+rOrVqys5OVnPPPOMKleuLEkaMWKETp48qQceeEDZ2dm64YYbtGrVqmIrTl6KzeXiNCkAWEVubq4cDocWbG+mKuGVSn28U8eL9NdW3ygnJ8fjgm0AAMqDVXKNmTYAsKAil01FrtKvsmXEMQAAKC2z5xpXkAMAAACAH2OmDQAs6OwqWaU/Dh32AADfM3uuMdMGAAAAAH6MmTYAsCCnK0BOA5ZGdrKWFQDAD5g91yjaAMCCzN5GAgCwFrPnGu2RAAAAAODHmGkDAAtyyphljZ2lHwoAAKVm9lyjaAMAC3IqQE4Dmi2MOAYAAKVl9lzzz1EBAAAAACQx0wYAllTkClCRAatsGXEMAABKy+y5RtEGABbklE1OGdH7X/pjAABQWmbPNf8sJQEAAAAAkphpAwBLMnsbCQDAWsyeaxRtAAAAACo0426uTdEGAPATZg83AADMhKLNR5xOp9LT0xUeHi6bzT8veATgX1wul44fP66YmBgFBJSuWHK6bHIacRNSA44BcyDXAHiLXCs5ijYfSU9PV2xsrK+HAaACSktLU506dXw9DMADuQbgcpFrl0bR5iPh4eGSpINf1pc9jPYieLqzcXNfDwF+6LQKtVEfun9/lIbToPZIJ+2R+B9yDRdDruF8yLWSo2jzkbOtI/awANnD/fOHA74TaKvs6yHAH7nO/I8RrWdOV4CcBqyQZcQxYA7kGi6GXMN5kWsl5p+jAgAAAABIYqYNACypSDYVqfRnNo04BgAApWX2XKNoAwALMnsbCQDAWsyea/45KgAAAACAJGbaAMCSimRMC0hR6YcCAECpmT3XKNoAwILM3kYCALAWs+eaf44KAAAAACCJog0ALKnIFWDY5tX7FhVp1KhRiouLU2hoqK688ko9/fTTcrlc7n1cLpdGjx6t6OhohYaGKikpSXv27DH6WwAAMBFf5Vp58c9RAQBM6bnnntOMGTP08ssva9euXXruuec0efJkvfTSS+59Jk+erGnTpmnmzJnaunWrqlatqq5duyovL8+HIwcAwHe4pg0ALMglm5wGXLDt+t8xcnNzPR4PDg5WcHBwsf0/++wzde/eXbfddpskqX79+nrrrbf0+eefnzmey6WpU6fqqaeeUvfu3SVJ8+bNU2RkpJYtW6
Z77rmn1GMGAJiP0bnmb5hpAwALMrqNJDY2Vg6Hw71NnDjxvO97/fXXa+3atfrhhx8kSTt27NDGjRt1yy23SJL279+
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x400 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False)\n",
|
|||
|
"\n",
|
|||
|
"for index in range(0, len(optimized_metrics)):\n",
|
|||
|
" c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
|
|||
|
" disp = ConfusionMatrixDisplay(\n",
|
|||
|
" confusion_matrix=c_matrix, display_labels=[\"no water\", \"water\"]\n",
|
|||
|
" ).plot(ax=ax.flat[index])\n",
|
|||
|
"\n",
|
|||
|
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.2"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|