3513 lines
274 KiB
Plaintext
3513 lines
274 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Лабораторная работа 4"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Бизнес-цели:\n",
|
|||
|
"1. Оптимизация ценовой стратегии: анализ факторов, влияющих на стоимость недвижимости, чтобы помочь продавцам устанавливать конкурентоспособные цены и увеличивать прибыль.\n",
|
|||
|
"2. Улучшение инвестиционных решений: предоставление аналитики для инвесторов, чтобы они могли определить наиболее выгодные районы и типы недвижимости для вложений."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Загрузка набора данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Среднее значение поля 'цена': 540088.1417665294\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>date_numeric</th>\n",
|
|||
|
" <th>above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7129300520</th>\n",
|
|||
|
" <td>221900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98178</td>\n",
|
|||
|
" <td>47.5112</td>\n",
|
|||
|
" <td>-122.257</td>\n",
|
|||
|
" <td>1340</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" <td>16356</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6414100192</th>\n",
|
|||
|
" <td>538000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2570</td>\n",
|
|||
|
" <td>7242</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>400</td>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>98125</td>\n",
|
|||
|
" <td>47.7210</td>\n",
|
|||
|
" <td>-122.319</td>\n",
|
|||
|
" <td>1690</td>\n",
|
|||
|
" <td>7639</td>\n",
|
|||
|
" <td>16413</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5631500400</th>\n",
|
|||
|
" <td>180000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>10000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98028</td>\n",
|
|||
|
" <td>47.7379</td>\n",
|
|||
|
" <td>-122.233</td>\n",
|
|||
|
" <td>2720</td>\n",
|
|||
|
" <td>8062</td>\n",
|
|||
|
" <td>16491</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2487200875</th>\n",
|
|||
|
" <td>604000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>910</td>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98136</td>\n",
|
|||
|
" <td>47.5208</td>\n",
|
|||
|
" <td>-122.393</td>\n",
|
|||
|
" <td>1360</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>16413</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1954400510</th>\n",
|
|||
|
" <td>510000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>8080</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98074</td>\n",
|
|||
|
" <td>47.6168</td>\n",
|
|||
|
" <td>-122.045</td>\n",
|
|||
|
" <td>1800</td>\n",
|
|||
|
" <td>7503</td>\n",
|
|||
|
" <td>16484</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>263000018</th>\n",
|
|||
|
" <td>360000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1131</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98103</td>\n",
|
|||
|
" <td>47.6993</td>\n",
|
|||
|
" <td>-122.346</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1509</td>\n",
|
|||
|
" <td>16211</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6600060120</th>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>5813</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98146</td>\n",
|
|||
|
" <td>47.5107</td>\n",
|
|||
|
" <td>-122.362</td>\n",
|
|||
|
" <td>1830</td>\n",
|
|||
|
" <td>7200</td>\n",
|
|||
|
" <td>16489</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1523300141</th>\n",
|
|||
|
" <td>402101.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5944</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>2007</td>\n",
|
|||
|
" <td>16244</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>291310100</th>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>2388</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2004</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98027</td>\n",
|
|||
|
" <td>47.5345</td>\n",
|
|||
|
" <td>-122.069</td>\n",
|
|||
|
" <td>1410</td>\n",
|
|||
|
" <td>1287</td>\n",
|
|||
|
" <td>16451</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1523300157</th>\n",
|
|||
|
" <td>325000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1076</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2008</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5941</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1357</td>\n",
|
|||
|
" <td>16358</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21613 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
|
|||
|
"id \n",
|
|||
|
"7129300520 221900.0 3 1.00 1180 5650 1.0 \n",
|
|||
|
"6414100192 538000.0 3 2.25 2570 7242 2.0 \n",
|
|||
|
"5631500400 180000.0 2 1.00 770 10000 1.0 \n",
|
|||
|
"2487200875 604000.0 4 3.00 1960 5000 1.0 \n",
|
|||
|
"1954400510 510000.0 3 2.00 1680 8080 1.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"263000018 360000.0 3 2.50 1530 1131 3.0 \n",
|
|||
|
"6600060120 400000.0 4 2.50 2310 5813 2.0 \n",
|
|||
|
"1523300141 402101.0 2 0.75 1020 1350 2.0 \n",
|
|||
|
"291310100 400000.0 3 2.50 1600 2388 2.0 \n",
|
|||
|
"1523300157 325000.0 2 0.75 1020 1076 2.0 \n",
|
|||
|
"\n",
|
|||
|
" waterfront view condition grade ... sqft_basement yr_built \\\n",
|
|||
|
"id ... \n",
|
|||
|
"7129300520 0 0 3 7 ... 0 1955 \n",
|
|||
|
"6414100192 0 0 3 7 ... 400 1951 \n",
|
|||
|
"5631500400 0 0 3 6 ... 0 1933 \n",
|
|||
|
"2487200875 0 0 5 7 ... 910 1965 \n",
|
|||
|
"1954400510 0 0 3 8 ... 0 1987 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"263000018 0 0 3 8 ... 0 2009 \n",
|
|||
|
"6600060120 0 0 3 8 ... 0 2014 \n",
|
|||
|
"1523300141 0 0 3 7 ... 0 2009 \n",
|
|||
|
"291310100 0 0 3 8 ... 0 2004 \n",
|
|||
|
"1523300157 0 0 3 7 ... 0 2008 \n",
|
|||
|
"\n",
|
|||
|
" yr_renovated zipcode lat long sqft_living15 \\\n",
|
|||
|
"id \n",
|
|||
|
"7129300520 0 98178 47.5112 -122.257 1340 \n",
|
|||
|
"6414100192 1991 98125 47.7210 -122.319 1690 \n",
|
|||
|
"5631500400 0 98028 47.7379 -122.233 2720 \n",
|
|||
|
"2487200875 0 98136 47.5208 -122.393 1360 \n",
|
|||
|
"1954400510 0 98074 47.6168 -122.045 1800 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"263000018 0 98103 47.6993 -122.346 1530 \n",
|
|||
|
"6600060120 0 98146 47.5107 -122.362 1830 \n",
|
|||
|
"1523300141 0 98144 47.5944 -122.299 1020 \n",
|
|||
|
"291310100 0 98027 47.5345 -122.069 1410 \n",
|
|||
|
"1523300157 0 98144 47.5941 -122.299 1020 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot15 date_numeric above_average_price \n",
|
|||
|
"id \n",
|
|||
|
"7129300520 5650 16356 0 \n",
|
|||
|
"6414100192 7639 16413 0 \n",
|
|||
|
"5631500400 8062 16491 0 \n",
|
|||
|
"2487200875 5000 16413 1 \n",
|
|||
|
"1954400510 7503 16484 0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"263000018 1509 16211 0 \n",
|
|||
|
"6600060120 7200 16489 0 \n",
|
|||
|
"1523300141 2007 16244 0 \n",
|
|||
|
"291310100 1287 16451 0 \n",
|
|||
|
"1523300157 1357 16358 0 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn import set_config\n",
|
|||
|
"\n",
|
|||
|
"# Ask scikit-learn transformers to return pandas DataFrames.\n",
|
|||
|
"set_config(transform_output=\"pandas\")\n",
|
|||
|
"\n",
|
|||
|
"# Fixed seed; presumably passed to the split/models in later cells (confirm).\n",
|
|||
|
"random_state = 42\n",
|
|||
|
"\n",
|
|||
|
"# Load the data set, using the `id` column as the index.\n",
|
|||
|
"df = pd.read_csv(\"data/kc_house_data.csv\", index_col=\"id\")\n",
|
|||
|
"\n",
|
|||
|
"# Replace `date` with a numeric feature: whole days since 1970-01-01.\n",
|
|||
|
"df[\"date\"] = pd.to_datetime(df[\"date\"])\n",
|
|||
|
"df[\"date_numeric\"] = (df[\"date\"] - pd.Timestamp(\"1970-01-01\")).dt.days\n",
|
|||
|
"df = df.drop(columns=[\"date\"])\n",
|
|||
|
"\n",
|
|||
|
"# The mean price is the threshold for the binary target (computed once).\n",
|
|||
|
"average_price = df[\"price\"].mean()\n",
|
|||
|
"print(f\"Среднее значение поля 'цена': {average_price}\")\n",
|
|||
|
"\n",
|
|||
|
"# Target: 1 when the price is strictly above the mean price, otherwise 0.\n",
|
|||
|
"# NOTE(review): `price` stays in the frame next to the label derived from it;\n",
|
|||
|
"# drop it from the feature matrix before training to avoid target leakage.\n",
|
|||
|
"df[\"above_average_price\"] = (df[\"price\"] > average_price).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Разделение набора данных на обучающую и тестовую выборки (80/20) для задачи классификации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'X_train'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>date_numeric</th>\n",
|
|||
|
" <th>above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5205000020</th>\n",
|
|||
|
" <td>360000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2610</td>\n",
|
|||
|
" <td>7333</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1988</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98003</td>\n",
|
|||
|
" <td>47.2721</td>\n",
|
|||
|
" <td>-122.293</td>\n",
|
|||
|
" <td>2280</td>\n",
|
|||
|
" <td>9033</td>\n",
|
|||
|
" <td>16534</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4221270290</th>\n",
|
|||
|
" <td>544900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1990</td>\n",
|
|||
|
" <td>4936</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2004</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98075</td>\n",
|
|||
|
" <td>47.5911</td>\n",
|
|||
|
" <td>-122.018</td>\n",
|
|||
|
" <td>2250</td>\n",
|
|||
|
" <td>4815</td>\n",
|
|||
|
" <td>16395</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3438501327</th>\n",
|
|||
|
" <td>352500.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1570</td>\n",
|
|||
|
" <td>2399</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>390</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98106</td>\n",
|
|||
|
" <td>47.5488</td>\n",
|
|||
|
" <td>-122.364</td>\n",
|
|||
|
" <td>1590</td>\n",
|
|||
|
" <td>2306</td>\n",
|
|||
|
" <td>16559</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2726079098</th>\n",
|
|||
|
" <td>560000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2840</td>\n",
|
|||
|
" <td>216493</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98014</td>\n",
|
|||
|
" <td>47.7020</td>\n",
|
|||
|
" <td>-121.892</td>\n",
|
|||
|
" <td>2820</td>\n",
|
|||
|
" <td>175111</td>\n",
|
|||
|
" <td>16331</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5072200040</th>\n",
|
|||
|
" <td>403000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>13100</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>310</td>\n",
|
|||
|
" <td>1957</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98166</td>\n",
|
|||
|
" <td>47.4419</td>\n",
|
|||
|
" <td>-122.340</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>10518</td>\n",
|
|||
|
" <td>16192</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5104531120</th>\n",
|
|||
|
" <td>775000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>2.75</td>\n",
|
|||
|
" <td>3750</td>\n",
|
|||
|
" <td>12077</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2005</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98038</td>\n",
|
|||
|
" <td>47.3525</td>\n",
|
|||
|
" <td>-122.002</td>\n",
|
|||
|
" <td>3120</td>\n",
|
|||
|
" <td>7255</td>\n",
|
|||
|
" <td>16517</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2685600090</th>\n",
|
|||
|
" <td>345000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.50</td>\n",
|
|||
|
" <td>1030</td>\n",
|
|||
|
" <td>6969</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98108</td>\n",
|
|||
|
" <td>47.5492</td>\n",
|
|||
|
" <td>-122.300</td>\n",
|
|||
|
" <td>1420</td>\n",
|
|||
|
" <td>6000</td>\n",
|
|||
|
" <td>16392</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9528104985</th>\n",
|
|||
|
" <td>611000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1270</td>\n",
|
|||
|
" <td>5100</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>170</td>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98115</td>\n",
|
|||
|
" <td>47.6771</td>\n",
|
|||
|
" <td>-122.328</td>\n",
|
|||
|
" <td>1670</td>\n",
|
|||
|
" <td>3900</td>\n",
|
|||
|
" <td>16378</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3450300430</th>\n",
|
|||
|
" <td>317500.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>1.50</td>\n",
|
|||
|
" <td>1730</td>\n",
|
|||
|
" <td>7700</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>720</td>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98059</td>\n",
|
|||
|
" <td>47.4996</td>\n",
|
|||
|
" <td>-122.163</td>\n",
|
|||
|
" <td>1650</td>\n",
|
|||
|
" <td>8066</td>\n",
|
|||
|
" <td>16440</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3956900480</th>\n",
|
|||
|
" <td>779000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.75</td>\n",
|
|||
|
" <td>1990</td>\n",
|
|||
|
" <td>5600</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>660</td>\n",
|
|||
|
" <td>1941</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98199</td>\n",
|
|||
|
" <td>47.6500</td>\n",
|
|||
|
" <td>-122.415</td>\n",
|
|||
|
" <td>2630</td>\n",
|
|||
|
" <td>6780</td>\n",
|
|||
|
" <td>16316</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>17290 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 360000.0 4 2.50 2610 7333 2.0 \n",
|
|||
|
"4221270290 544900.0 3 2.50 1990 4936 2.0 \n",
|
|||
|
"3438501327 352500.0 2 2.50 1570 2399 2.0 \n",
|
|||
|
"2726079098 560000.0 3 2.50 2840 216493 2.0 \n",
|
|||
|
"5072200040 403000.0 3 2.00 1960 13100 1.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"5104531120 775000.0 5 2.75 3750 12077 2.0 \n",
|
|||
|
"2685600090 345000.0 3 1.50 1030 6969 1.0 \n",
|
|||
|
"9528104985 611000.0 2 1.00 1270 5100 1.0 \n",
|
|||
|
"3450300430 317500.0 4 1.50 1730 7700 1.0 \n",
|
|||
|
"3956900480 779000.0 3 1.75 1990 5600 1.0 \n",
|
|||
|
"\n",
|
|||
|
" waterfront view condition grade ... sqft_basement yr_built \\\n",
|
|||
|
"id ... \n",
|
|||
|
"5205000020 0 0 3 8 ... 0 1988 \n",
|
|||
|
"4221270290 0 0 3 8 ... 0 2004 \n",
|
|||
|
"3438501327 0 0 3 7 ... 390 2009 \n",
|
|||
|
"2726079098 0 0 3 9 ... 0 1991 \n",
|
|||
|
"5072200040 0 2 5 8 ... 310 1957 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"5104531120 0 4 3 10 ... 0 2005 \n",
|
|||
|
"2685600090 0 0 4 6 ... 0 1921 \n",
|
|||
|
"9528104985 0 0 3 7 ... 170 1900 \n",
|
|||
|
"3450300430 0 0 4 7 ... 720 1963 \n",
|
|||
|
"3956900480 0 1 3 8 ... 660 1941 \n",
|
|||
|
"\n",
|
|||
|
" yr_renovated zipcode lat long sqft_living15 \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 0 98003 47.2721 -122.293 2280 \n",
|
|||
|
"4221270290 0 98075 47.5911 -122.018 2250 \n",
|
|||
|
"3438501327 0 98106 47.5488 -122.364 1590 \n",
|
|||
|
"2726079098 0 98014 47.7020 -121.892 2820 \n",
|
|||
|
"5072200040 0 98166 47.4419 -122.340 1960 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5104531120 0 98038 47.3525 -122.002 3120 \n",
|
|||
|
"2685600090 0 98108 47.5492 -122.300 1420 \n",
|
|||
|
"9528104985 0 98115 47.6771 -122.328 1670 \n",
|
|||
|
"3450300430 0 98059 47.4996 -122.163 1650 \n",
|
|||
|
"3956900480 0 98199 47.6500 -122.415 2630 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot15 date_numeric above_average_price \n",
|
|||
|
"id \n",
|
|||
|
"5205000020 9033 16534 0 \n",
|
|||
|
"4221270290 4815 16395 1 \n",
|
|||
|
"3438501327 2306 16559 0 \n",
|
|||
|
"2726079098 175111 16331 1 \n",
|
|||
|
"5072200040 10518 16192 0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"5104531120 7255 16517 1 \n",
|
|||
|
"2685600090 6000 16392 0 \n",
|
|||
|
"9528104985 3900 16378 1 \n",
|
|||
|
"3450300430 8066 16440 0 \n",
|
|||
|
"3956900480 6780 16316 1 \n",
|
|||
|
"\n",
|
|||
|
"[17290 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'y_train'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5205000020</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4221270290</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3438501327</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2726079098</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5072200040</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5104531120</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2685600090</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9528104985</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3450300430</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3956900480</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>17290 rows × 1 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" above_average_price\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 0\n",
|
|||
|
"4221270290 1\n",
|
|||
|
"3438501327 0\n",
|
|||
|
"2726079098 1\n",
|
|||
|
"5072200040 0\n",
|
|||
|
"... ...\n",
|
|||
|
"5104531120 1\n",
|
|||
|
"2685600090 0\n",
|
|||
|
"9528104985 1\n",
|
|||
|
"3450300430 0\n",
|
|||
|
"3956900480 1\n",
|
|||
|
"\n",
|
|||
|
"[17290 rows x 1 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'X_test'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>date_numeric</th>\n",
|
|||
|
" <th>above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9421500010</th>\n",
|
|||
|
" <td>442500.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>1970</td>\n",
|
|||
|
" <td>7902</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>660</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98125</td>\n",
|
|||
|
" <td>47.7249</td>\n",
|
|||
|
" <td>-122.298</td>\n",
|
|||
|
" <td>1860</td>\n",
|
|||
|
" <td>8021</td>\n",
|
|||
|
" <td>16471</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3204800200</th>\n",
|
|||
|
" <td>665000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.75</td>\n",
|
|||
|
" <td>3320</td>\n",
|
|||
|
" <td>10574</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1100</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98056</td>\n",
|
|||
|
" <td>47.5376</td>\n",
|
|||
|
" <td>-122.180</td>\n",
|
|||
|
" <td>2720</td>\n",
|
|||
|
" <td>8330</td>\n",
|
|||
|
" <td>16443</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3320000212</th>\n",
|
|||
|
" <td>397500.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>980</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>300</td>\n",
|
|||
|
" <td>2007</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5998</td>\n",
|
|||
|
" <td>-122.312</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>1245</td>\n",
|
|||
|
" <td>16349</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9206950100</th>\n",
|
|||
|
" <td>343000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1270</td>\n",
|
|||
|
" <td>2509</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2004</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98106</td>\n",
|
|||
|
" <td>47.5357</td>\n",
|
|||
|
" <td>-122.365</td>\n",
|
|||
|
" <td>1420</td>\n",
|
|||
|
" <td>2206</td>\n",
|
|||
|
" <td>16238</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3121069038</th>\n",
|
|||
|
" <td>355000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2620</td>\n",
|
|||
|
" <td>78843</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1310</td>\n",
|
|||
|
" <td>1964</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98092</td>\n",
|
|||
|
" <td>47.2584</td>\n",
|
|||
|
" <td>-122.093</td>\n",
|
|||
|
" <td>2330</td>\n",
|
|||
|
" <td>130244</td>\n",
|
|||
|
" <td>16520</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7889601165</th>\n",
|
|||
|
" <td>268000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1700</td>\n",
|
|||
|
" <td>2250</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98168</td>\n",
|
|||
|
" <td>47.4914</td>\n",
|
|||
|
" <td>-122.334</td>\n",
|
|||
|
" <td>1520</td>\n",
|
|||
|
" <td>4500</td>\n",
|
|||
|
" <td>16308</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7278700070</th>\n",
|
|||
|
" <td>660000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2400</td>\n",
|
|||
|
" <td>6474</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>840</td>\n",
|
|||
|
" <td>1964</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98177</td>\n",
|
|||
|
" <td>47.7728</td>\n",
|
|||
|
" <td>-122.386</td>\n",
|
|||
|
" <td>2340</td>\n",
|
|||
|
" <td>10856</td>\n",
|
|||
|
" <td>16437</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1823059030</th>\n",
|
|||
|
" <td>159000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1320</td>\n",
|
|||
|
" <td>6534</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1952</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98055</td>\n",
|
|||
|
" <td>47.4806</td>\n",
|
|||
|
" <td>-122.223</td>\n",
|
|||
|
" <td>2140</td>\n",
|
|||
|
" <td>7405</td>\n",
|
|||
|
" <td>16300</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3448900420</th>\n",
|
|||
|
" <td>620000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2500</td>\n",
|
|||
|
" <td>8282</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2013</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98056</td>\n",
|
|||
|
" <td>47.5127</td>\n",
|
|||
|
" <td>-122.169</td>\n",
|
|||
|
" <td>2500</td>\n",
|
|||
|
" <td>8046</td>\n",
|
|||
|
" <td>16335</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>626059335</th>\n",
|
|||
|
" <td>527000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2330</td>\n",
|
|||
|
" <td>19436</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98011</td>\n",
|
|||
|
" <td>47.7663</td>\n",
|
|||
|
" <td>-122.215</td>\n",
|
|||
|
" <td>1910</td>\n",
|
|||
|
" <td>10055</td>\n",
|
|||
|
" <td>16317</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>4323 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
|
|||
|
"id \n",
|
|||
|
"9421500010 442500.0 4 2.25 1970 7902 1.0 \n",
|
|||
|
"3204800200 665000.0 4 2.75 3320 10574 2.0 \n",
|
|||
|
"3320000212 397500.0 3 2.25 1350 980 2.0 \n",
|
|||
|
"9206950100 343000.0 3 2.50 1270 2509 2.0 \n",
|
|||
|
"3121069038 355000.0 3 2.50 2620 78843 1.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"7889601165 268000.0 3 2.50 1700 2250 2.0 \n",
|
|||
|
"7278700070 660000.0 3 2.50 2400 6474 1.0 \n",
|
|||
|
"1823059030 159000.0 3 1.00 1320 6534 1.0 \n",
|
|||
|
"3448900420 620000.0 4 2.50 2500 8282 2.0 \n",
|
|||
|
"626059335 527000.0 4 2.25 2330 19436 2.0 \n",
|
|||
|
"\n",
|
|||
|
" waterfront view condition grade ... sqft_basement yr_built \\\n",
|
|||
|
"id ... \n",
|
|||
|
"9421500010 0 0 3 8 ... 660 1960 \n",
|
|||
|
"3204800200 0 0 5 8 ... 1100 1960 \n",
|
|||
|
"3320000212 0 0 3 8 ... 300 2007 \n",
|
|||
|
"9206950100 0 0 3 8 ... 0 2004 \n",
|
|||
|
"3121069038 0 3 4 7 ... 1310 1964 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"7889601165 0 0 3 7 ... 0 2014 \n",
|
|||
|
"7278700070 0 2 3 8 ... 840 1964 \n",
|
|||
|
"1823059030 0 0 3 7 ... 0 1952 \n",
|
|||
|
"3448900420 0 0 3 9 ... 0 2013 \n",
|
|||
|
"626059335 0 0 3 8 ... 0 1987 \n",
|
|||
|
"\n",
|
|||
|
" yr_renovated zipcode lat long sqft_living15 \\\n",
|
|||
|
"id \n",
|
|||
|
"9421500010 0 98125 47.7249 -122.298 1860 \n",
|
|||
|
"3204800200 0 98056 47.5376 -122.180 2720 \n",
|
|||
|
"3320000212 0 98144 47.5998 -122.312 1350 \n",
|
|||
|
"9206950100 0 98106 47.5357 -122.365 1420 \n",
|
|||
|
"3121069038 0 98092 47.2584 -122.093 2330 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"7889601165 0 98168 47.4914 -122.334 1520 \n",
|
|||
|
"7278700070 0 98177 47.7728 -122.386 2340 \n",
|
|||
|
"1823059030 0 98055 47.4806 -122.223 2140 \n",
|
|||
|
"3448900420 0 98056 47.5127 -122.169 2500 \n",
|
|||
|
"626059335 0 98011 47.7663 -122.215 1910 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot15 date_numeric above_average_price \n",
|
|||
|
"id \n",
|
|||
|
"9421500010 8021 16471 0 \n",
|
|||
|
"3204800200 8330 16443 1 \n",
|
|||
|
"3320000212 1245 16349 0 \n",
|
|||
|
"9206950100 2206 16238 0 \n",
|
|||
|
"3121069038 130244 16520 0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"7889601165 4500 16308 0 \n",
|
|||
|
"7278700070 10856 16437 1 \n",
|
|||
|
"1823059030 7405 16300 0 \n",
|
|||
|
"3448900420 8046 16335 1 \n",
|
|||
|
"626059335 10055 16317 0 \n",
|
|||
|
"\n",
|
|||
|
"[4323 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'y_test'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9421500010</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3204800200</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3320000212</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9206950100</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3121069038</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7889601165</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7278700070</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1823059030</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3448900420</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>626059335</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>4323 rows × 1 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" above_average_price\n",
|
|||
|
"id \n",
|
|||
|
"9421500010 0\n",
|
|||
|
"3204800200 1\n",
|
|||
|
"3320000212 0\n",
|
|||
|
"9206950100 0\n",
|
|||
|
"3121069038 0\n",
|
|||
|
"... ...\n",
|
|||
|
"7889601165 0\n",
|
|||
|
"7278700070 1\n",
|
|||
|
"1823059030 0\n",
|
|||
|
"3448900420 1\n",
|
|||
|
"626059335 0\n",
|
|||
|
"\n",
|
|||
|
"[4323 rows x 1 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from typing import Tuple\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"from pandas import DataFrame\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"def split_stratified_into_train_val_test(\n",
|
|||
|
" df_input,\n",
|
|||
|
" stratify_colname=\"y\",\n",
|
|||
|
" frac_train=0.6,\n",
|
|||
|
" frac_val=0.15,\n",
|
|||
|
" frac_test=0.25,\n",
|
|||
|
" random_state=None,\n",
|
|||
|
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
|
|||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|||
|
" raise ValueError(\n",
|
|||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|||
|
" % (frac_train, frac_val, frac_test)\n",
|
|||
|
" )\n",
|
|||
|
" if stratify_colname not in df_input.columns:\n",
|
|||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|||
|
" X = df_input \n",
|
|||
|
" y = df_input[\n",
|
|||
|
" [stratify_colname]\n",
|
|||
|
" ] \n",
|
|||
|
" \n",
|
|||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
" if frac_val <= 0:\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_temp)\n",
|
|||
|
" return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
|
|||
|
" \n",
|
|||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|||
|
" df_temp,\n",
|
|||
|
" y_temp,\n",
|
|||
|
" stratify=y_temp,\n",
|
|||
|
" test_size=relative_frac_test,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|||
|
" return df_train, df_val, df_test, y_train, y_val, y_test\n",
|
|||
|
"\n",
|
|||
|
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
|||
|
" df,\n",
|
|||
|
" stratify_colname=\"above_average_price\",\n",
|
|||
|
" frac_train=0.80,\n",
|
|||
|
" frac_val=0,\n",
|
|||
|
" frac_test=0.20,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"display(\"X_train\", X_train)\n",
|
|||
|
"display(\"y_train\", y_train)\n",
|
|||
|
"\n",
|
|||
|
"display(\"X_test\", X_test)\n",
|
|||
|
"display(\"y_test\", y_test)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Формирование конвейера для классификации данных\n",
|
|||
|
"\n",
|
|||
|
"preprocessing_num - конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
|
|||
|
"\n",
|
|||
|
"preprocessing_cat - конвейер для обработки категориальных данных: заполнение пропущенных значений и унитарное кодирование (в этой работе не создаётся: обрабатываются только числовые признаки)\n",
|
|||
|
"\n",
|
|||
|
"features_preprocessing - трансформер для предобработки признаков\n",
|
|||
|
"\n",
|
|||
|
"features_engineering - трансформер для конструирования признаков (в этой работе не используется)\n",
|
|||
|
"\n",
|
|||
|
"drop_columns - трансформер для удаления колонок\n",
|
|||
|
"\n",
|
|||
|
"pipeline_end - основной конвейер предобработки данных и конструирования признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.impute import SimpleImputer\n",
|
|||
|
"\n",
|
|||
|
"# Список числовых колонок\n",
|
|||
|
"num_columns = [\n",
|
|||
|
" \"price\",\n",
|
|||
|
" \"bedrooms\",\n",
|
|||
|
" \"bathrooms\",\n",
|
|||
|
" \"sqft_living\",\n",
|
|||
|
" \"sqft_lot\",\n",
|
|||
|
" \"floors\",\n",
|
|||
|
" \"waterfront\",\n",
|
|||
|
" \"view\",\n",
|
|||
|
" \"condition\",\n",
|
|||
|
" \"grade\",\n",
|
|||
|
" \"sqft_above\",\n",
|
|||
|
" \"sqft_basement\",\n",
|
|||
|
" \"yr_built\",\n",
|
|||
|
" \"yr_renovated\",\n",
|
|||
|
" \"zipcode\",\n",
|
|||
|
" \"lat\",\n",
|
|||
|
" \"long\",\n",
|
|||
|
" \"sqft_living15\",\n",
|
|||
|
" \"sqft_lot15\",\n",
|
|||
|
" \"date_numeric\"\n",
|
|||
|
"]\n",
|
|||
|
"columns_to_drop = [\"date\"]\n",
|
|||
|
"\n",
|
|||
|
"# Конвейер для числовых данных\n",
|
|||
|
"num_imputer = SimpleImputer(strategy=\"median\")\n",
|
|||
|
"num_scaler = StandardScaler()\n",
|
|||
|
"preprocessing_num = Pipeline(\n",
|
|||
|
" [\n",
|
|||
|
" (\"imputer\", num_imputer),\n",
|
|||
|
" (\"scaler\", num_scaler),\n",
|
|||
|
" ]\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Конвейер для удаления колонок\n",
|
|||
|
"drop_columns = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
|
|||
|
" ],\n",
|
|||
|
" remainder=\"passthrough\",\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Предобработка только для числовых данных\n",
|
|||
|
"features_preprocessing = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" (\"preprocessing_num\", preprocessing_num, num_columns),\n",
|
|||
|
" ],\n",
|
|||
|
" remainder=\"passthrough\",\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Итоговый конвейер\n",
|
|||
|
"pipeline_end = Pipeline(\n",
|
|||
|
" [\n",
|
|||
|
" (\"features_preprocessing\", features_preprocessing),\n",
|
|||
|
" ]\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Демонстрация работы конвейера для предобработки данных при классификации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>preprocessing_num__price</th>\n",
|
|||
|
" <th>preprocessing_num__bedrooms</th>\n",
|
|||
|
" <th>preprocessing_num__bathrooms</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_living</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_lot</th>\n",
|
|||
|
" <th>preprocessing_num__floors</th>\n",
|
|||
|
" <th>preprocessing_num__waterfront</th>\n",
|
|||
|
" <th>preprocessing_num__view</th>\n",
|
|||
|
" <th>preprocessing_num__condition</th>\n",
|
|||
|
" <th>preprocessing_num__grade</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_basement</th>\n",
|
|||
|
" <th>preprocessing_num__yr_built</th>\n",
|
|||
|
" <th>preprocessing_num__yr_renovated</th>\n",
|
|||
|
" <th>preprocessing_num__zipcode</th>\n",
|
|||
|
" <th>preprocessing_num__lat</th>\n",
|
|||
|
" <th>preprocessing_num__long</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_living15</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_lot15</th>\n",
|
|||
|
" <th>preprocessing_num__date_numeric</th>\n",
|
|||
|
" <th>remainder__above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5205000020</th>\n",
|
|||
|
" <td>-0.492897</td>\n",
|
|||
|
" <td>0.697500</td>\n",
|
|||
|
" <td>0.497960</td>\n",
|
|||
|
" <td>0.582210</td>\n",
|
|||
|
" <td>-0.181872</td>\n",
|
|||
|
" <td>0.939548</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>0.293371</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>-0.660870</td>\n",
|
|||
|
" <td>0.576070</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>-1.397782</td>\n",
|
|||
|
" <td>-2.073883</td>\n",
|
|||
|
" <td>-0.561487</td>\n",
|
|||
|
" <td>0.427608</td>\n",
|
|||
|
" <td>-0.130375</td>\n",
|
|||
|
" <td>1.432062</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4221270290</th>\n",
|
|||
|
" <td>0.014419</td>\n",
|
|||
|
" <td>-0.406066</td>\n",
|
|||
|
" <td>0.497960</td>\n",
|
|||
|
" <td>-0.097029</td>\n",
|
|||
|
" <td>-0.239318</td>\n",
|
|||
|
" <td>0.939548</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>0.293371</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>-0.660870</td>\n",
|
|||
|
" <td>1.122105</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>-0.054650</td>\n",
|
|||
|
" <td>0.227682</td>\n",
|
|||
|
" <td>1.403376</td>\n",
|
|||
|
" <td>0.383811</td>\n",
|
|||
|
" <td>-0.289464</td>\n",
|
|||
|
" <td>0.203573</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3438501327</th>\n",
|
|||
|
" <td>-0.513475</td>\n",
|
|||
|
" <td>-1.509633</td>\n",
|
|||
|
" <td>0.497960</td>\n",
|
|||
|
" <td>-0.557159</td>\n",
|
|||
|
" <td>-0.300120</td>\n",
|
|||
|
" <td>0.939548</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>-0.560854</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.221452</td>\n",
|
|||
|
" <td>1.292741</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>0.523643</td>\n",
|
|||
|
" <td>-0.077510</td>\n",
|
|||
|
" <td>-1.068778</td>\n",
|
|||
|
" <td>-0.579724</td>\n",
|
|||
|
" <td>-0.384096</td>\n",
|
|||
|
" <td>1.653013</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2726079098</th>\n",
|
|||
|
" <td>0.055850</td>\n",
|
|||
|
" <td>-0.406066</td>\n",
|
|||
|
" <td>0.497960</td>\n",
|
|||
|
" <td>0.834186</td>\n",
|
|||
|
" <td>4.830831</td>\n",
|
|||
|
" <td>0.939548</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>1.147596</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>-0.660870</td>\n",
|
|||
|
" <td>0.678452</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>-1.192581</td>\n",
|
|||
|
" <td>1.027819</td>\n",
|
|||
|
" <td>2.303641</td>\n",
|
|||
|
" <td>1.215954</td>\n",
|
|||
|
" <td>6.133562</td>\n",
|
|||
|
" <td>-0.362062</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5072200040</th>\n",
|
|||
|
" <td>-0.374916</td>\n",
|
|||
|
" <td>-0.406066</td>\n",
|
|||
|
" <td>-0.153502</td>\n",
|
|||
|
" <td>-0.129896</td>\n",
|
|||
|
" <td>-0.043661</td>\n",
|
|||
|
" <td>-0.918592</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>2.286974</td>\n",
|
|||
|
" <td>2.434645</td>\n",
|
|||
|
" <td>0.293371</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.040463</td>\n",
|
|||
|
" <td>-0.481872</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>1.642920</td>\n",
|
|||
|
" <td>-0.848786</td>\n",
|
|||
|
" <td>-0.897299</td>\n",
|
|||
|
" <td>-0.039561</td>\n",
|
|||
|
" <td>-0.074365</td>\n",
|
|||
|
" <td>-1.590551</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5104531120</th>\n",
|
|||
|
" <td>0.645752</td>\n",
|
|||
|
" <td>1.801066</td>\n",
|
|||
|
" <td>0.823691</td>\n",
|
|||
|
" <td>1.831134</td>\n",
|
|||
|
" <td>-0.068178</td>\n",
|
|||
|
" <td>0.939548</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>4.881408</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>2.001820</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>-0.660870</td>\n",
|
|||
|
" <td>1.156232</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>-0.744871</td>\n",
|
|||
|
" <td>-1.493802</td>\n",
|
|||
|
" <td>1.517696</td>\n",
|
|||
|
" <td>1.653925</td>\n",
|
|||
|
" <td>-0.197435</td>\n",
|
|||
|
" <td>1.281815</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2685600090</th>\n",
|
|||
|
" <td>-0.534053</td>\n",
|
|||
|
" <td>-0.406066</td>\n",
|
|||
|
" <td>-0.804965</td>\n",
|
|||
|
" <td>-1.148755</td>\n",
|
|||
|
" <td>-0.190596</td>\n",
|
|||
|
" <td>-0.918592</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>0.902190</td>\n",
|
|||
|
" <td>-1.415078</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>-0.660870</td>\n",
|
|||
|
" <td>-1.710451</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>0.560953</td>\n",
|
|||
|
" <td>-0.074624</td>\n",
|
|||
|
" <td>-0.611501</td>\n",
|
|||
|
" <td>-0.827907</td>\n",
|
|||
|
" <td>-0.244770</td>\n",
|
|||
|
" <td>0.177059</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9528104985</th>\n",
|
|||
|
" <td>0.195780</td>\n",
|
|||
|
" <td>-1.509633</td>\n",
|
|||
|
" <td>-1.456427</td>\n",
|
|||
|
" <td>-0.885823</td>\n",
|
|||
|
" <td>-0.235388</td>\n",
|
|||
|
" <td>-0.918592</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>-0.560854</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>-0.276268</td>\n",
|
|||
|
" <td>-2.427121</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>0.691535</td>\n",
|
|||
|
" <td>0.848167</td>\n",
|
|||
|
" <td>-0.811560</td>\n",
|
|||
|
" <td>-0.462932</td>\n",
|
|||
|
" <td>-0.323975</td>\n",
|
|||
|
" <td>0.053326</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3450300430</th>\n",
|
|||
|
" <td>-0.609505</td>\n",
|
|||
|
" <td>0.697500</td>\n",
|
|||
|
" <td>-0.804965</td>\n",
|
|||
|
" <td>-0.381872</td>\n",
|
|||
|
" <td>-0.173076</td>\n",
|
|||
|
" <td>-0.918592</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>0.902190</td>\n",
|
|||
|
" <td>-0.560854</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.968033</td>\n",
|
|||
|
" <td>-0.277109</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>-0.353124</td>\n",
|
|||
|
" <td>-0.432485</td>\n",
|
|||
|
" <td>0.367358</td>\n",
|
|||
|
" <td>-0.492130</td>\n",
|
|||
|
" <td>-0.166847</td>\n",
|
|||
|
" <td>0.601285</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3956900480</th>\n",
|
|||
|
" <td>0.656727</td>\n",
|
|||
|
" <td>-0.406066</td>\n",
|
|||
|
" <td>-0.479234</td>\n",
|
|||
|
" <td>-0.097029</td>\n",
|
|||
|
" <td>-0.223405</td>\n",
|
|||
|
" <td>-0.918592</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>0.989756</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>0.293371</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.832291</td>\n",
|
|||
|
" <td>-1.027907</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>2.258523</td>\n",
|
|||
|
" <td>0.652642</td>\n",
|
|||
|
" <td>-1.433171</td>\n",
|
|||
|
" <td>0.938573</td>\n",
|
|||
|
" <td>-0.215351</td>\n",
|
|||
|
" <td>-0.494633</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>17290 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" preprocessing_num__price preprocessing_num__bedrooms \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 -0.492897 0.697500 \n",
|
|||
|
"4221270290 0.014419 -0.406066 \n",
|
|||
|
"3438501327 -0.513475 -1.509633 \n",
|
|||
|
"2726079098 0.055850 -0.406066 \n",
|
|||
|
"5072200040 -0.374916 -0.406066 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 0.645752 1.801066 \n",
|
|||
|
"2685600090 -0.534053 -0.406066 \n",
|
|||
|
"9528104985 0.195780 -1.509633 \n",
|
|||
|
"3450300430 -0.609505 0.697500 \n",
|
|||
|
"3956900480 0.656727 -0.406066 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__bathrooms preprocessing_num__sqft_living \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 0.497960 0.582210 \n",
|
|||
|
"4221270290 0.497960 -0.097029 \n",
|
|||
|
"3438501327 0.497960 -0.557159 \n",
|
|||
|
"2726079098 0.497960 0.834186 \n",
|
|||
|
"5072200040 -0.153502 -0.129896 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 0.823691 1.831134 \n",
|
|||
|
"2685600090 -0.804965 -1.148755 \n",
|
|||
|
"9528104985 -1.456427 -0.885823 \n",
|
|||
|
"3450300430 -0.804965 -0.381872 \n",
|
|||
|
"3956900480 -0.479234 -0.097029 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__sqft_lot preprocessing_num__floors \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 -0.181872 0.939548 \n",
|
|||
|
"4221270290 -0.239318 0.939548 \n",
|
|||
|
"3438501327 -0.300120 0.939548 \n",
|
|||
|
"2726079098 4.830831 0.939548 \n",
|
|||
|
"5072200040 -0.043661 -0.918592 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 -0.068178 0.939548 \n",
|
|||
|
"2685600090 -0.190596 -0.918592 \n",
|
|||
|
"9528104985 -0.235388 -0.918592 \n",
|
|||
|
"3450300430 -0.173076 -0.918592 \n",
|
|||
|
"3956900480 -0.223405 -0.918592 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__waterfront preprocessing_num__view \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 -0.087375 -0.307461 \n",
|
|||
|
"4221270290 -0.087375 -0.307461 \n",
|
|||
|
"3438501327 -0.087375 -0.307461 \n",
|
|||
|
"2726079098 -0.087375 -0.307461 \n",
|
|||
|
"5072200040 -0.087375 2.286974 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 -0.087375 4.881408 \n",
|
|||
|
"2685600090 -0.087375 -0.307461 \n",
|
|||
|
"9528104985 -0.087375 -0.307461 \n",
|
|||
|
"3450300430 -0.087375 -0.307461 \n",
|
|||
|
"3956900480 -0.087375 0.989756 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__condition preprocessing_num__grade ... \\\n",
|
|||
|
"id ... \n",
|
|||
|
"5205000020 -0.630265 0.293371 ... \n",
|
|||
|
"4221270290 -0.630265 0.293371 ... \n",
|
|||
|
"3438501327 -0.630265 -0.560854 ... \n",
|
|||
|
"2726079098 -0.630265 1.147596 ... \n",
|
|||
|
"5072200040 2.434645 0.293371 ... \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"5104531120 -0.630265 2.001820 ... \n",
|
|||
|
"2685600090 0.902190 -1.415078 ... \n",
|
|||
|
"9528104985 -0.630265 -0.560854 ... \n",
|
|||
|
"3450300430 0.902190 -0.560854 ... \n",
|
|||
|
"3956900480 -0.630265 0.293371 ... \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__sqft_basement preprocessing_num__yr_built \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 -0.660870 0.576070 \n",
|
|||
|
"4221270290 -0.660870 1.122105 \n",
|
|||
|
"3438501327 0.221452 1.292741 \n",
|
|||
|
"2726079098 -0.660870 0.678452 \n",
|
|||
|
"5072200040 0.040463 -0.481872 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 -0.660870 1.156232 \n",
|
|||
|
"2685600090 -0.660870 -1.710451 \n",
|
|||
|
"9528104985 -0.276268 -2.427121 \n",
|
|||
|
"3450300430 0.968033 -0.277109 \n",
|
|||
|
"3956900480 0.832291 -1.027907 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__yr_renovated preprocessing_num__zipcode \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 -0.208897 -1.397782 \n",
|
|||
|
"4221270290 -0.208897 -0.054650 \n",
|
|||
|
"3438501327 -0.208897 0.523643 \n",
|
|||
|
"2726079098 -0.208897 -1.192581 \n",
|
|||
|
"5072200040 -0.208897 1.642920 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 -0.208897 -0.744871 \n",
|
|||
|
"2685600090 -0.208897 0.560953 \n",
|
|||
|
"9528104985 -0.208897 0.691535 \n",
|
|||
|
"3450300430 -0.208897 -0.353124 \n",
|
|||
|
"3956900480 -0.208897 2.258523 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__lat preprocessing_num__long \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 -2.073883 -0.561487 \n",
|
|||
|
"4221270290 0.227682 1.403376 \n",
|
|||
|
"3438501327 -0.077510 -1.068778 \n",
|
|||
|
"2726079098 1.027819 2.303641 \n",
|
|||
|
"5072200040 -0.848786 -0.897299 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 -1.493802 1.517696 \n",
|
|||
|
"2685600090 -0.074624 -0.611501 \n",
|
|||
|
"9528104985 0.848167 -0.811560 \n",
|
|||
|
"3450300430 -0.432485 0.367358 \n",
|
|||
|
"3956900480 0.652642 -1.433171 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__sqft_living15 preprocessing_num__sqft_lot15 \\\n",
|
|||
|
"id \n",
|
|||
|
"5205000020 0.427608 -0.130375 \n",
|
|||
|
"4221270290 0.383811 -0.289464 \n",
|
|||
|
"3438501327 -0.579724 -0.384096 \n",
|
|||
|
"2726079098 1.215954 6.133562 \n",
|
|||
|
"5072200040 -0.039561 -0.074365 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 1.653925 -0.197435 \n",
|
|||
|
"2685600090 -0.827907 -0.244770 \n",
|
|||
|
"9528104985 -0.462932 -0.323975 \n",
|
|||
|
"3450300430 -0.492130 -0.166847 \n",
|
|||
|
"3956900480 0.938573 -0.215351 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__date_numeric remainder__above_average_price \n",
|
|||
|
"id \n",
|
|||
|
"5205000020 1.432062 0 \n",
|
|||
|
"4221270290 0.203573 1 \n",
|
|||
|
"3438501327 1.653013 0 \n",
|
|||
|
"2726079098 -0.362062 1 \n",
|
|||
|
"5072200040 -1.590551 0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5104531120 1.281815 1 \n",
|
|||
|
"2685600090 0.177059 0 \n",
|
|||
|
"9528104985 0.053326 1 \n",
|
|||
|
"3450300430 0.601285 0 \n",
|
|||
|
"3956900480 -0.494633 1 \n",
|
|||
|
"\n",
|
|||
|
"[17290 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
|
|||
|
"preprocessed_df = pd.DataFrame(\n",
|
|||
|
" preprocessing_result,\n",
|
|||
|
" columns=pipeline_end.get_feature_names_out(),\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"preprocessed_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Формирование набора моделей для классификации\n",
|
|||
|
"\n",
|
|||
|
"logistic -- логистическая регрессия\n",
|
|||
|
"\n",
|
|||
|
"ridge -- гребневая регрессия\n",
|
|||
|
"\n",
|
|||
|
"decision_tree -- дерево решений\n",
|
|||
|
"\n",
|
|||
|
"knn -- k-ближайших соседей\n",
|
|||
|
"\n",
|
|||
|
"naive_bayes -- наивный Байесовский классификатор\n",
|
|||
|
"\n",
|
|||
|
"gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
|
|||
|
"\n",
|
|||
|
"random_forest -- метод случайного леса (набор деревьев решений)\n",
|
|||
|
"\n",
|
|||
|
"mlp -- многослойный персептрон (нейронная сеть)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
|
|||
|
"\n",
|
|||
|
"# Сами классификационные модели\n",
|
|||
|
"class_models = {\n",
|
|||
|
" # от 0 до 1, принадлежит ли объект к классу\n",
|
|||
|
" \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
|
|||
|
" # Логическая, но с регуляризацией (модель не так точно запоминает данные)\n",
|
|||
|
" \"ridge\": {\n",
|
|||
|
" \"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")\n",
|
|||
|
" },\n",
|
|||
|
" # Деления данных на условия с помощью построения дерева\n",
|
|||
|
" \"decision_tree\": {\n",
|
|||
|
" \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
|
|||
|
" },\n",
|
|||
|
" # Определяет ближайших объектов и находит и класс\n",
|
|||
|
" \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
|
|||
|
" # Вероятности для классификации\n",
|
|||
|
" \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
|
|||
|
" # Постепенно улучшает предсказания с помощью слабых моделей\n",
|
|||
|
" \"gradient_boosting\": {\n",
|
|||
|
" \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n",
|
|||
|
" },\n",
|
|||
|
" \"random_forest\": {\n",
|
|||
|
" \"model\": ensemble.RandomForestClassifier(\n",
|
|||
|
" max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
" },\n",
|
|||
|
" \"mlp\": {\n",
|
|||
|
" \"model\": neural_network.MLPClassifier(\n",
|
|||
|
" hidden_layer_sizes=(7,),\n",
|
|||
|
" max_iter=500,\n",
|
|||
|
" early_stopping=True,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
" },\n",
|
|||
|
"}"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Обучение моделей на обучающем наборе данных и оценка на тестовом"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: logistic\n",
|
|||
|
"Model: ridge\n",
|
|||
|
"Model: decision_tree\n",
|
|||
|
"Model: knn\n",
|
|||
|
"Model: naive_bayes\n",
|
|||
|
"Model: gradient_boosting\n",
|
|||
|
"Model: random_forest\n",
|
|||
|
"Model: mlp\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"from sklearn import metrics\n",
|
|||
|
"\n",
|
|||
|
"for model_name in class_models.keys():\n",
|
|||
|
" print(f\"Model: {model_name}\")\n",
|
|||
|
" model = class_models[model_name][\"model\"]\n",
|
|||
|
"\n",
|
|||
|
" model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
|
|||
|
" model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
|
|||
|
"\n",
|
|||
|
" y_train_predict = model_pipeline.predict(X_train)\n",
|
|||
|
" y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
|
|||
|
" y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
|
|||
|
"\n",
|
|||
|
" class_models[model_name][\"pipeline\"] = model_pipeline\n",
|
|||
|
" class_models[model_name][\"probs\"] = y_test_probs\n",
|
|||
|
" class_models[model_name][\"preds\"] = y_test_predict\n",
|
|||
|
"\n",
|
|||
|
" class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
|
|||
|
" y_train, y_train_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
|
|||
|
" y_test, y_test_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
|
|||
|
" y_train, y_train_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
|
|||
|
" y_test, y_test_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
|
|||
|
" y_train, y_train_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
|
|||
|
" y_test, y_test_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
|
|||
|
" y_test, y_test_probs\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
|
|||
|
" class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
|
|||
|
" class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
|
|||
|
" y_test, y_test_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
|
|||
|
" y_test, y_test_predict\n",
|
|||
|
" )\n",
|
|||
|
" class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
|
|||
|
" y_test, y_test_predict\n",
|
|||
|
" )"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Сводная таблица оценок качества для использованных моделей классификации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Матрица неточностей"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0cAAAQ9CAYAAACSpDaqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gU1/4G8HdpS29KDYgINiL2xBB7JKAxlmiuV6NRrFeDxhLrjQU10cTE2GJJU/RGf8Zr1NiCEhV7TCxYkYhiBywICFJ35/cHl0k2wLKLA7vsvJ/nmee6c87Ont0b5/U7Z4pCEAQBREREREREMmdm6AEQEREREREZAxZHREREREREYHFEREREREQEgMURERERERERABZHREREREREAFgcERERERERAWBxREREREREBIDFEREREREREQAWR0RERERERABYHFElRUdHQ6FQ4ObNm1Wy/Zs3b0KhUCA6OlqS7cXFxUGhUCAuLk6S7REREZmKqKgoKBQKnfoqFApERUVV7YCIDIjFEZmUVatWSVZQEREREZG8WBh6AERl8fPzQ25uLiwtLfV636pVq1C7dm1ERERorO/QoQNyc3NhZWUl4SiJiIhqvpkzZ2L69OmGHgaRUWBxREZJoVDA2tpasu2ZmZlJuj0iIiJTkJOTAzs7O1hY8J+ERABPqyMJrVq1Ci+++CKUSiW8vb0RGRmJjIyMUv1WrlyJevXqwcbGBi+//DKOHj2KTp06oVOnTmKfsq45Sk1NxdChQ+Hj4wOlUgkvLy/06tVLvO6pbt26uHz5Mg4fPgyFQgGFQiFus7xrjk6dOoU33ngDLi4usLOzQ9OmTbFs2TJpfxgiIiIjUHJt0ZUrV/DOO+/AxcUF7dq1K/Oao/z8fEycOBFubm5wcHBAz549cffu3TK3GxcXh9atW8Pa2hoBAQH46quvyr2O6fvvv0erVq1gY2MDV1dX9O/fH3fu3KmS70tUGTxMQJKIiorC3LlzERoaijFjxiAxMRGrV6/G77//juPHj4unx61evRpjx45F+/btMXHiRNy8eRO9e/eGi4sLfHx8tH5G3759cfnyZYwbNw5169bFgwcPEBsbi9u3b6Nu3bpYunQpxo0bB3t7e3z44YcAAA8Pj3K3FxsbizfffBNeXl4YP348PD09kZCQgN27d2P8+PHS/ThERERG5B//+Afq16+PBQsWQBAEPHjwoFSfESNG4Pvvv8c777yDV199FQcPHkT37t1L9Tt37hy6du0KLy8vzJ07FyqVCvPmzYObm1upvh9//DFmzZqFfv36YcSIEXj48CFWrFiBDh064Ny5c3B2dq6Kr0ukH4GoEtatWycAEJKTk4UHDx4IVlZWQlhYmKBSqcQ+X375pQBAWLt2rSAIgpCfny/UqlVLeOmll4TCwkKxX3R0tABA6Nixo7guOTlZACCsW7dOEARBePLkiQBA+Oyzz7SO68UXX9TYTolDhw4JAIRDhw4JgiAIRUVFgr+/v+Dn5yc8efJEo69ardb9hyAiIqoh5syZIwAQBgwYUOb6EvHx8QIA4b333tPo98477wgAhDlz5ojrevToIdja2gr37t0T1127dk2wsLDQ2ObNmzcFc3Nz4eOPP9bY5sWLFwULC4tS64kMhafV0XP75ZdfUFBQgAkTJsDM7M//pEaOHAlHR0fs2bMHAHD69Gk8fvwYI0eO1Di3eeDAgXBxcdH6GTY2NrCyskJcXByePHny3GM+d+4ckpOTMWHChFJHqnS9nSkREVFNNHr0aK3te/fuBQC8//77GusnTJig8VqlUuGXX35B79694e3tLa4PDAxEt27dNPpu27YNarUa/fr1w6NHj8TF09MT9evXx6FDh57jGxFJh6fV0XO7desWAKBhw4Ya662srFCvXj2xveR/AwMDNfpZWFigbt26Wj9DqVTi008/xQcffAAPDw+88sorePPNNzF48GB4enrqPebr168DAJo0aaL3e4
mIiGoyf39/re23bt2CmZkZAgICNNb/PecfPHiA3NzcUrkOlM76a9euQRAE1K9fv8zP1PfutERVhcUR1RgTJkxAjx49sGPHDuzbtw+zZs3CwoULcfDgQbRo0cLQwyMiIqoRbGxsqv0z1Wo1FAoFfv75Z5ibm5dqt7e3r/YxEZWFp9XRc/Pz8wMAJCYmaqwvKChAcnKy2F7yv0lJSRr9ioqKxDvOVSQgIAAffPAB9u/fj0uXLqGgoACLFy8W23U9Ja7kaNilS5d06k9ERCQXfn5+UKvV4lkWJf6e8+7u7rC2ti6V60DprA8ICIAgCPD390doaGip5ZVXXpH+ixBVAosjem6hoaGwsrLC8uXLIQiCuP67775DZmameHeb1q1bo1atWvjmm29QVFQk9tu4cWOF1xE9e/YMeXl5GusCAgLg4OCA/Px8cZ2dnV2Ztw//u5YtW8Lf3x9Lly4t1f+v34GIiEhuSq4XWr58ucb6pUuXarw2NzdHaGgoduzYgfv374vrk5KS8PPPP2v07dOnD8zNzTF37txSOSsIAh4/fizhNyCqPJ5WR8/Nzc0NM2bMwNy5c9G1a1f07NkTiYmJWLVqFV566SUMGjQIQPE1SFFRURg3bhxee+019OvXDzdv3kR0dDQCAgK0zvr88ccf6NKlC/r164egoCBYWFhg+/btSEtLQ//+/cV+rVq1wurVq/HRRx8hMDAQ7u7ueO2110ptz8zMDKtXr0aPHj3QvHlzDB06FF5eXrh69SouX76Mffv2Sf9DERER1QDNmzfHgAEDsGrVKmRmZuLVV1/FgQMHypwhioqKwv79+9G2bVuMGTMGKpUKX375JZo0aYL4+HixX0BAAD766CPMmDFDfIyHg4MDkpOTsX37dowaNQqTJ0+uxm9JVDYWRySJqKgouLm54csvv8TEiRPh6uqKUaNGYcGCBRoXWY4dOxaCIGDx4sWYPHkymjVrhp07d+L999+HtbV1udv39fXFgAEDcODAAfznP/+BhYUFGjVqhC1btqBv375iv9mzZ+PWrVtYtGgRnj59io4dO5ZZHAFAeHg4Dh06hLlz52Lx4sVQq9UICAjAyJEjpfthiIiIaqC1a9fCzc0NGzduxI4dO/Daa69hz5498PX11ejXqlUr/Pzzz5g8eTJmzZoFX19fzJs3DwkJCbh69apG3+nTp6NBgwZYsmQJ5s6dC6A438PCwtCzZ89q+25E2igEnkNEBqZWq+Hm5oY+ffrgm2++MfRwiIiI6Dn17t0bly9fxrVr1ww9FCK98JojqlZ5eXmlzjXesGED0tPT0alTJ8MMioiIiCotNzdX4/W1a9ewd+9e5jrVSJw5omoVFxeHiRMn4h//+Adq1aqFs2fP4rvvvkPjxo1x5swZWFlZGXqIREREpAcvLy9ERESIzzZcvXo18vPzce7cuXKfa0RkrHjNEVWrunXrwtfXF8uXL0d6ejpcXV0xePBgfPLJJyyMiIiIaqCuXbvi//7v/5CamgqlUomQkBAsWLCAhRHVSJw5IiIiIiIiAq85IiIiIiIiAsDiiIiIiIiICACvOdKJWq3G/fv34eDgoPVBpUSmSBAEPH36FN7e3jAzk/Z4Sl5eHgoKCirsZ2VlpfU5WEQkP8xmkjNmc9VhcaSD+/fvl3roGZHc3LlzBz4+PpJtLy8vD/5+9kh9oKqwr6enJ5KTk01yJ0xElcNsJmI2VwUWRzpwcHAAANw6WxeO9jwT0RDeahBs6CHIVhEKcQx7xb8HUikoKEDqAxWSTvvC0aH8v1dZT9UIbH0HBQUFJrcDJqLKYzYbHrPZcJjNVYfFkQ5Kpusd7c20/odCVcdCYWnoIcjX/+5nWVWnrdg7KGDvUP621eDpMkRUGrPZ8JjNBsRsrjIsjojIoAoFFQq1PFGgUFBX42iIiIhIztnM4oiIDEoNAWqUvwPW1kZERETSk3M2szgiIoNSQ4BKpjtgIiIiYyTnbGZxREQGVSioUahlH2vKU/dERE
TGSM7ZzOKIiAxK/b9FWzsRERFVHzlnM4sjIjIoVQVT99raiIiISHpyzmYWR0RkUIUCKpi6r76xEBERkbyzmcURERm
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x1000 with 16 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.metrics import ConfusionMatrixDisplay\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n",
|
|||
|
"for index, key in enumerate(class_models.keys()):\n",
|
|||
|
" c_matrix = class_models[key][\"Confusion_matrix\"]\n",
|
|||
|
" disp = ConfusionMatrixDisplay(\n",
|
|||
|
" confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n",
|
|||
|
" ).plot(ax=ax.flat[index])\n",
|
|||
|
" disp.ax_.set_title(key)\n",
|
|||
|
"\n",
|
|||
|
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Точность, полнота, верность (аккуратность), F-мера"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_04a52_row0_col0, #T_04a52_row0_col1, #T_04a52_row0_col2, #T_04a52_row0_col3, #T_04a52_row1_col0, #T_04a52_row1_col1, #T_04a52_row1_col2, #T_04a52_row1_col3, #T_04a52_row2_col0, #T_04a52_row2_col1, #T_04a52_row2_col2, #T_04a52_row2_col3, #T_04a52_row3_col0, #T_04a52_row3_col1, #T_04a52_row3_col2, #T_04a52_row3_col3, #T_04a52_row4_col0, #T_04a52_row4_col1, #T_04a52_row4_col2, #T_04a52_row4_col3, #T_04a52_row5_col0, #T_04a52_row5_col1, #T_04a52_row5_col2, #T_04a52_row5_col3 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row0_col4, #T_04a52_row0_col5, #T_04a52_row0_col6, #T_04a52_row0_col7, #T_04a52_row1_col4, #T_04a52_row1_col5, #T_04a52_row1_col6, #T_04a52_row1_col7, #T_04a52_row2_col4, #T_04a52_row2_col5, #T_04a52_row2_col6, #T_04a52_row2_col7, #T_04a52_row3_col4, #T_04a52_row3_col5, #T_04a52_row3_col6, #T_04a52_row3_col7, #T_04a52_row4_col4, #T_04a52_row4_col5, #T_04a52_row4_col6, #T_04a52_row4_col7, #T_04a52_row5_col4, #T_04a52_row5_col5, #T_04a52_row5_col6, #T_04a52_row5_col7 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row6_col0, #T_04a52_row6_col3 {\n",
|
|||
|
" background-color: #98d83e;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row6_col1 {\n",
|
|||
|
" background-color: #90d743;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row6_col2 {\n",
|
|||
|
" background-color: #a5db36;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row6_col4, #T_04a52_row6_col6 {\n",
|
|||
|
" background-color: #d7566c;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row6_col5, #T_04a52_row6_col7 {\n",
|
|||
|
" background-color: #d45270;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row7_col0, #T_04a52_row7_col1, #T_04a52_row7_col2, #T_04a52_row7_col3 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_04a52_row7_col4, #T_04a52_row7_col5, #T_04a52_row7_col6, #T_04a52_row7_col7 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_04a52\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
|
|||
|
" <th id=\"T_04a52_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
|
|||
|
" <td id=\"T_04a52_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row0_col2\" class=\"data row0 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row0_col4\" class=\"data row0 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row0_col5\" class=\"data row0 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row0_col6\" class=\"data row0 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row0_col7\" class=\"data row0 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row1\" class=\"row_heading level0 row1\" >ridge</th>\n",
|
|||
|
" <td id=\"T_04a52_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row1_col2\" class=\"data row1 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row1_col4\" class=\"data row1 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row1_col5\" class=\"data row1 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row1_col6\" class=\"data row1 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row1_col7\" class=\"data row1 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row2\" class=\"row_heading level0 row2\" >decision_tree</th>\n",
|
|||
|
" <td id=\"T_04a52_row2_col0\" class=\"data row2 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row2_col1\" class=\"data row2 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row2_col2\" class=\"data row2 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row2_col3\" class=\"data row2 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row2_col4\" class=\"data row2 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row2_col5\" class=\"data row2 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row2_col6\" class=\"data row2 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row2_col7\" class=\"data row2 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row3\" class=\"row_heading level0 row3\" >naive_bayes</th>\n",
|
|||
|
" <td id=\"T_04a52_row3_col0\" class=\"data row3 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row3_col1\" class=\"data row3 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row3_col2\" class=\"data row3 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row3_col3\" class=\"data row3 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row3_col4\" class=\"data row3 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row3_col5\" class=\"data row3 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row3_col6\" class=\"data row3 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row3_col7\" class=\"data row3 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row4\" class=\"row_heading level0 row4\" >random_forest</th>\n",
|
|||
|
" <td id=\"T_04a52_row4_col0\" class=\"data row4 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row4_col1\" class=\"data row4 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row4_col2\" class=\"data row4 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row4_col3\" class=\"data row4 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row4_col4\" class=\"data row4 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row4_col5\" class=\"data row4 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row4_col6\" class=\"data row4 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row4_col7\" class=\"data row4 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row5\" class=\"row_heading level0 row5\" >gradient_boosting</th>\n",
|
|||
|
" <td id=\"T_04a52_row5_col0\" class=\"data row5 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row5_col1\" class=\"data row5 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row5_col2\" class=\"data row5 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row5_col3\" class=\"data row5 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row5_col4\" class=\"data row5 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row5_col5\" class=\"data row5 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row5_col6\" class=\"data row5 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_04a52_row5_col7\" class=\"data row5 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row6\" class=\"row_heading level0 row6\" >mlp</th>\n",
|
|||
|
" <td id=\"T_04a52_row6_col0\" class=\"data row6 col0\" >0.999054</td>\n",
|
|||
|
" <td id=\"T_04a52_row6_col1\" class=\"data row6 col1\" >0.998106</td>\n",
|
|||
|
" <td id=\"T_04a52_row6_col2\" class=\"data row6 col2\" >0.999842</td>\n",
|
|||
|
" <td id=\"T_04a52_row6_col3\" class=\"data row6 col3\" >0.998106</td>\n",
|
|||
|
" <td id=\"T_04a52_row6_col4\" class=\"data row6 col4\" >0.999595</td>\n",
|
|||
|
" <td id=\"T_04a52_row6_col5\" class=\"data row6 col5\" >0.998612</td>\n",
|
|||
|
" <td id=\"T_04a52_row6_col6\" class=\"data row6 col6\" >0.999448</td>\n",
|
|||
|
" <td id=\"T_04a52_row6_col7\" class=\"data row6 col7\" >0.998106</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_04a52_level0_row7\" class=\"row_heading level0 row7\" >knn</th>\n",
|
|||
|
" <td id=\"T_04a52_row7_col0\" class=\"data row7 col0\" >0.982081</td>\n",
|
|||
|
" <td id=\"T_04a52_row7_col1\" class=\"data row7 col1\" >0.977664</td>\n",
|
|||
|
" <td id=\"T_04a52_row7_col2\" class=\"data row7 col2\" >0.977585</td>\n",
|
|||
|
" <td id=\"T_04a52_row7_col3\" class=\"data row7 col3\" >0.967172</td>\n",
|
|||
|
" <td id=\"T_04a52_row7_col4\" class=\"data row7 col4\" >0.985252</td>\n",
|
|||
|
" <td id=\"T_04a52_row7_col5\" class=\"data row7 col5\" >0.979875</td>\n",
|
|||
|
" <td id=\"T_04a52_row7_col6\" class=\"data row7 col6\" >0.979828</td>\n",
|
|||
|
" <td id=\"T_04a52_row7_col7\" class=\"data row7 col7\" >0.972390</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x1b7eca7dca0>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
|
|||
|
" [\n",
|
|||
|
" \"Precision_train\",\n",
|
|||
|
" \"Precision_test\",\n",
|
|||
|
" \"Recall_train\",\n",
|
|||
|
" \"Recall_test\",\n",
|
|||
|
" \"Accuracy_train\",\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_train\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" ]\n",
|
|||
|
"]\n",
|
|||
|
"class_metrics.sort_values(\n",
|
|||
|
" by=\"Accuracy_test\", ascending=False\n",
|
|||
|
").style.background_gradient(\n",
|
|||
|
" cmap=\"plasma\",\n",
|
|||
|
" low=0.3,\n",
|
|||
|
" high=1,\n",
|
|||
|
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
|
|||
|
").background_gradient(\n",
|
|||
|
" cmap=\"viridis\",\n",
|
|||
|
" low=1,\n",
|
|||
|
" high=0.3,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"Precision_train\",\n",
|
|||
|
" \"Precision_test\",\n",
|
|||
|
" \"Recall_train\",\n",
|
|||
|
" \"Recall_test\",\n",
|
|||
|
" ],\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_a3be8_row0_col0, #T_a3be8_row0_col1, #T_a3be8_row1_col0, #T_a3be8_row1_col1, #T_a3be8_row2_col0, #T_a3be8_row2_col1, #T_a3be8_row3_col0, #T_a3be8_row3_col1, #T_a3be8_row4_col0, #T_a3be8_row4_col1, #T_a3be8_row5_col0, #T_a3be8_row5_col1 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_a3be8_row0_col2, #T_a3be8_row0_col3, #T_a3be8_row0_col4, #T_a3be8_row1_col2, #T_a3be8_row1_col3, #T_a3be8_row1_col4, #T_a3be8_row2_col2, #T_a3be8_row2_col3, #T_a3be8_row2_col4, #T_a3be8_row3_col2, #T_a3be8_row3_col3, #T_a3be8_row3_col4, #T_a3be8_row4_col2, #T_a3be8_row4_col3, #T_a3be8_row4_col4, #T_a3be8_row5_col2, #T_a3be8_row5_col3, #T_a3be8_row5_col4 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_a3be8_row6_col0, #T_a3be8_row6_col1 {\n",
|
|||
|
" background-color: #93d741;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_a3be8_row6_col2 {\n",
|
|||
|
" background-color: #c7427c;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_a3be8_row6_col3, #T_a3be8_row6_col4 {\n",
|
|||
|
" background-color: #d45270;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_a3be8_row7_col0, #T_a3be8_row7_col1 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_a3be8_row7_col2, #T_a3be8_row7_col3, #T_a3be8_row7_col4 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_a3be8\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_a3be8_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_a3be8_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
|
|||
|
" <th id=\"T_a3be8_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
|
|||
|
" <th id=\"T_a3be8_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
|
|||
|
" <th id=\"T_a3be8_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row0\" class=\"row_heading level0 row0\" >logistic</th>\n",
|
|||
|
" <td id=\"T_a3be8_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row0_col2\" class=\"data row0 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row0_col4\" class=\"data row0 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row1\" class=\"row_heading level0 row1\" >ridge</th>\n",
|
|||
|
" <td id=\"T_a3be8_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row1_col2\" class=\"data row1 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row1_col4\" class=\"data row1 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row2\" class=\"row_heading level0 row2\" >decision_tree</th>\n",
|
|||
|
" <td id=\"T_a3be8_row2_col0\" class=\"data row2 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row2_col1\" class=\"data row2 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row2_col2\" class=\"data row2 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row2_col3\" class=\"data row2 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row2_col4\" class=\"data row2 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row3\" class=\"row_heading level0 row3\" >naive_bayes</th>\n",
|
|||
|
" <td id=\"T_a3be8_row3_col0\" class=\"data row3 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row3_col1\" class=\"data row3 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row3_col2\" class=\"data row3 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row3_col3\" class=\"data row3 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row3_col4\" class=\"data row3 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row4\" class=\"row_heading level0 row4\" >random_forest</th>\n",
|
|||
|
" <td id=\"T_a3be8_row4_col0\" class=\"data row4 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row4_col1\" class=\"data row4 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row4_col2\" class=\"data row4 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row4_col3\" class=\"data row4 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row4_col4\" class=\"data row4 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row5\" class=\"row_heading level0 row5\" >gradient_boosting</th>\n",
|
|||
|
" <td id=\"T_a3be8_row5_col0\" class=\"data row5 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row5_col1\" class=\"data row5 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row5_col2\" class=\"data row5 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row5_col3\" class=\"data row5 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_a3be8_row5_col4\" class=\"data row5 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row6\" class=\"row_heading level0 row6\" >mlp</th>\n",
|
|||
|
" <td id=\"T_a3be8_row6_col0\" class=\"data row6 col0\" >0.998612</td>\n",
|
|||
|
" <td id=\"T_a3be8_row6_col1\" class=\"data row6 col1\" >0.998106</td>\n",
|
|||
|
" <td id=\"T_a3be8_row6_col2\" class=\"data row6 col2\" >0.999368</td>\n",
|
|||
|
" <td id=\"T_a3be8_row6_col3\" class=\"data row6 col3\" >0.997011</td>\n",
|
|||
|
" <td id=\"T_a3be8_row6_col4\" class=\"data row6 col4\" >0.997011</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_a3be8_level0_row7\" class=\"row_heading level0 row7\" >knn</th>\n",
|
|||
|
" <td id=\"T_a3be8_row7_col0\" class=\"data row7 col0\" >0.979875</td>\n",
|
|||
|
" <td id=\"T_a3be8_row7_col1\" class=\"data row7 col1\" >0.972390</td>\n",
|
|||
|
" <td id=\"T_a3be8_row7_col2\" class=\"data row7 col2\" >0.996636</td>\n",
|
|||
|
" <td id=\"T_a3be8_row7_col3\" class=\"data row7 col3\" >0.956558</td>\n",
|
|||
|
" <td id=\"T_a3be8_row7_col4\" class=\"data row7 col4\" >0.956592</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x1b7eca4f470>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
|
|||
|
" [\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" ]\n",
|
|||
|
"]\n",
|
|||
|
"class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
|
|||
|
" cmap=\"plasma\",\n",
|
|||
|
" low=0.3,\n",
|
|||
|
" high=1,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" ],\n",
|
|||
|
").background_gradient(\n",
|
|||
|
" cmap=\"viridis\",\n",
|
|||
|
" low=1,\n",
|
|||
|
" high=0.3,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" ],\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'logistic'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
|
|||
|
"\n",
|
|||
|
"display(best_model)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Вывод данных с ошибкой предсказания для оценки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'Error items count: 0'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>Predicted</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>date_numeric</th>\n",
|
|||
|
" <th>above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>0 rows × 22 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"Empty DataFrame\n",
|
|||
|
"Columns: [price, Predicted, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, sqft_above, sqft_basement, yr_built, yr_renovated, zipcode, lat, long, sqft_living15, sqft_lot15, date_numeric, above_average_price]\n",
|
|||
|
"Index: []\n",
|
|||
|
"\n",
|
|||
|
"[0 rows x 22 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"preprocessing_result = pipeline_end.transform(X_test)\n",
|
|||
|
"preprocessed_df = pd.DataFrame(\n",
|
|||
|
" preprocessing_result,\n",
|
|||
|
" columns=pipeline_end.get_feature_names_out(),\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"y_pred = class_models[best_model][\"preds\"]\n",
|
|||
|
"\n",
|
|||
|
"# Сравнение реальных значений (y_test[\"above_average_price\"]) с предсказанными значениями (y_pred)\n",
|
|||
|
"# на тестовых данных\n",
|
|||
|
"error_index = y_test[y_test[\"above_average_price\"] != y_pred].index.tolist()\n",
|
|||
|
"display(f\"Error items count: {len(error_index)}\")\n",
|
|||
|
"\n",
|
|||
|
"error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
|
|||
|
"error_df = X_test.loc[error_index].copy()\n",
|
|||
|
"error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
|
|||
|
"error_df.sort_index()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Пример использования обученной модели (конвейера) для предсказания"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>date_numeric</th>\n",
|
|||
|
" <th>above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>626059335</th>\n",
|
|||
|
" <td>527000.0</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2330.0</td>\n",
|
|||
|
" <td>19436.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1987.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>98011.0</td>\n",
|
|||
|
" <td>47.7663</td>\n",
|
|||
|
" <td>-122.215</td>\n",
|
|||
|
" <td>1910.0</td>\n",
|
|||
|
" <td>10055.0</td>\n",
|
|||
|
" <td>16317.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>1 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
|
|||
|
"626059335 527000.0 4.0 2.25 2330.0 19436.0 2.0 \n",
|
|||
|
"\n",
|
|||
|
" waterfront view condition grade ... sqft_basement yr_built \\\n",
|
|||
|
"626059335 0.0 0.0 3.0 8.0 ... 0.0 1987.0 \n",
|
|||
|
"\n",
|
|||
|
" yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n",
|
|||
|
"626059335 0.0 98011.0 47.7663 -122.215 1910.0 10055.0 \n",
|
|||
|
"\n",
|
|||
|
" date_numeric above_average_price \n",
|
|||
|
"626059335 16317.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[1 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>preprocessing_num__price</th>\n",
|
|||
|
" <th>preprocessing_num__bedrooms</th>\n",
|
|||
|
" <th>preprocessing_num__bathrooms</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_living</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_lot</th>\n",
|
|||
|
" <th>preprocessing_num__floors</th>\n",
|
|||
|
" <th>preprocessing_num__waterfront</th>\n",
|
|||
|
" <th>preprocessing_num__view</th>\n",
|
|||
|
" <th>preprocessing_num__condition</th>\n",
|
|||
|
" <th>preprocessing_num__grade</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_basement</th>\n",
|
|||
|
" <th>preprocessing_num__yr_built</th>\n",
|
|||
|
" <th>preprocessing_num__yr_renovated</th>\n",
|
|||
|
" <th>preprocessing_num__zipcode</th>\n",
|
|||
|
" <th>preprocessing_num__lat</th>\n",
|
|||
|
" <th>preprocessing_num__long</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_living15</th>\n",
|
|||
|
" <th>preprocessing_num__sqft_lot15</th>\n",
|
|||
|
" <th>preprocessing_num__date_numeric</th>\n",
|
|||
|
" <th>remainder__above_average_price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>626059335</th>\n",
|
|||
|
" <td>-0.034693</td>\n",
|
|||
|
" <td>0.6975</td>\n",
|
|||
|
" <td>0.172229</td>\n",
|
|||
|
" <td>0.275457</td>\n",
|
|||
|
" <td>0.108187</td>\n",
|
|||
|
" <td>0.939548</td>\n",
|
|||
|
" <td>-0.087375</td>\n",
|
|||
|
" <td>-0.307461</td>\n",
|
|||
|
" <td>-0.630265</td>\n",
|
|||
|
" <td>0.293371</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>-0.66087</td>\n",
|
|||
|
" <td>0.541943</td>\n",
|
|||
|
" <td>-0.208897</td>\n",
|
|||
|
" <td>-1.248545</td>\n",
|
|||
|
" <td>1.491739</td>\n",
|
|||
|
" <td>-0.00418</td>\n",
|
|||
|
" <td>-0.112556</td>\n",
|
|||
|
" <td>-0.091828</td>\n",
|
|||
|
" <td>-0.485795</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>1 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" preprocessing_num__price preprocessing_num__bedrooms \\\n",
|
|||
|
"626059335 -0.034693 0.6975 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__bathrooms preprocessing_num__sqft_living \\\n",
|
|||
|
"626059335 0.172229 0.275457 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__sqft_lot preprocessing_num__floors \\\n",
|
|||
|
"626059335 0.108187 0.939548 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__waterfront preprocessing_num__view \\\n",
|
|||
|
"626059335 -0.087375 -0.307461 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__condition preprocessing_num__grade ... \\\n",
|
|||
|
"626059335 -0.630265 0.293371 ... \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__sqft_basement preprocessing_num__yr_built \\\n",
|
|||
|
"626059335 -0.66087 0.541943 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__yr_renovated preprocessing_num__zipcode \\\n",
|
|||
|
"626059335 -0.208897 -1.248545 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__lat preprocessing_num__long \\\n",
|
|||
|
"626059335 1.491739 -0.00418 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__sqft_living15 preprocessing_num__sqft_lot15 \\\n",
|
|||
|
"626059335 -0.112556 -0.091828 \n",
|
|||
|
"\n",
|
|||
|
" preprocessing_num__date_numeric remainder__above_average_price \n",
|
|||
|
"626059335 -0.485795 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[1 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'predicted: 0 (proba: [0.99455988 0.00544012])'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'real: 0'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"model = class_models[best_model][\"pipeline\"]\n",
|
|||
|
"\n",
|
|||
|
"example_id = 626059335\n",
|
|||
|
"test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
|
|||
|
"test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
|
|||
|
"display(test)\n",
|
|||
|
"display(test_preprocessed)\n",
|
|||
|
"result_proba = model.predict_proba(test)[0]\n",
|
|||
|
"result = model.predict(test)[0]\n",
|
|||
|
"real = int(y_test.loc[example_id].values[0])\n",
|
|||
|
"display(f\"predicted: {result} (proba: {result_proba})\")\n",
|
|||
|
"display(f\"real: {real}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Подбор гиперпараметров методом поиска по сетке"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 42,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\TEMP_UNIVERSITY\\mai\\.venv\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
|
|||
|
" _data = np.array(data, dtype=dtype, copy=copy,\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"{'model__criterion': 'gini',\n",
|
|||
|
" 'model__max_depth': 2,\n",
|
|||
|
" 'model__max_features': 'sqrt',\n",
|
|||
|
" 'model__n_estimators': 10}"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 42,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import GridSearchCV\n",
|
|||
|
"\n",
|
|||
|
"optimized_model_type = \"random_forest\"\n",
|
|||
|
"\n",
|
|||
|
"random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
|
|||
|
"\n",
|
|||
|
"param_grid = {\n",
|
|||
|
" \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
|
|||
|
" \"model__max_features\": [\"sqrt\", \"log2\", 2],\n",
|
|||
|
" \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9 ,10],\n",
|
|||
|
" \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"gs_optomizer = GridSearchCV(\n",
|
|||
|
" estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
|
|||
|
")\n",
|
|||
|
"gs_optomizer.fit(X_train, y_train.values.ravel())\n",
|
|||
|
"gs_optomizer.best_params_"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
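|
"Обучение модели с новыми гиперпараметрами. Примечание: в ячейке ниже `max_depth=7` и `n_estimators=30` заданы вручную и не совпадают со значениями, найденными поиском по сетке (`gs_optomizer.best_params_`: `max_depth=2`, `n_estimators=10`); совпадают только `criterion` и `max_features`."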
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 43,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"optimized_model = ensemble.RandomForestClassifier(\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" criterion=\"gini\",\n",
|
|||
|
" max_depth=7,\n",
|
|||
|
" max_features=\"sqrt\",\n",
|
|||
|
" n_estimators=30,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"result = {}\n",
|
|||
|
"\n",
|
|||
|
"result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n",
|
|||
|
"result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
|
|||
|
"result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
|
|||
|
"result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
|
|||
|
"\n",
|
|||
|
"result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
|
|||
|
"result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
|
|||
|
"result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
|
|||
|
"result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Формирование данных для оценки старой и новой версии модели"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
|
|||
|
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
|
|||
|
" data=class_models[optimized_model_type]\n",
|
|||
|
")\n",
|
|||
|
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
|
|||
|
" data=result\n",
|
|||
|
")\n",
|
|||
|
"optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
|
|||
|
"optimized_metrics = optimized_metrics.set_index(\"Name\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
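|
"Оценка метрик качества старой и новой модели. Примечание: все метрики обеих моделей равны 1.0, поскольку среди признаков `X_test` (и на выходе `pipeline_end`) остались столбцы `price` и `above_average_price` — целевая переменная; это утечка целевого признака, поэтому сравнение старой и новой модели здесь неинформативно."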
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_df96e_row0_col0, #T_df96e_row0_col1, #T_df96e_row0_col2, #T_df96e_row0_col3, #T_df96e_row1_col0, #T_df96e_row1_col1, #T_df96e_row1_col2, #T_df96e_row1_col3 {\n",
|
|||
|
" background-color: #440154;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_df96e_row0_col4, #T_df96e_row0_col5, #T_df96e_row0_col6, #T_df96e_row0_col7, #T_df96e_row1_col4, #T_df96e_row1_col5, #T_df96e_row1_col6, #T_df96e_row1_col7 {\n",
|
|||
|
" background-color: #0d0887;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_df96e\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col0\" class=\"col_heading level0 col0\" >Precision_train</th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col1\" class=\"col_heading level0 col1\" >Precision_test</th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col2\" class=\"col_heading level0 col2\" >Recall_train</th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col3\" class=\"col_heading level0 col3\" >Recall_test</th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col4\" class=\"col_heading level0 col4\" >Accuracy_train</th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col5\" class=\"col_heading level0 col5\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col6\" class=\"col_heading level0 col6\" >F1_train</th>\n",
|
|||
|
" <th id=\"T_df96e_level0_col7\" class=\"col_heading level0 col7\" >F1_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"index_name level0\" >Name</th>\n",
|
|||
|
" <th class=\"blank col0\" > </th>\n",
|
|||
|
" <th class=\"blank col1\" > </th>\n",
|
|||
|
" <th class=\"blank col2\" > </th>\n",
|
|||
|
" <th class=\"blank col3\" > </th>\n",
|
|||
|
" <th class=\"blank col4\" > </th>\n",
|
|||
|
" <th class=\"blank col5\" > </th>\n",
|
|||
|
" <th class=\"blank col6\" > </th>\n",
|
|||
|
" <th class=\"blank col7\" > </th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_df96e_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
|
|||
|
" <td id=\"T_df96e_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row0_col2\" class=\"data row0 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row0_col4\" class=\"data row0 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row0_col5\" class=\"data row0 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row0_col6\" class=\"data row0 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row0_col7\" class=\"data row0 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_df96e_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
|
|||
|
" <td id=\"T_df96e_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row1_col2\" class=\"data row1 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row1_col4\" class=\"data row1 col4\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row1_col5\" class=\"data row1 col5\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row1_col6\" class=\"data row1 col6\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_df96e_row1_col7\" class=\"data row1 col7\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x1b7ecbc1340>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"optimized_metrics[\n",
|
|||
|
" [\n",
|
|||
|
" \"Precision_train\",\n",
|
|||
|
" \"Precision_test\",\n",
|
|||
|
" \"Recall_train\",\n",
|
|||
|
" \"Recall_test\",\n",
|
|||
|
" \"Accuracy_train\",\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_train\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" ]\n",
|
|||
|
"].style.background_gradient(\n",
|
|||
|
" cmap=\"plasma\",\n",
|
|||
|
" low=0.3,\n",
|
|||
|
" high=1,\n",
|
|||
|
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
|
|||
|
").background_gradient(\n",
|
|||
|
" cmap=\"viridis\",\n",
|
|||
|
" low=1,\n",
|
|||
|
" high=0.3,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"Precision_train\",\n",
|
|||
|
" \"Precision_test\",\n",
|
|||
|
" \"Recall_train\",\n",
|
|||
|
" \"Recall_test\",\n",
|
|||
|
" ],\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_15d67_row0_col0, #T_15d67_row0_col1, #T_15d67_row1_col0, #T_15d67_row1_col1 {\n",
|
|||
|
" background-color: #440154;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_15d67_row0_col2, #T_15d67_row0_col3, #T_15d67_row0_col4, #T_15d67_row1_col2, #T_15d67_row1_col3, #T_15d67_row1_col4 {\n",
|
|||
|
" background-color: #0d0887;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_15d67\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_15d67_level0_col0\" class=\"col_heading level0 col0\" >Accuracy_test</th>\n",
|
|||
|
" <th id=\"T_15d67_level0_col1\" class=\"col_heading level0 col1\" >F1_test</th>\n",
|
|||
|
" <th id=\"T_15d67_level0_col2\" class=\"col_heading level0 col2\" >ROC_AUC_test</th>\n",
|
|||
|
" <th id=\"T_15d67_level0_col3\" class=\"col_heading level0 col3\" >Cohen_kappa_test</th>\n",
|
|||
|
" <th id=\"T_15d67_level0_col4\" class=\"col_heading level0 col4\" >MCC_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"index_name level0\" >Name</th>\n",
|
|||
|
" <th class=\"blank col0\" > </th>\n",
|
|||
|
" <th class=\"blank col1\" > </th>\n",
|
|||
|
" <th class=\"blank col2\" > </th>\n",
|
|||
|
" <th class=\"blank col3\" > </th>\n",
|
|||
|
" <th class=\"blank col4\" > </th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_15d67_level0_row0\" class=\"row_heading level0 row0\" >Old</th>\n",
|
|||
|
" <td id=\"T_15d67_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row0_col2\" class=\"data row0 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row0_col4\" class=\"data row0 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_15d67_level0_row1\" class=\"row_heading level0 row1\" >New</th>\n",
|
|||
|
" <td id=\"T_15d67_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row1_col2\" class=\"data row1 col2\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row1_col3\" class=\"data row1 col3\" >1.000000</td>\n",
|
|||
|
" <td id=\"T_15d67_row1_col4\" class=\"data row1 col4\" >1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x1b7edb90b60>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"optimized_metrics[\n",
|
|||
|
" [\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" ]\n",
|
|||
|
"].style.background_gradient(\n",
|
|||
|
" cmap=\"plasma\",\n",
|
|||
|
" low=0.3,\n",
|
|||
|
" high=1,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"ROC_AUC_test\",\n",
|
|||
|
" \"MCC_test\",\n",
|
|||
|
" \"Cohen_kappa_test\",\n",
|
|||
|
" ],\n",
|
|||
|
").background_gradient(\n",
|
|||
|
" cmap=\"viridis\",\n",
|
|||
|
" low=1,\n",
|
|||
|
" high=0.3,\n",
|
|||
|
" subset=[\n",
|
|||
|
" \"Accuracy_test\",\n",
|
|||
|
" \"F1_test\",\n",
|
|||
|
" ],\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2kAAAGsCAYAAABHMu+IAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABMm0lEQVR4nO3deVyVZf7/8fcBZBE5ICogiuRSKhOuTcVo2kKg2eLYTFNpiZpODrZoLtliWjPROFPZqpW5fUd/1rQ4paWRhaaRpUWaqbmlloJbgqCynfP7wzx1FO0+nhvOfY6v5+NxP75y7puL6/br8O5zX5/7vm1Op9MpAAAAAIAlBPl6AgAAAACAX1CkAQAAAICFUKQBAAAAgIVQpAEAAACAhVCkAQAAAICFUKQBAAAAgIVQpAEAAACAhVCkAQAAAICFhPh6AgCAmh07dkwVFRWmjRcaGqrw8HDTxgMAwBPkmnEUaQBgQceOHVPL5AYq3Ftt2pgJCQnavn17wAYaAMC6yDXPUKQBgAVVVFSocG+1tq9Jlj3K+870ksMOtey6QxUVFQEZZgAAayPXPEORBgAWZo8KMiXMAACwAnLNGIo0ALCwaqdD1U5zxgEAwNfINWMo0gDAwhxyyiHv08yMMQAA8Ba5ZgxrjQAAAABgIaykAYCFOeSQGQ0d5owCAIB3yDVjKNIAwMKqnU5VO71v6TBjDAAAvEWuGUO7IwAAAABYCCtpAGBh3GANAAgk5JoxFGkAYGEOOVVNmAEAAgS5ZgztjgAAAABgIaykAYCF0RYCAAgk5JoxrKQBAAAAgIWwkgYAFsajigEAgYRcM4YiDQAszPHzZsY4AAD4GrlmDO2OAAAAAGAhrKQBgIVVm/SoYjPGAADAW+SaMRRpAGBh1c7jmxnjAADga+SaMbQ7AgAAAICFsJIGABbGDdYAgEBCrhlDkQYAFuaQTdWymTIOAAC+Rq4ZQ7sjAAAAAFgIK2kAYGEO5/HNjHEAAPA1cs0YVtIAAAAAwEJYSQMAC6s2qXffjDEAAPAWuWYMRRoAWBhhBgAIJOSaMbQ7AgAAAICFsJIGABbmcNrkcJrwqGITxgAAwFvkmjEUaQBgYbSFAAACCblmDO2OAIBT5OTk6Pe//72ioqIUFxenvn37atOmTW7HXH755bLZbG7bnXfe6XbMzp071adPH9WvX19xcXEaM2aMqqqq3I7Jy8tTly5dFBYWpjZt2mjWrFm1fXoAgHOMv+UaRRoAWFi1gkzbPLFs2TJlZ2frs88+U25uriorK5WRkaGysjK344YOHao9e/a4tsmTJ/8y9+pq9enTRxUVFfr00081e/ZszZo1SxMmTHAds337dvXp00dXXHGFCgoKdO+99+qOO+7QkiVLvPuLAwBYErlmjM3pdAb4q+AAwP+UlJQoOjpaS9e1UGSU99fTyg47dFXqThUXF8tut3v8/fv27VNcXJyWLVumHj16SDp+xbFTp06aMmVKjd/z/vvv69prr9Xu3bsVHx8vSZo2bZrGjRunffv2KTQ0VOPGjdOiRYv0zTffuL7v5ptv1qFDh7R48WLPTxQAYEnkmme5xkoaAJxDSkpK3Lby8nJD31dcXCxJio2Ndft87ty5aty4sS688EKNHz9eR44cce3Lz89XamqqK8gkKTMzUyUlJVq/fr3rmPT0dLcxMzMzlZ+ff1bnBwA4twRqrvHgEACwMLNvsE5KSnL7/JFHHtHEiRPP+L0Oh0P33nuvunXrpgsvvND1+a233qrk5GQlJiZq7dq1GjdunDZt2qS33npLklRYWOgWZJJcXxcWFp7xmJKSEh09elQRERGenywAwLLINWO5RpEGABZW7QxStdP7pofqnxvbd+3a5dYWEhYW9pvfm52drW+++UYrVqxw+3zYsGGuP6empqpp06a66qqrtHXrVrVu3drrOQMAAg+5ZgztjgBwDrHb7W7bb4XZiBEjtHDhQn388cdq3rz5GY+95JJLJE
lbtmyRJCUkJKioqMjtmBNfJyQknPEYu93OKhoA4DcFaq5RpAGAhTlkk0NBJmyetZY4nU6NGDFCb7/9tj766CO1bNnyN7+noKBAktS0aVNJUlpamtatW6e9e/e6jsnNzZXdbldKSorrmKVLl7qNk5ubq7S0NI/mCwDwD+SaMRRpAIBTZGdn6z//+Y/mzZunqKgoFRYWqrCwUEePHpUkbd26VY899pjWrFmj77//Xu+8845uv/129ejRQx06dJAkZWRkKCUlRbfddpu+/vprLVmyRA899JCys7NdVzrvvPNObdu2TWPHjtXGjRv14osv6vXXX9fIkSN9du4AgMDjb7nGI/gBwIJOPKr4nbWtFRkV7PV4ZYerdX2HrYYfVWyz1XyFcubMmcrKytKuXbs0YMAAffPNNyorK1NSUpL++Mc/6qGHHnIbf8eOHRo+fLjy8vIUGRmpgQMH6oknnlBIyC+3ROfl5WnkyJH69ttv1bx5cz388MPKysry+pwBANZBrmV5dH4UaQBgQSfC7O2vzzctzP7YcfNZv08GAABvkGueod0RAAAAACyER/ADgIUdv8Ha+/fJmDEGAADeIteMoUgDAAtzKEjVJjQ9OERnOwDA98g1Y2h3BAAAAAALYSUNACys2hmkaqf319OqeUYUAMACyDVjKNIAwMJOvLTT+3ECO8wAAP6BXDOGdkcAAAAAsBBW0gDAwqqdNlU7vX+ClRljAADgLXLNGFbSAAAAAMBCWEkDAAurNulRxdUB3rsPAPAP5JoxFGkAYGEOZ5AcJjwFyxHgT8ECAPgHcs0Y2h0BAAAAwEJYSQMAC6MtBAAQSMg1YyjSAMDCHDLnCVYO76cCAIDXyDVjaHcEAAAAAAthJQ0ALMyhIDlMuJ5mxhgAAHiLXDOGIg0ALKzaGaRqE56CZcYYAAB4i1wzJrDPDgAAAAD8DCtpAGBhDtnkkBk3WHs/BgAA3iLXjKFIAwALoy0EABBIyDVjAvvsAAAAAMDPsJIGABZm3ks/uSYHAPA9cs2YwD47AAAAAPAzrKQZ4HA4tHv3bkVFRclmC+ybFAF4z+l06vDhw0pMTFRQkHfXwhxOmxxOE26wNmEMBA5yDYAnyLW6R5FmwO7du5WUlOTraQDwM7t27VLz5s29GsNhUltIoL/0E54h1wCcDXKt7lCkGRAVFSVJ2vHlebI3COx/EPDcHy9I9fUUYDFVqtQKvef63QFYDbmGMyHXcDJyre5RpBlwohXE3iBI9ijCDO5CbPV8PQVYjfP4/zGjjczhDJLDhMcMmzEGAge5hjMh13AKcq3OUaQBgIVVy6ZqE17YacYYAAB4i1wzJrBLUAAAAADwM6ykAYCF0RYCAAgk5JoxFGkAYGHVMqelo9r7qQAA4DVyzZjALkEBAAAAwM+wkgYAFkZbCAAgkJBrxgT22QEAAACAn2ElDQAsrNoZpGoTrhaaMQYAAN4i14yhSAMAC3PKJocJN1g7A/x9MgAA/0CuGRPYJSgAAAAA+BlW0gDAwmgLAQAEEnLNGIo0ALAwh9Mmh9P7lg4zxgAAwFvkmjGBXYICAAAAgJ9hJQ0ALKxaQao24XqaGWMAAOAtcs0YijQAsDDaQgAAgYRcMyawS1AAAAAA8DOspAGAhTkUJIcJ19PMGAMAAG+Ra8ZQpAGAhVU7bao2oaXDjDEAAPAWuWZMYJegAAAAAOBnWEkDAAvjBmsAQCAh14xhJQ0AAAAALISVNACwMKczSA6n99fTnCaMAQCAt8g1YyjSAMDCqmVTtUy4wdqEMQAA8Ba5Zkxgl6AAAAAA4GdYSQMAC3M4zbk52uE0YTIAAHiJXDOGIg0ALMxhUu++GWMAAOAtcs2YwD47AAAAAPAzrKQBgIU5ZJPDhJujzRgDAABvkWvGUKQBgIVVO22qNqF334wxAADwFrlmDO2OAAAAAGAhrKQBgIVxgzUAIJCQa8YE9tkBAAAAgJ9hJQ0ALMwhmznvkwnwG6wBAP6BXDOGIg0ALM
xp0lOwnAEeZgAA/0CuGUO7IwAAAABYCEUaAFiYw2kzbfNETk6Ofv/73ysqKkpxcXHq27evNm3a5HbMsWPHlJ2drUa
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x400 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"for index in range(0, len(optimized_metrics)):\n",
|
|||
|
" c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
|
|||
|
" disp = ConfusionMatrixDisplay(\n",
|
|||
|
" confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n",
|
|||
|
" ).plot(ax=ax.flat[index])\n",
|
|||
|
"\n",
|
|||
|
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|