2998 lines
132 KiB
Plaintext
2998 lines
132 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 337,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>grade</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>7129300520</td>\n",
|
||
" <td>20141013T000000</td>\n",
|
||
" <td>221900.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>5650</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1955</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98178</td>\n",
|
||
" <td>47.5112</td>\n",
|
||
" <td>-122.257</td>\n",
|
||
" <td>1340</td>\n",
|
||
" <td>5650</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>6414100192</td>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>538000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>2570</td>\n",
|
||
" <td>7242</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>400</td>\n",
|
||
" <td>1951</td>\n",
|
||
" <td>1991</td>\n",
|
||
" <td>98125</td>\n",
|
||
" <td>47.7210</td>\n",
|
||
" <td>-122.319</td>\n",
|
||
" <td>1690</td>\n",
|
||
" <td>7639</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>5631500400</td>\n",
|
||
" <td>20150225T000000</td>\n",
|
||
" <td>180000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>10000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1933</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98028</td>\n",
|
||
" <td>47.7379</td>\n",
|
||
" <td>-122.233</td>\n",
|
||
" <td>2720</td>\n",
|
||
" <td>8062</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2487200875</td>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>604000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>1960</td>\n",
|
||
" <td>5000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>910</td>\n",
|
||
" <td>1965</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98136</td>\n",
|
||
" <td>47.5208</td>\n",
|
||
" <td>-122.393</td>\n",
|
||
" <td>1360</td>\n",
|
||
" <td>5000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1954400510</td>\n",
|
||
" <td>20150218T000000</td>\n",
|
||
" <td>510000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8080</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98074</td>\n",
|
||
" <td>47.6168</td>\n",
|
||
" <td>-122.045</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>7503</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9995</th>\n",
|
||
" <td>322059264</td>\n",
|
||
" <td>20140926T000000</td>\n",
|
||
" <td>279000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>47044</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1904</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>98042</td>\n",
|
||
" <td>47.4206</td>\n",
|
||
" <td>-122.155</td>\n",
|
||
" <td>1930</td>\n",
|
||
" <td>12139</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9996</th>\n",
|
||
" <td>5557500270</td>\n",
|
||
" <td>20150209T000000</td>\n",
|
||
" <td>262000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9579</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1100</td>\n",
|
||
" <td>600</td>\n",
|
||
" <td>1962</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98023</td>\n",
|
||
" <td>47.3209</td>\n",
|
||
" <td>-122.338</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9628</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9997</th>\n",
|
||
" <td>9164100125</td>\n",
|
||
" <td>20140807T000000</td>\n",
|
||
" <td>533000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>4750</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1919</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98117</td>\n",
|
||
" <td>47.6824</td>\n",
|
||
" <td>-122.389</td>\n",
|
||
" <td>1320</td>\n",
|
||
" <td>4750</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9998</th>\n",
|
||
" <td>7370600045</td>\n",
|
||
" <td>20150402T000000</td>\n",
|
||
" <td>640000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8100</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1950</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98177</td>\n",
|
||
" <td>47.7212</td>\n",
|
||
" <td>-122.364</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>7750</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9999</th>\n",
|
||
" <td>8594400060</td>\n",
|
||
" <td>20140609T000000</td>\n",
|
||
" <td>285000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>35127</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98092</td>\n",
|
||
" <td>47.3025</td>\n",
|
||
" <td>-122.067</td>\n",
|
||
" <td>1820</td>\n",
|
||
" <td>35166</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>10000 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
|
||
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
|
||
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
|
||
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
|
||
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"9995 322059264 20140926T000000 279000.0 2 1.00 1020 \n",
|
||
"9996 5557500270 20150209T000000 262000.0 3 1.50 1700 \n",
|
||
"9997 9164100125 20140807T000000 533000.0 4 1.00 1550 \n",
|
||
"9998 7370600045 20150402T000000 640000.0 3 1.75 1680 \n",
|
||
"9999 8594400060 20140609T000000 285000.0 3 2.25 1680 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view ... grade sqft_above \\\n",
|
||
"0 5650 1.0 0 0 ... 7 1180 \n",
|
||
"1 7242 2.0 0 0 ... 7 2170 \n",
|
||
"2 10000 1.0 0 0 ... 6 770 \n",
|
||
"3 5000 1.0 0 0 ... 7 1050 \n",
|
||
"4 8080 1.0 0 0 ... 8 1680 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"9995 47044 1.0 0 0 ... 7 1020 \n",
|
||
"9996 9579 1.0 0 0 ... 7 1100 \n",
|
||
"9997 4750 1.5 0 0 ... 7 1550 \n",
|
||
"9998 8100 1.0 0 2 ... 8 1680 \n",
|
||
"9999 35127 2.0 0 0 ... 7 1680 \n",
|
||
"\n",
|
||
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
||
"0 0 1955 0 98178 47.5112 -122.257 \n",
|
||
"1 400 1951 1991 98125 47.7210 -122.319 \n",
|
||
"2 0 1933 0 98028 47.7379 -122.233 \n",
|
||
"3 910 1965 0 98136 47.5208 -122.393 \n",
|
||
"4 0 1987 0 98074 47.6168 -122.045 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"9995 0 1904 1958 98042 47.4206 -122.155 \n",
|
||
"9996 600 1962 0 98023 47.3209 -122.338 \n",
|
||
"9997 0 1919 0 98117 47.6824 -122.389 \n",
|
||
"9998 0 1950 0 98177 47.7212 -122.364 \n",
|
||
"9999 0 1987 0 98092 47.3025 -122.067 \n",
|
||
"\n",
|
||
" sqft_living15 sqft_lot15 \n",
|
||
"0 1340 5650 \n",
|
||
"1 1690 7639 \n",
|
||
"2 2720 8062 \n",
|
||
"3 1360 5000 \n",
|
||
"4 1800 7503 \n",
|
||
"... ... ... \n",
|
||
"9995 1930 12139 \n",
|
||
"9996 1700 9628 \n",
|
||
"9997 1320 4750 \n",
|
||
"9998 1880 7750 \n",
|
||
"9999 1820 35166 \n",
|
||
"\n",
|
||
"[10000 rows x 21 columns]"
|
||
]
|
||
},
|
||
"execution_count": 337,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn import set_config\n",
|
||
"\n",
|
||
"df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n",
|
||
"df.dropna()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Устраняем выбросы в колонке цены и добавляем колонку с категориями цены"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 338,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>9980</th>\n",
|
||
" <td>6840700036</td>\n",
|
||
" <td>20140728T000000</td>\n",
|
||
" <td>497000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>3325</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1918</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98122</td>\n",
|
||
" <td>47.6102</td>\n",
|
||
" <td>-122.299</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>4800</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9981</th>\n",
|
||
" <td>1824069083</td>\n",
|
||
" <td>20150429T000000</td>\n",
|
||
" <td>835000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>3060</td>\n",
|
||
" <td>30166</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3060</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1959</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98027</td>\n",
|
||
" <td>47.5656</td>\n",
|
||
" <td>-122.093</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>19602</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9982</th>\n",
|
||
" <td>1836980240</td>\n",
|
||
" <td>20141015T000000</td>\n",
|
||
" <td>730000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.75</td>\n",
|
||
" <td>2920</td>\n",
|
||
" <td>4500</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2920</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1999</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98006</td>\n",
|
||
" <td>47.5646</td>\n",
|
||
" <td>-122.124</td>\n",
|
||
" <td>2920</td>\n",
|
||
" <td>4505</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9983</th>\n",
|
||
" <td>3528900160</td>\n",
|
||
" <td>20141001T000000</td>\n",
|
||
" <td>655000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1370</td>\n",
|
||
" <td>5250</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1070</td>\n",
|
||
" <td>300</td>\n",
|
||
" <td>1939</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98109</td>\n",
|
||
" <td>47.6421</td>\n",
|
||
" <td>-122.348</td>\n",
|
||
" <td>2410</td>\n",
|
||
" <td>4200</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9984</th>\n",
|
||
" <td>1442800060</td>\n",
|
||
" <td>20141120T000000</td>\n",
|
||
" <td>205000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1870</td>\n",
|
||
" <td>3118</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1870</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1993</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98038</td>\n",
|
||
" <td>47.3739</td>\n",
|
||
" <td>-122.056</td>\n",
|
||
" <td>1580</td>\n",
|
||
" <td>3601</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9985</th>\n",
|
||
" <td>8722100030</td>\n",
|
||
" <td>20150407T000000</td>\n",
|
||
" <td>632750.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>4800</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1918</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98112</td>\n",
|
||
" <td>47.6388</td>\n",
|
||
" <td>-122.302</td>\n",
|
||
" <td>1950</td>\n",
|
||
" <td>4800</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9986</th>\n",
|
||
" <td>1723049624</td>\n",
|
||
" <td>20140512T000000</td>\n",
|
||
" <td>330000.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>2100</td>\n",
|
||
" <td>7715</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1250</td>\n",
|
||
" <td>850</td>\n",
|
||
" <td>2013</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98168</td>\n",
|
||
" <td>47.4866</td>\n",
|
||
" <td>-122.319</td>\n",
|
||
" <td>2100</td>\n",
|
||
" <td>7959</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9987</th>\n",
|
||
" <td>4040400200</td>\n",
|
||
" <td>20141007T000000</td>\n",
|
||
" <td>527500.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>2530</td>\n",
|
||
" <td>8250</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2530</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1961</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98007</td>\n",
|
||
" <td>47.6117</td>\n",
|
||
" <td>-122.134</td>\n",
|
||
" <td>2020</td>\n",
|
||
" <td>8250</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9988</th>\n",
|
||
" <td>8691391090</td>\n",
|
||
" <td>20140508T000000</td>\n",
|
||
" <td>716500.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>3290</td>\n",
|
||
" <td>6465</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3290</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2002</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98075</td>\n",
|
||
" <td>47.5981</td>\n",
|
||
" <td>-121.976</td>\n",
|
||
" <td>3100</td>\n",
|
||
" <td>5929</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9989</th>\n",
|
||
" <td>7853302190</td>\n",
|
||
" <td>20141217T000000</td>\n",
|
||
" <td>388500.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1890</td>\n",
|
||
" <td>5395</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1890</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98065</td>\n",
|
||
" <td>47.5415</td>\n",
|
||
" <td>-121.883</td>\n",
|
||
" <td>2060</td>\n",
|
||
" <td>5395</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9990</th>\n",
|
||
" <td>3260000700</td>\n",
|
||
" <td>20140904T000000</td>\n",
|
||
" <td>530000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>7770</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1967</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98005</td>\n",
|
||
" <td>47.6028</td>\n",
|
||
" <td>-122.167</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>7770</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9991</th>\n",
|
||
" <td>5126300510</td>\n",
|
||
" <td>20150108T000000</td>\n",
|
||
" <td>419000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>4517</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2002</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98059</td>\n",
|
||
" <td>47.4819</td>\n",
|
||
" <td>-122.140</td>\n",
|
||
" <td>2610</td>\n",
|
||
" <td>4770</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9992</th>\n",
|
||
" <td>7199330370</td>\n",
|
||
" <td>20150309T000000</td>\n",
|
||
" <td>385000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1200</td>\n",
|
||
" <td>7360</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1200</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1978</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98052</td>\n",
|
||
" <td>47.6979</td>\n",
|
||
" <td>-122.130</td>\n",
|
||
" <td>1200</td>\n",
|
||
" <td>7500</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9993</th>\n",
|
||
" <td>1854900240</td>\n",
|
||
" <td>20140528T000000</td>\n",
|
||
" <td>655000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2990</td>\n",
|
||
" <td>5669</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2990</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98074</td>\n",
|
||
" <td>47.6119</td>\n",
|
||
" <td>-122.011</td>\n",
|
||
" <td>3110</td>\n",
|
||
" <td>5058</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9994</th>\n",
|
||
" <td>6738700335</td>\n",
|
||
" <td>20140701T000000</td>\n",
|
||
" <td>1127312.5</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.75</td>\n",
|
||
" <td>3770</td>\n",
|
||
" <td>10900</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3070</td>\n",
|
||
" <td>700</td>\n",
|
||
" <td>1924</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98144</td>\n",
|
||
" <td>47.5849</td>\n",
|
||
" <td>-122.290</td>\n",
|
||
" <td>3000</td>\n",
|
||
" <td>5000</td>\n",
|
||
" <td>very_high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9995</th>\n",
|
||
" <td>322059264</td>\n",
|
||
" <td>20140926T000000</td>\n",
|
||
" <td>279000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>47044</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1904</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>98042</td>\n",
|
||
" <td>47.4206</td>\n",
|
||
" <td>-122.155</td>\n",
|
||
" <td>1930</td>\n",
|
||
" <td>12139</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9996</th>\n",
|
||
" <td>5557500270</td>\n",
|
||
" <td>20150209T000000</td>\n",
|
||
" <td>262000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9579</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1100</td>\n",
|
||
" <td>600</td>\n",
|
||
" <td>1962</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98023</td>\n",
|
||
" <td>47.3209</td>\n",
|
||
" <td>-122.338</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9628</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9997</th>\n",
|
||
" <td>9164100125</td>\n",
|
||
" <td>20140807T000000</td>\n",
|
||
" <td>533000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>4750</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1919</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98117</td>\n",
|
||
" <td>47.6824</td>\n",
|
||
" <td>-122.389</td>\n",
|
||
" <td>1320</td>\n",
|
||
" <td>4750</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9998</th>\n",
|
||
" <td>7370600045</td>\n",
|
||
" <td>20150402T000000</td>\n",
|
||
" <td>640000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8100</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1950</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98177</td>\n",
|
||
" <td>47.7212</td>\n",
|
||
" <td>-122.364</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>7750</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9999</th>\n",
|
||
" <td>8594400060</td>\n",
|
||
" <td>20140609T000000</td>\n",
|
||
" <td>285000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>35127</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98092</td>\n",
|
||
" <td>47.3025</td>\n",
|
||
" <td>-122.067</td>\n",
|
||
" <td>1820</td>\n",
|
||
" <td>35166</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>20 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms \\\n",
|
||
"9980 6840700036 20140728T000000 497000.0 2 1.00 \n",
|
||
"9981 1824069083 20150429T000000 835000.0 3 1.00 \n",
|
||
"9982 1836980240 20141015T000000 730000.0 4 2.75 \n",
|
||
"9983 3528900160 20141001T000000 655000.0 3 1.00 \n",
|
||
"9984 1442800060 20141120T000000 205000.0 3 2.50 \n",
|
||
"9985 8722100030 20150407T000000 632750.0 4 2.00 \n",
|
||
"9986 1723049624 20140512T000000 330000.0 5 3.00 \n",
|
||
"9987 4040400200 20141007T000000 527500.0 5 2.25 \n",
|
||
"9988 8691391090 20140508T000000 716500.0 4 2.50 \n",
|
||
"9989 7853302190 20141217T000000 388500.0 4 2.50 \n",
|
||
"9990 3260000700 20140904T000000 530000.0 3 1.75 \n",
|
||
"9991 5126300510 20150108T000000 419000.0 3 2.50 \n",
|
||
"9992 7199330370 20150309T000000 385000.0 3 1.75 \n",
|
||
"9993 1854900240 20140528T000000 655000.0 4 2.50 \n",
|
||
"9994 6738700335 20140701T000000 1127312.5 4 2.75 \n",
|
||
"9995 322059264 20140926T000000 279000.0 2 1.00 \n",
|
||
"9996 5557500270 20150209T000000 262000.0 3 1.50 \n",
|
||
"9997 9164100125 20140807T000000 533000.0 4 1.00 \n",
|
||
"9998 7370600045 20150402T000000 640000.0 3 1.75 \n",
|
||
"9999 8594400060 20140609T000000 285000.0 3 2.25 \n",
|
||
"\n",
|
||
" sqft_living sqft_lot floors waterfront view ... sqft_above \\\n",
|
||
"9980 770 3325 1.0 0 0 ... 770 \n",
|
||
"9981 3060 30166 1.0 0 0 ... 3060 \n",
|
||
"9982 2920 4500 2.0 0 0 ... 2920 \n",
|
||
"9983 1370 5250 1.0 0 0 ... 1070 \n",
|
||
"9984 1870 3118 2.0 0 0 ... 1870 \n",
|
||
"9985 1800 4800 1.5 0 0 ... 1800 \n",
|
||
"9986 2100 7715 1.0 0 0 ... 1250 \n",
|
||
"9987 2530 8250 2.0 0 0 ... 2530 \n",
|
||
"9988 3290 6465 2.0 0 0 ... 3290 \n",
|
||
"9989 1890 5395 2.0 0 0 ... 1890 \n",
|
||
"9990 1680 7770 1.0 0 0 ... 1680 \n",
|
||
"9991 2170 4517 2.0 0 0 ... 2170 \n",
|
||
"9992 1200 7360 1.0 0 0 ... 1200 \n",
|
||
"9993 2990 5669 2.0 0 0 ... 2990 \n",
|
||
"9994 3770 10900 2.0 0 2 ... 3070 \n",
|
||
"9995 1020 47044 1.0 0 0 ... 1020 \n",
|
||
"9996 1700 9579 1.0 0 0 ... 1100 \n",
|
||
"9997 1550 4750 1.5 0 0 ... 1550 \n",
|
||
"9998 1680 8100 1.0 0 2 ... 1680 \n",
|
||
"9999 1680 35127 2.0 0 0 ... 1680 \n",
|
||
"\n",
|
||
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
||
"9980 0 1918 0 98122 47.6102 -122.299 \n",
|
||
"9981 0 1959 0 98027 47.5656 -122.093 \n",
|
||
"9982 0 1999 0 98006 47.5646 -122.124 \n",
|
||
"9983 300 1939 0 98109 47.6421 -122.348 \n",
|
||
"9984 0 1993 0 98038 47.3739 -122.056 \n",
|
||
"9985 0 1918 0 98112 47.6388 -122.302 \n",
|
||
"9986 850 2013 0 98168 47.4866 -122.319 \n",
|
||
"9987 0 1961 0 98007 47.6117 -122.134 \n",
|
||
"9988 0 2002 0 98075 47.5981 -121.976 \n",
|
||
"9989 0 2006 0 98065 47.5415 -121.883 \n",
|
||
"9990 0 1967 0 98005 47.6028 -122.167 \n",
|
||
"9991 0 2002 0 98059 47.4819 -122.140 \n",
|
||
"9992 0 1978 0 98052 47.6979 -122.130 \n",
|
||
"9993 0 2003 0 98074 47.6119 -122.011 \n",
|
||
"9994 700 1924 0 98144 47.5849 -122.290 \n",
|
||
"9995 0 1904 1958 98042 47.4206 -122.155 \n",
|
||
"9996 600 1962 0 98023 47.3209 -122.338 \n",
|
||
"9997 0 1919 0 98117 47.6824 -122.389 \n",
|
||
"9998 0 1950 0 98177 47.7212 -122.364 \n",
|
||
"9999 0 1987 0 98092 47.3025 -122.067 \n",
|
||
"\n",
|
||
" sqft_living15 sqft_lot15 price_category \n",
|
||
"9980 960 4800 middle \n",
|
||
"9981 1880 19602 high \n",
|
||
"9982 2920 4505 high \n",
|
||
"9983 2410 4200 high \n",
|
||
"9984 1580 3601 low \n",
|
||
"9985 1950 4800 high \n",
|
||
"9986 2100 7959 low \n",
|
||
"9987 2020 8250 middle \n",
|
||
"9988 3100 5929 high \n",
|
||
"9989 2060 5395 middle \n",
|
||
"9990 1880 7770 middle \n",
|
||
"9991 2610 4770 middle \n",
|
||
"9992 1200 7500 middle \n",
|
||
"9993 3110 5058 high \n",
|
||
"9994 3000 5000 very_high \n",
|
||
"9995 1930 12139 low \n",
|
||
"9996 1700 9628 low \n",
|
||
"9997 1320 4750 middle \n",
|
||
"9998 1880 7750 high \n",
|
||
"9999 1820 35166 low \n",
|
||
"\n",
|
||
"[20 rows x 22 columns]"
|
||
]
|
||
},
|
||
"execution_count": 338,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"q1 = df['price'].quantile(0.25) # Находим 1-й квартиль (Q1)\n",
|
||
"q3 = df['price'].quantile(0.75) # Находим 3-й квартиль (Q3)\n",
|
||
"iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n",
|
||
"\n",
|
||
"# Определяем границы для выбросов\n",
|
||
"lower_bound = q1 - 1.5 * iqr # Нижняя граница\n",
|
||
"upper_bound = q3 + 1.5 * iqr # Верхняя граница\n",
|
||
"\n",
|
||
"# Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n",
|
||
"df['price'] = df['price'].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
|
||
"\n",
|
||
"# Добавляем столбец с категорями цены\n",
|
||
"df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n",
|
||
"df.tail(20)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Бизнес-цели\n",
|
||
"1. Прогноз класса цены недвижимости (Классификация)\n",
|
||
"2. Оценка состояния недвижимости (Регрессия)\n",
|
||
"\n",
|
||
"### Определение достижимого уровня качества модели для первой задачи\n",
|
||
"#### Разделение набора данных на обучающую и тестовую выборки (80/20) для задачи классификации (Целевой признак - price)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 339,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'X_train'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>9843</th>\n",
|
||
" <td>3260000340</td>\n",
|
||
" <td>20140622T000000</td>\n",
|
||
" <td>732600.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2130</td>\n",
|
||
" <td>7300</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1230</td>\n",
|
||
" <td>900</td>\n",
|
||
" <td>1963</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98005</td>\n",
|
||
" <td>47.6050</td>\n",
|
||
" <td>-122.167</td>\n",
|
||
" <td>2130</td>\n",
|
||
" <td>7560</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9623</th>\n",
|
||
" <td>9828702055</td>\n",
|
||
" <td>20140508T000000</td>\n",
|
||
" <td>358000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>1808</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1993</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98122</td>\n",
|
||
" <td>47.6183</td>\n",
|
||
" <td>-122.298</td>\n",
|
||
" <td>1290</td>\n",
|
||
" <td>1668</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3095</th>\n",
|
||
" <td>3438500625</td>\n",
|
||
" <td>20140519T000000</td>\n",
|
||
" <td>210000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1080</td>\n",
|
||
" <td>21043</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1080</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1942</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98106</td>\n",
|
||
" <td>47.5515</td>\n",
|
||
" <td>-122.357</td>\n",
|
||
" <td>1380</td>\n",
|
||
" <td>7620</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>411</th>\n",
|
||
" <td>2422029094</td>\n",
|
||
" <td>20140716T000000</td>\n",
|
||
" <td>517534.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>833</td>\n",
|
||
" <td>143947</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>833</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98070</td>\n",
|
||
" <td>47.3889</td>\n",
|
||
" <td>-122.482</td>\n",
|
||
" <td>1380</td>\n",
|
||
" <td>143947</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3060</th>\n",
|
||
" <td>7462900015</td>\n",
|
||
" <td>20150108T000000</td>\n",
|
||
" <td>387000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>1760</td>\n",
|
||
" <td>45133</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1760</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1984</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98065</td>\n",
|
||
" <td>47.5124</td>\n",
|
||
" <td>-121.866</td>\n",
|
||
" <td>1910</td>\n",
|
||
" <td>51773</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1750</th>\n",
|
||
" <td>2787720140</td>\n",
|
||
" <td>20150407T000000</td>\n",
|
||
" <td>416000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1790</td>\n",
|
||
" <td>11542</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1190</td>\n",
|
||
" <td>600</td>\n",
|
||
" <td>1969</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98059</td>\n",
|
||
" <td>47.5124</td>\n",
|
||
" <td>-122.160</td>\n",
|
||
" <td>1790</td>\n",
|
||
" <td>9131</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2354</th>\n",
|
||
" <td>6192400400</td>\n",
|
||
" <td>20140728T000000</td>\n",
|
||
" <td>775000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>3090</td>\n",
|
||
" <td>7112</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3090</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98052</td>\n",
|
||
" <td>47.7050</td>\n",
|
||
" <td>-122.118</td>\n",
|
||
" <td>3050</td>\n",
|
||
" <td>6000</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>857</th>\n",
|
||
" <td>2296500036</td>\n",
|
||
" <td>20150310T000000</td>\n",
|
||
" <td>450000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.75</td>\n",
|
||
" <td>2980</td>\n",
|
||
" <td>13260</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>1979</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98056</td>\n",
|
||
" <td>47.5152</td>\n",
|
||
" <td>-122.197</td>\n",
|
||
" <td>1920</td>\n",
|
||
" <td>10731</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6181</th>\n",
|
||
" <td>2787310130</td>\n",
|
||
" <td>20141212T000000</td>\n",
|
||
" <td>289950.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2090</td>\n",
|
||
" <td>7416</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>1040</td>\n",
|
||
" <td>1970</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98031</td>\n",
|
||
" <td>47.4107</td>\n",
|
||
" <td>-122.179</td>\n",
|
||
" <td>1710</td>\n",
|
||
" <td>7527</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3141</th>\n",
|
||
" <td>8567300110</td>\n",
|
||
" <td>20140604T000000</td>\n",
|
||
" <td>485000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2340</td>\n",
|
||
" <td>59058</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2340</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1985</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98038</td>\n",
|
||
" <td>47.4052</td>\n",
|
||
" <td>-122.028</td>\n",
|
||
" <td>2700</td>\n",
|
||
" <td>37263</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8000 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||
"9843 3260000340 20140622T000000 732600.0 4 2.50 2130 \n",
|
||
"9623 9828702055 20140508T000000 358000.0 2 1.50 960 \n",
|
||
"3095 3438500625 20140519T000000 210000.0 3 1.00 1080 \n",
|
||
"411 2422029094 20140716T000000 517534.0 2 1.00 833 \n",
|
||
"3060 7462900015 20150108T000000 387000.0 3 2.25 1760 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"1750 2787720140 20150407T000000 416000.0 3 2.50 1790 \n",
|
||
"2354 6192400400 20140728T000000 775000.0 4 2.50 3090 \n",
|
||
"857 2296500036 20150310T000000 450000.0 4 2.75 2980 \n",
|
||
"6181 2787310130 20141212T000000 289950.0 4 1.75 2090 \n",
|
||
"3141 8567300110 20140604T000000 485000.0 3 2.50 2340 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
|
||
"9843 7300 1.0 0 0 ... 1230 900 \n",
|
||
"9623 1808 2.0 0 0 ... 960 0 \n",
|
||
"3095 21043 1.0 0 0 ... 1080 0 \n",
|
||
"411 143947 1.0 0 0 ... 833 0 \n",
|
||
"3060 45133 2.0 0 0 ... 1760 0 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"1750 11542 1.0 0 0 ... 1190 600 \n",
|
||
"2354 7112 2.0 0 0 ... 3090 0 \n",
|
||
"857 13260 1.0 0 0 ... 1800 1180 \n",
|
||
"6181 7416 1.0 0 0 ... 1050 1040 \n",
|
||
"3141 59058 1.0 0 0 ... 2340 0 \n",
|
||
"\n",
|
||
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
||
"9843 1963 0 98005 47.6050 -122.167 2130 \n",
|
||
"9623 1993 0 98122 47.6183 -122.298 1290 \n",
|
||
"3095 1942 0 98106 47.5515 -122.357 1380 \n",
|
||
"411 2006 0 98070 47.3889 -122.482 1380 \n",
|
||
"3060 1984 0 98065 47.5124 -121.866 1910 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"1750 1969 0 98059 47.5124 -122.160 1790 \n",
|
||
"2354 2001 0 98052 47.7050 -122.118 3050 \n",
|
||
"857 1979 0 98056 47.5152 -122.197 1920 \n",
|
||
"6181 1970 0 98031 47.4107 -122.179 1710 \n",
|
||
"3141 1985 0 98038 47.4052 -122.028 2700 \n",
|
||
"\n",
|
||
" sqft_lot15 price_category \n",
|
||
"9843 7560 high \n",
|
||
"9623 1668 middle \n",
|
||
"3095 7620 low \n",
|
||
"411 143947 middle \n",
|
||
"3060 51773 middle \n",
|
||
"... ... ... \n",
|
||
"1750 9131 middle \n",
|
||
"2354 6000 high \n",
|
||
"857 10731 middle \n",
|
||
"6181 7527 low \n",
|
||
"3141 37263 middle \n",
|
||
"\n",
|
||
"[8000 rows x 22 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'y_train'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>9843</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9623</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3095</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>411</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3060</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1750</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2354</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>857</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6181</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3141</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8000 rows × 1 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" price_category\n",
|
||
"9843 high\n",
|
||
"9623 middle\n",
|
||
"3095 low\n",
|
||
"411 middle\n",
|
||
"3060 middle\n",
|
||
"... ...\n",
|
||
"1750 middle\n",
|
||
"2354 high\n",
|
||
"857 middle\n",
|
||
"6181 low\n",
|
||
"3141 middle\n",
|
||
"\n",
|
||
"[8000 rows x 1 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'X_test'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>5341</th>\n",
|
||
" <td>6632900574</td>\n",
|
||
" <td>20150225T000000</td>\n",
|
||
" <td>595000.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>2980</td>\n",
|
||
" <td>10064</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>1300</td>\n",
|
||
" <td>1940</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98155</td>\n",
|
||
" <td>47.7372</td>\n",
|
||
" <td>-122.316</td>\n",
|
||
" <td>1590</td>\n",
|
||
" <td>7800</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4384</th>\n",
|
||
" <td>2423029245</td>\n",
|
||
" <td>20140617T000000</td>\n",
|
||
" <td>550000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2240</td>\n",
|
||
" <td>78225</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2240</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98070</td>\n",
|
||
" <td>47.4638</td>\n",
|
||
" <td>-122.484</td>\n",
|
||
" <td>2030</td>\n",
|
||
" <td>202554</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5795</th>\n",
|
||
" <td>2473370050</td>\n",
|
||
" <td>20140604T000000</td>\n",
|
||
" <td>327500.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1650</td>\n",
|
||
" <td>7800</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1650</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1968</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98058</td>\n",
|
||
" <td>47.4507</td>\n",
|
||
" <td>-122.139</td>\n",
|
||
" <td>1750</td>\n",
|
||
" <td>10400</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4956</th>\n",
|
||
" <td>9528104985</td>\n",
|
||
" <td>20141104T000000</td>\n",
|
||
" <td>611000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1270</td>\n",
|
||
" <td>5100</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1100</td>\n",
|
||
" <td>170</td>\n",
|
||
" <td>1900</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98115</td>\n",
|
||
" <td>47.6771</td>\n",
|
||
" <td>-122.328</td>\n",
|
||
" <td>1670</td>\n",
|
||
" <td>3900</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7723</th>\n",
|
||
" <td>3972900025</td>\n",
|
||
" <td>20150313T000000</td>\n",
|
||
" <td>499000.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2400</td>\n",
|
||
" <td>7500</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1400</td>\n",
|
||
" <td>1000</td>\n",
|
||
" <td>1975</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98155</td>\n",
|
||
" <td>47.7661</td>\n",
|
||
" <td>-122.313</td>\n",
|
||
" <td>1980</td>\n",
|
||
" <td>7500</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8517</th>\n",
|
||
" <td>3876600120</td>\n",
|
||
" <td>20150422T000000</td>\n",
|
||
" <td>265000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>1780</td>\n",
|
||
" <td>10196</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1270</td>\n",
|
||
" <td>510</td>\n",
|
||
" <td>1967</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98001</td>\n",
|
||
" <td>47.3375</td>\n",
|
||
" <td>-122.291</td>\n",
|
||
" <td>1320</td>\n",
|
||
" <td>7875</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6914</th>\n",
|
||
" <td>6821600005</td>\n",
|
||
" <td>20150403T000000</td>\n",
|
||
" <td>710000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2120</td>\n",
|
||
" <td>5400</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1060</td>\n",
|
||
" <td>1060</td>\n",
|
||
" <td>1941</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98199</td>\n",
|
||
" <td>47.6501</td>\n",
|
||
" <td>-122.395</td>\n",
|
||
" <td>2052</td>\n",
|
||
" <td>6000</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4499</th>\n",
|
||
" <td>2767603931</td>\n",
|
||
" <td>20140818T000000</td>\n",
|
||
" <td>469000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3.25</td>\n",
|
||
" <td>1370</td>\n",
|
||
" <td>1194</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1370</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2004</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98107</td>\n",
|
||
" <td>47.6718</td>\n",
|
||
" <td>-122.388</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>2678</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8651</th>\n",
|
||
" <td>8802400411</td>\n",
|
||
" <td>20140619T000000</td>\n",
|
||
" <td>249000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>8498</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1959</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98031</td>\n",
|
||
" <td>47.4043</td>\n",
|
||
" <td>-122.202</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>8498</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4234</th>\n",
|
||
" <td>5452800735</td>\n",
|
||
" <td>20140722T000000</td>\n",
|
||
" <td>780000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2270</td>\n",
|
||
" <td>13449</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1310</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>1975</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98040</td>\n",
|
||
" <td>47.5416</td>\n",
|
||
" <td>-122.232</td>\n",
|
||
" <td>2810</td>\n",
|
||
" <td>13475</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2000 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||
"5341 6632900574 20150225T000000 595000.0 5 3.00 2980 \n",
|
||
"4384 2423029245 20140617T000000 550000.0 3 1.75 2240 \n",
|
||
"5795 2473370050 20140604T000000 327500.0 4 1.75 1650 \n",
|
||
"4956 9528104985 20141104T000000 611000.0 2 1.00 1270 \n",
|
||
"7723 3972900025 20150313T000000 499000.0 6 1.75 2400 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"8517 3876600120 20150422T000000 265000.0 3 1.50 1780 \n",
|
||
"6914 6821600005 20150403T000000 710000.0 4 1.75 2120 \n",
|
||
"4499 2767603931 20140818T000000 469000.0 3 3.25 1370 \n",
|
||
"8651 8802400411 20140619T000000 249000.0 3 1.00 1050 \n",
|
||
"4234 5452800735 20140722T000000 780000.0 4 2.50 2270 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
|
||
"5341 10064 1.0 0 0 ... 1680 1300 \n",
|
||
"4384 78225 2.0 0 0 ... 2240 0 \n",
|
||
"5795 7800 1.0 0 0 ... 1650 0 \n",
|
||
"4956 5100 1.0 0 0 ... 1100 170 \n",
|
||
"7723 7500 1.5 0 0 ... 1400 1000 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"8517 10196 1.0 0 0 ... 1270 510 \n",
|
||
"6914 5400 1.0 0 0 ... 1060 1060 \n",
|
||
"4499 1194 3.0 0 0 ... 1370 0 \n",
|
||
"8651 8498 1.0 0 0 ... 1050 0 \n",
|
||
"4234 13449 1.0 0 0 ... 1310 960 \n",
|
||
"\n",
|
||
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
||
"5341 1940 0 98155 47.7372 -122.316 1590 \n",
|
||
"4384 1976 0 98070 47.4638 -122.484 2030 \n",
|
||
"5795 1968 0 98058 47.4507 -122.139 1750 \n",
|
||
"4956 1900 0 98115 47.6771 -122.328 1670 \n",
|
||
"7723 1975 0 98155 47.7661 -122.313 1980 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"8517 1967 0 98001 47.3375 -122.291 1320 \n",
|
||
"6914 1941 0 98199 47.6501 -122.395 2052 \n",
|
||
"4499 2004 0 98107 47.6718 -122.388 1800 \n",
|
||
"8651 1959 0 98031 47.4043 -122.202 1050 \n",
|
||
"4234 1975 0 98040 47.5416 -122.232 2810 \n",
|
||
"\n",
|
||
" sqft_lot15 price_category \n",
|
||
"5341 7800 middle \n",
|
||
"4384 202554 middle \n",
|
||
"5795 10400 low \n",
|
||
"4956 3900 high \n",
|
||
"7723 7500 middle \n",
|
||
"... ... ... \n",
|
||
"8517 7875 low \n",
|
||
"6914 6000 high \n",
|
||
"4499 2678 middle \n",
|
||
"8651 8498 low \n",
|
||
"4234 13475 high \n",
|
||
"\n",
|
||
"[2000 rows x 22 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'y_test'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>5341</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4384</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5795</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4956</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7723</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8517</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6914</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4499</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8651</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4234</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2000 rows × 1 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" price_category\n",
|
||
"5341 middle\n",
|
||
"4384 middle\n",
|
||
"5795 low\n",
|
||
"4956 high\n",
|
||
"7723 middle\n",
|
||
"... ...\n",
|
||
"8517 low\n",
|
||
"6914 high\n",
|
||
"4499 middle\n",
|
||
"8651 low\n",
|
||
"4234 high\n",
|
||
"\n",
|
||
"[2000 rows x 1 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"from typing import Tuple\n",
|
||
"import pandas as pd\n",
|
||
"from pandas import DataFrame\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"def split_stratified_into_train_val_test(\n",
|
||
"    df_input,\n",
|
||
"    stratify_colname=\"y\",\n",
|
||
"    frac_train=0.6,\n",
|
||
"    frac_val=0.15,\n",
|
||
"    frac_test=0.25,\n",
|
||
"    random_state=None,\n",
|
||
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
|
||
"    \"\"\"Split a dataframe into stratified train / validation / test parts.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        df_input: Dataframe to split; it must contain ``stratify_colname``.\n",
|
||
"        stratify_colname: Name of the column passed as ``stratify`` to\n",
|
||
"            ``train_test_split`` for every split.\n",
|
||
"        frac_train: Fraction of rows placed in the training part.\n",
|
||
"        frac_val: Fraction of rows placed in the validation part. When it is\n",
|
||
"            <= 0 no second split is made and empty dataframes are returned\n",
|
||
"            in place of the validation part.\n",
|
||
"        frac_test: Fraction of rows placed in the test part.\n",
|
||
"        random_state: Seed forwarded to both ``train_test_split`` calls.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The ``df_*``\n",
|
||
"        parts keep every column of ``df_input`` (the stratification column\n",
|
||
"        included); the ``y_*`` parts are one-column dataframes holding only\n",
|
||
"        ``stratify_colname``.\n",
|
||
"\n",
|
||
"    Raises:\n",
|
||
"        ValueError: If the fractions do not sum to 1.0 (within a small\n",
|
||
"            floating-point tolerance) or ``stratify_colname`` is not a\n",
|
||
"            column of ``df_input``.\n",
|
||
"    \"\"\"\n",
|
||
"    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly\n",
|
||
"    # 1.0 in binary floating point, so an exact `!=` rejects valid input.\n",
|
||
"    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:\n",
|
||
"        raise ValueError(\n",
|
||
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||
"            % (frac_train, frac_val, frac_test)\n",
|
||
"        )\n",
|
||
"    if stratify_colname not in df_input.columns:\n",
|
||
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||
"    X = df_input  # Contains all columns.\n",
|
||
"    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
|
||
"    # Split original dataframe into train and temp dataframes.\n",
|
||
"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||
"    )\n",
|
||
"    if frac_val <= 0:\n",
|
||
"        # No validation part requested: everything outside train is the test part.\n",
|
||
"        assert len(df_input) == len(df_train) + len(df_temp)\n",
|
||
"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
|
||
"    # Split the temp dataframe into val and test dataframes.\n",
|
||
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||
"    df_val, df_test, y_val, y_test = train_test_split(\n",
|
||
"        df_temp,\n",
|
||
"        y_temp,\n",
|
||
"        stratify=y_temp,\n",
|
||
"        test_size=relative_frac_test,\n",
|
||
"        random_state=random_state,\n",
|
||
"    )\n",
|
||
"    # Sanity check: the three parts together account for every input row.\n",
|
||
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||
"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
|
||
"\n",
|
||
"# Hold out 20% of the rows for testing, stratified by price category.\n",
|
||
"# frac_val=0 means no validation part, so X_val / y_val come back empty.\n",
|
||
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||
"    df,\n",
|
||
"    stratify_colname=\"price_category\",\n",
|
||
"    frac_train=0.80,\n",
|
||
"    frac_val=0,\n",
|
||
"    frac_test=0.20,\n",
|
||
"    random_state=42,\n",
|
||
")\n",
|
||
"\n",
|
||
"# Show each resulting part preceded by its name.\n",
|
||
"for _name, _part in (\n",
|
||
"    (\"X_train\", X_train),\n",
|
||
"    (\"y_train\", y_train),\n",
|
||
"    (\"X_test\", X_test),\n",
|
||
"    (\"y_test\", y_test),\n",
|
||
"):\n",
|
||
"    display(_name, _part)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Формирование конвейера\n",
|
||
"preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
|
||
"\n",
|
||
"preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
|
||
"\n",
|
||
"features_preprocessing -- трансформер для предобработки признаков\n",
|
||
"\n",
|
||
"drop_columns -- трансформер для удаления колонок\n",
|
||
"\n",
|
||
"pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 340,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>condition</th>\n",
|
||
" <th>grade</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_h</th>\n",
|
||
" <th>price_l</th>\n",
|
||
" <th>price_m</th>\n",
|
||
" <th>price_vh</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>-0.451103</td>\n",
|
||
" <td>0.916381</td>\n",
|
||
" <td>0.700559</td>\n",
|
||
" <td>0.573416</td>\n",
|
||
" <td>0.081706</td>\n",
|
||
" <td>-0.187493</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>0.839159</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>-0.638064</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-1.349962</td>\n",
|
||
" <td>0.32254</td>\n",
|
||
" <td>0.340593</td>\n",
|
||
" <td>0.223199</td>\n",
|
||
" <td>-0.210584</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1.845014</td>\n",
|
||
" <td>-0.589326</td>\n",
|
||
" <td>-1.49426</td>\n",
|
||
" <td>-0.72971</td>\n",
|
||
" <td>-1.191326</td>\n",
|
||
" <td>-0.302999</td>\n",
|
||
" <td>1.120073</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>-0.969739</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>0.820656</td>\n",
|
||
" <td>0.417588</td>\n",
|
||
" <td>-0.601419</td>\n",
|
||
" <td>-1.022503</td>\n",
|
||
" <td>-0.421966</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>-0.388708</td>\n",
|
||
" <td>-1.184213</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>-1.381273</td>\n",
|
||
" <td>-1.060759</td>\n",
|
||
" <td>0.101544</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-1.369558</td>\n",
|
||
" <td>-0.822328</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>0.523819</td>\n",
|
||
" <td>-0.059795</td>\n",
|
||
" <td>-1.025683</td>\n",
|
||
" <td>-0.889035</td>\n",
|
||
" <td>-0.208431</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>-0.74402</td>\n",
|
||
" <td>0.051922</td>\n",
|
||
" <td>-1.49426</td>\n",
|
||
" <td>-1.381273</td>\n",
|
||
" <td>-1.32951</td>\n",
|
||
" <td>2.686416</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-2.22647</td>\n",
|
||
" <td>-1.125749</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-0.144063</td>\n",
|
||
" <td>-1.221808</td>\n",
|
||
" <td>-1.924549</td>\n",
|
||
" <td>-0.889035</td>\n",
|
||
" <td>4.682444</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1.018038</td>\n",
|
||
" <td>-0.47276</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>0.247635</td>\n",
|
||
" <td>-0.320877</td>\n",
|
||
" <td>0.608196</td>\n",
|
||
" <td>1.120073</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>0.013003</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-0.236825</td>\n",
|
||
" <td>-0.339221</td>\n",
|
||
" <td>2.505062</td>\n",
|
||
" <td>-0.103056</td>\n",
|
||
" <td>1.375604</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>-0.083826</td>\n",
|
||
" <td>-0.492858</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>1.550761</td>\n",
|
||
" <td>-0.701698</td>\n",
|
||
" <td>-0.314672</td>\n",
|
||
" <td>3.078884</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>0.344264</td>\n",
|
||
" <td>-0.416947</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>0.468162</td>\n",
|
||
" <td>0.987875</td>\n",
|
||
" <td>-0.903438</td>\n",
|
||
" <td>-0.844546</td>\n",
|
||
" <td>-0.436854</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>0.301277</td>\n",
|
||
" <td>-0.953091</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>0.573416</td>\n",
|
||
" <td>-0.712579</td>\n",
|
||
" <td>-0.180574</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>-0.773191</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-0.886155</td>\n",
|
||
" <td>-1.293987</td>\n",
|
||
" <td>0.254302</td>\n",
|
||
" <td>-0.666588</td>\n",
|
||
" <td>-0.205992</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>-0.086798</td>\n",
|
||
" <td>-1.148038</td>\n",
|
||
" <td>-1.49426</td>\n",
|
||
" <td>-1.381273</td>\n",
|
||
" <td>-1.25661</td>\n",
|
||
" <td>-0.232501</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-1.369558</td>\n",
|
||
" <td>-1.043445</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>0.523819</td>\n",
|
||
" <td>-0.249176</td>\n",
|
||
" <td>-1.018493</td>\n",
|
||
" <td>-1.600865</td>\n",
|
||
" <td>-0.296686</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>-0.824567</td>\n",
|
||
" <td>-1.148038</td>\n",
|
||
" <td>-1.49426</td>\n",
|
||
" <td>-1.381273</td>\n",
|
||
" <td>-1.0934</td>\n",
|
||
" <td>-0.15174</td>\n",
|
||
" <td>0.140667</td>\n",
|
||
" <td>0.839159</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>-0.859181</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-1.387066</td>\n",
|
||
" <td>-1.937882</td>\n",
|
||
" <td>-0.60861</td>\n",
|
||
" <td>-0.636929</td>\n",
|
||
" <td>-0.137397</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>1.647935</td>\n",
|
||
" <td>-0.762165</td>\n",
|
||
" <td>2.895378</td>\n",
|
||
" <td>0.899198</td>\n",
|
||
" <td>0.963036</td>\n",
|
||
" <td>-0.186442</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>0.344264</td>\n",
|
||
" <td>0.037571</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-1.016021</td>\n",
|
||
" <td>-1.783519</td>\n",
|
||
" <td>-0.896247</td>\n",
|
||
" <td>0.208369</td>\n",
|
||
" <td>-0.186332</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>-1.159614</td>\n",
|
||
" <td>-0.581287</td>\n",
|
||
" <td>-1.49426</td>\n",
|
||
" <td>-1.381273</td>\n",
|
||
" <td>-1.321893</td>\n",
|
||
" <td>-0.185096</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>0.839159</td>\n",
|
||
" <td>-1.369558</td>\n",
|
||
" <td>-1.11715</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-0.830498</td>\n",
|
||
" <td>0.837799</td>\n",
|
||
" <td>0.304638</td>\n",
|
||
" <td>-0.355163</td>\n",
|
||
" <td>-0.130796</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>-1.329183</td>\n",
|
||
" <td>-0.681775</td>\n",
|
||
" <td>-1.49426</td>\n",
|
||
" <td>-1.381273</td>\n",
|
||
" <td>-1.071639</td>\n",
|
||
" <td>-0.200575</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>0.839159</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>-0.834612</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>1.024731</td>\n",
|
||
" <td>1.226566</td>\n",
|
||
" <td>-1.025683</td>\n",
|
||
" <td>-0.444141</td>\n",
|
||
" <td>-0.202404</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>0.377864</td>\n",
|
||
" <td>0.286926</td>\n",
|
||
" <td>0.700559</td>\n",
|
||
" <td>0.573416</td>\n",
|
||
" <td>0.419005</td>\n",
|
||
" <td>0.256379</td>\n",
|
||
" <td>1.120073</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>0.344264</td>\n",
|
||
" <td>0.848334</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-0.923259</td>\n",
|
||
" <td>1.277306</td>\n",
|
||
" <td>-0.169963</td>\n",
|
||
" <td>0.742242</td>\n",
|
||
" <td>-0.071779</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>0.289882</td>\n",
|
||
" <td>-0.88677</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>0.573416</td>\n",
|
||
" <td>0.103467</td>\n",
|
||
" <td>-0.143853</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>0.344264</td>\n",
|
||
" <td>-0.244967</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>2.045107</td>\n",
|
||
" <td>-0.729417</td>\n",
|
||
" <td>-0.428836</td>\n",
|
||
" <td>-0.043737</td>\n",
|
||
" <td>-0.155335</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>1.613049</td>\n",
|
||
" <td>0.282907</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>-0.078147</td>\n",
|
||
" <td>0.103467</td>\n",
|
||
" <td>-0.259422</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>0.344264</td>\n",
|
||
" <td>-0.822328</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>0.727894</td>\n",
|
||
" <td>0.868529</td>\n",
|
||
" <td>-1.277366</td>\n",
|
||
" <td>0.223199</td>\n",
|
||
" <td>-0.338303</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>-0.962885</td>\n",
|
||
" <td>0.285118</td>\n",
|
||
" <td>0.700559</td>\n",
|
||
" <td>0.573416</td>\n",
|
||
" <td>0.005542</td>\n",
|
||
" <td>-0.183813</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>0.344264</td>\n",
|
||
" <td>-0.380094</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-0.478004</td>\n",
|
||
" <td>1.195837</td>\n",
|
||
" <td>0.78643</td>\n",
|
||
" <td>0.445646</td>\n",
|
||
" <td>-0.180592</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>1.722145</td>\n",
|
||
" <td>-0.259726</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>-0.403928</td>\n",
|
||
" <td>-0.571131</td>\n",
|
||
" <td>-0.18865</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>0.839159</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>-0.269535</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-0.811945</td>\n",
|
||
" <td>1.222993</td>\n",
|
||
" <td>0.168011</td>\n",
|
||
" <td>-0.666588</td>\n",
|
||
" <td>-0.213095</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>0.740562</td>\n",
|
||
" <td>1.589247</td>\n",
|
||
" <td>0.700559</td>\n",
|
||
" <td>1.550761</td>\n",
|
||
" <td>2.878025</td>\n",
|
||
" <td>0.466843</td>\n",
|
||
" <td>1.120073</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>2.058087</td>\n",
|
||
" <td>2.052192</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>-1.349962</td>\n",
|
||
" <td>0.604825</td>\n",
|
||
" <td>0.340593</td>\n",
|
||
" <td>2.462498</td>\n",
|
||
" <td>0.79434</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>-1.555659</td>\n",
|
||
" <td>-0.922945</td>\n",
|
||
" <td>-0.396851</td>\n",
|
||
" <td>-1.381273</td>\n",
|
||
" <td>-0.799624</td>\n",
|
||
" <td>-0.107784</td>\n",
|
||
" <td>-0.838739</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>-0.527505</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>-0.2158</td>\n",
|
||
" <td>1.432881</td>\n",
|
||
" <td>1.536008</td>\n",
|
||
" <td>-0.644564</td>\n",
|
||
" <td>-0.978014</td>\n",
|
||
" <td>-0.183354</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>-0.953738</td>\n",
|
||
" <td>0.142224</td>\n",
|
||
" <td>2.895378</td>\n",
|
||
" <td>1.224979</td>\n",
|
||
" <td>0.886872</td>\n",
|
||
" <td>4.00146</td>\n",
|
||
" <td>1.120073</td>\n",
|
||
" <td>-0.666734</td>\n",
|
||
" <td>-0.512647</td>\n",
|
||
" <td>0.713207</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>4.605736</td>\n",
|
||
" <td>-0.663527</td>\n",
|
||
" <td>-1.135335</td>\n",
|
||
" <td>0.85834</td>\n",
|
||
" <td>0.593944</td>\n",
|
||
" <td>1.659169</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>20 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
|
||
"0 -0.451103 0.916381 0.700559 0.573416 0.081706 -0.187493 -0.838739 \n",
|
||
"1 1.845014 -0.589326 -1.49426 -0.72971 -1.191326 -0.302999 1.120073 \n",
|
||
"2 -0.388708 -1.184213 -0.396851 -1.381273 -1.060759 0.101544 -0.838739 \n",
|
||
"3 -0.74402 0.051922 -1.49426 -1.381273 -1.32951 2.686416 -0.838739 \n",
|
||
"4 1.018038 -0.47276 -0.396851 0.247635 -0.320877 0.608196 1.120073 \n",
|
||
"5 -0.083826 -0.492858 -0.396851 1.550761 -0.701698 -0.314672 3.078884 \n",
|
||
"6 0.301277 -0.953091 -0.396851 0.573416 -0.712579 -0.180574 -0.838739 \n",
|
||
"7 -0.086798 -1.148038 -1.49426 -1.381273 -1.25661 -0.232501 -0.838739 \n",
|
||
"8 -0.824567 -1.148038 -1.49426 -1.381273 -1.0934 -0.15174 0.140667 \n",
|
||
"9 1.647935 -0.762165 2.895378 0.899198 0.963036 -0.186442 -0.838739 \n",
|
||
"10 -1.159614 -0.581287 -1.49426 -1.381273 -1.321893 -0.185096 -0.838739 \n",
|
||
"11 -1.329183 -0.681775 -1.49426 -1.381273 -1.071639 -0.200575 -0.838739 \n",
|
||
"12 0.377864 0.286926 0.700559 0.573416 0.419005 0.256379 1.120073 \n",
|
||
"13 0.289882 -0.88677 -0.396851 0.573416 0.103467 -0.143853 -0.838739 \n",
|
||
"14 1.613049 0.282907 -0.396851 -0.078147 0.103467 -0.259422 -0.838739 \n",
|
||
"15 -0.962885 0.285118 0.700559 0.573416 0.005542 -0.183813 -0.838739 \n",
|
||
"16 1.722145 -0.259726 -0.396851 -0.403928 -0.571131 -0.18865 -0.838739 \n",
|
||
"17 0.740562 1.589247 0.700559 1.550761 2.878025 0.466843 1.120073 \n",
|
||
"18 -1.555659 -0.922945 -0.396851 -1.381273 -0.799624 -0.107784 -0.838739 \n",
|
||
"19 -0.953738 0.142224 2.895378 1.224979 0.886872 4.00146 1.120073 \n",
|
||
"\n",
|
||
" condition grade sqft_above ... yr_renovated zipcode lat \\\n",
|
||
"0 0.839159 -0.512647 -0.638064 ... -0.2158 -1.349962 0.32254 \n",
|
||
"1 -0.666734 -0.512647 -0.969739 ... -0.2158 0.820656 0.417588 \n",
|
||
"2 -0.666734 -1.369558 -0.822328 ... -0.2158 0.523819 -0.059795 \n",
|
||
"3 -0.666734 -2.22647 -1.125749 ... -0.2158 -0.144063 -1.221808 \n",
|
||
"4 -0.666734 -0.512647 0.013003 ... -0.2158 -0.236825 -0.339221 \n",
|
||
"5 -0.666734 0.344264 -0.416947 ... -0.2158 0.468162 0.987875 \n",
|
||
"6 -0.666734 -0.512647 -0.773191 ... -0.2158 -0.886155 -1.293987 \n",
|
||
"7 -0.666734 -1.369558 -1.043445 ... -0.2158 0.523819 -0.249176 \n",
|
||
"8 0.839159 -0.512647 -0.859181 ... -0.2158 -1.387066 -1.937882 \n",
|
||
"9 -0.666734 0.344264 0.037571 ... -0.2158 -1.016021 -1.783519 \n",
|
||
"10 0.839159 -1.369558 -1.11715 ... -0.2158 -0.830498 0.837799 \n",
|
||
"11 0.839159 -0.512647 -0.834612 ... -0.2158 1.024731 1.226566 \n",
|
||
"12 -0.666734 0.344264 0.848334 ... -0.2158 -0.923259 1.277306 \n",
|
||
"13 -0.666734 0.344264 -0.244967 ... -0.2158 2.045107 -0.729417 \n",
|
||
"14 -0.666734 0.344264 -0.822328 ... -0.2158 0.727894 0.868529 \n",
|
||
"15 -0.666734 0.344264 -0.380094 ... -0.2158 -0.478004 1.195837 \n",
|
||
"16 0.839159 -0.512647 -0.269535 ... -0.2158 -0.811945 1.222993 \n",
|
||
"17 -0.666734 2.058087 2.052192 ... -0.2158 -1.349962 0.604825 \n",
|
||
"18 -0.666734 -0.512647 -0.527505 ... -0.2158 1.432881 1.536008 \n",
|
||
"19 -0.666734 -0.512647 0.713207 ... 4.605736 -0.663527 -1.135335 \n",
|
||
"\n",
|
||
" long sqft_living15 sqft_lot15 price_h price_l price_m price_vh \n",
|
||
"0 0.340593 0.223199 -0.210584 1.0 0.0 0.0 0.0 \n",
|
||
"1 -0.601419 -1.022503 -0.421966 0.0 0.0 1.0 0.0 \n",
|
||
"2 -1.025683 -0.889035 -0.208431 0.0 1.0 0.0 0.0 \n",
|
||
"3 -1.924549 -0.889035 4.682444 0.0 0.0 1.0 0.0 \n",
|
||
"4 2.505062 -0.103056 1.375604 0.0 0.0 1.0 0.0 \n",
|
||
"5 -0.903438 -0.844546 -0.436854 0.0 0.0 1.0 0.0 \n",
|
||
"6 0.254302 -0.666588 -0.205992 0.0 1.0 0.0 0.0 \n",
|
||
"7 -1.018493 -1.600865 -0.296686 0.0 1.0 0.0 0.0 \n",
|
||
"8 -0.60861 -0.636929 -0.137397 0.0 1.0 0.0 0.0 \n",
|
||
"9 -0.896247 0.208369 -0.186332 0.0 1.0 0.0 0.0 \n",
|
||
"10 0.304638 -0.355163 -0.130796 0.0 0.0 1.0 0.0 \n",
|
||
"11 -1.025683 -0.444141 -0.202404 0.0 1.0 0.0 0.0 \n",
|
||
"12 -0.169963 0.742242 -0.071779 0.0 0.0 1.0 0.0 \n",
|
||
"13 -0.428836 -0.043737 -0.155335 0.0 1.0 0.0 0.0 \n",
|
||
"14 -1.277366 0.223199 -0.338303 0.0 0.0 1.0 0.0 \n",
|
||
"15 0.78643 0.445646 -0.180592 0.0 0.0 1.0 0.0 \n",
|
||
"16 0.168011 -0.666588 -0.213095 0.0 0.0 1.0 0.0 \n",
|
||
"17 0.340593 2.462498 0.79434 0.0 0.0 0.0 1.0 \n",
|
||
"18 -0.644564 -0.978014 -0.183354 0.0 1.0 0.0 0.0 \n",
|
||
"19 0.85834 0.593944 1.659169 0.0 0.0 1.0 0.0 \n",
|
||
"\n",
|
||
"[20 rows x 22 columns]"
|
||
]
|
||
},
|
||
"execution_count": 340,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.impute import SimpleImputer\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.pipeline import make_pipeline\n",
|
||
"\n",
|
||
"random_state = 42\n",
|
||
"\n",
|
||
"# Указываем столбцы, которые нужно удалить и обрабатывать\n",
|
||
"columns_to_drop = [\"date\", \"view\", \"waterfront\"]\n",
|
||
"num_columns = [\n",
|
||
" column\n",
|
||
" for column in df.columns\n",
|
||
" if column not in columns_to_drop and df[column].dtype != \"object\" and df[column].dtype != \"category\"\n",
|
||
"]\n",
|
||
"cat_columns = [\n",
|
||
" column\n",
|
||
" for column in df.columns\n",
|
||
" if column not in columns_to_drop and df[column].dtype == \"object\" or df[column].dtype == \"category\"\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Определяем предобработку для численных данных\n",
|
||
"num_imputer = SimpleImputer(strategy=\"median\")\n",
|
||
"num_scaler = StandardScaler()\n",
|
||
"preprocessing_num = Pipeline(\n",
|
||
" [\n",
|
||
" (\"imputer\", num_imputer),\n",
|
||
" (\"scaler\", num_scaler),\n",
|
||
" ]\n",
|
||
")\n",
|
||
"\n",
|
||
"# Определяем предобработку для категориальных данных\n",
|
||
"cat_imputer = SimpleImputer(strategy=\"constant\")\n",
|
||
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)\n",
|
||
"preprocessing_cat = Pipeline(\n",
|
||
" [\n",
|
||
" (\"imputer\", cat_imputer),\n",
|
||
" (\"encoder\", cat_encoder),\n",
|
||
" ]\n",
|
||
")\n",
|
||
"\n",
|
||
"features_preprocessing = ColumnTransformer(\n",
|
||
" verbose_feature_names_out=False,\n",
|
||
" transformers=[\n",
|
||
" (\"prepocessing_num\", preprocessing_num, num_columns),\n",
|
||
" (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
|
||
" (\"prepocessing_features\", cat_imputer, [\"price_category\"]),\n",
|
||
" ],\n",
|
||
" remainder=\"passthrough\"\n",
|
||
")\n",
|
||
"\n",
|
||
"drop_columns = ColumnTransformer(\n",
|
||
" verbose_feature_names_out=False,\n",
|
||
" transformers=[\n",
|
||
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
|
||
" ],\n",
|
||
" remainder=\"passthrough\",\n",
|
||
")\n",
|
||
"\n",
|
||
"features_postprocessing = ColumnTransformer(\n",
|
||
" verbose_feature_names_out=False,\n",
|
||
" transformers=[\n",
|
||
" (\"prepocessing_cat\", preprocessing_cat, [\"price_category\"]),\n",
|
||
" ],\n",
|
||
" remainder=\"passthrough\",\n",
|
||
")\n",
|
||
"\n",
|
||
"pipeline_end = Pipeline(\n",
|
||
" [\n",
|
||
" (\"features_preprocessing\", features_preprocessing),\n",
|
||
" (\"drop_columns\", drop_columns),\n",
|
||
" (\"features_postprocessing\", features_postprocessing),\n",
|
||
" ]\n",
|
||
"\n",
|
||
")\n",
|
||
"# preprocessing_result = pipeline_end.fit_transform(X_train.values)\n",
|
||
"cols = ['price_h', 'price_l', 'price_m', 'price_vh']\n",
|
||
"preprocessing_result = features_preprocessing.fit_transform(X_train)\n",
|
||
"preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns + cols + columns_to_drop)\n",
|
||
"\n",
|
||
"preprocessing_result = drop_columns.fit_transform(preprocessing_result)\n",
|
||
"preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cols + cat_columns)\n",
|
||
"\n",
|
||
"preprocessing_result = preprocessing_result.drop(columns=[\"price_category\"])\n",
|
||
"preprocessing_result.head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Формирование набора моделей для классификации\n",
|
||
"logistic -- логистическая регрессия\n",
|
||
"\n",
|
||
"ridge -- гребневая регрессия\n",
|
||
"\n",
|
||
"decision_tree -- дерево решений\n",
|
||
"\n",
|
||
"knn -- k-ближайших соседей\n",
|
||
"\n",
|
||
"naive_bayes -- наивный Байесовский классификатор\n",
|
||
"\n",
|
||
"gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
|
||
"\n",
|
||
"random_forest -- метод случайного леса (набор деревьев решений)\n",
|
||
"\n",
|
||
"mlp -- многослойный персептрон (нейронная сеть)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 341,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
|
||
"\n",
|
||
"class_models = {\n",
|
||
" \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
|
||
" # \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n",
|
||
" \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n",
|
||
" \"decision_tree\": {\n",
|
||
" \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
|
||
" },\n",
|
||
" \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
|
||
" \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
|
||
" \"gradient_boosting\": {\n",
|
||
" \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n",
|
||
" },\n",
|
||
" \"random_forest\": {\n",
|
||
" \"model\": ensemble.RandomForestClassifier(\n",
|
||
" max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
|
||
" )\n",
|
||
" },\n",
|
||
" \"mlp\": {\n",
|
||
" \"model\": neural_network.MLPClassifier(\n",
|
||
" hidden_layer_sizes=(7,),\n",
|
||
" max_iter=500,\n",
|
||
" early_stopping=True,\n",
|
||
" random_state=random_state,\n",
|
||
" )\n",
|
||
" },\n",
|
||
"}"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Обучение моделей на обучающем наборе данных и оценка на тестовом"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 343,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: logistic\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "ValueError",
|
||
"evalue": "Specifying the columns using strings is only supported for dataframes.",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_indexing.py:338\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 338\u001b[0m all_columns \u001b[38;5;241m=\u001b[39m \u001b[43mX\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n",
|
||
"\u001b[1;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'",
|
||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[1;32mIn[343], line 9\u001b[0m\n\u001b[0;32m 6\u001b[0m model \u001b[38;5;241m=\u001b[39m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 8\u001b[0m model_pipeline \u001b[38;5;241m=\u001b[39m Pipeline([(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, pipeline_end), (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m, model)])\n\u001b[1;32m----> 9\u001b[0m model_pipeline \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_pipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mravel\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 11\u001b[0m y_train_predict \u001b[38;5;241m=\u001b[39m model_pipeline\u001b[38;5;241m.\u001b[39mpredict(X_train)\n\u001b[0;32m 12\u001b[0m y_test_probs \u001b[38;5;241m=\u001b[39m model_pipeline\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)[:, \u001b[38;5;241m1\u001b[39m]\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:469\u001b[0m, in \u001b[0;36mPipeline.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 426\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model.\u001b[39;00m\n\u001b[0;32m 427\u001b[0m \n\u001b[0;32m 428\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform the\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 466\u001b[0m \u001b[38;5;124;03m Pipeline with fitted steps.\u001b[39;00m\n\u001b[0;32m 467\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 468\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 469\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 470\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n\u001b[0;32m 471\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassthrough\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:968\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 965\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_transformers()\n\u001b[0;32m 966\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(X)\n\u001b[1;32m--> 968\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_column_callables\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 969\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_remainder(X)\n\u001b[0;32m 971\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _routing_enabled():\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:536\u001b[0m, in \u001b[0;36mColumnTransformer._validate_column_callables\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 534\u001b[0m columns \u001b[38;5;241m=\u001b[39m columns(X)\n\u001b[0;32m 535\u001b[0m all_columns\u001b[38;5;241m.\u001b[39mappend(columns)\n\u001b[1;32m--> 536\u001b[0m transformer_to_input_indices[name] \u001b[38;5;241m=\u001b[39m \u001b[43m_get_column_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_columns \u001b[38;5;241m=\u001b[39m all_columns\n\u001b[0;32m 539\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_transformer_to_input_indices \u001b[38;5;241m=\u001b[39m transformer_to_input_indices\n",
|
||
"File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\sklearn\\utils\\_indexing.py:340\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m 338\u001b[0m all_columns \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m:\n\u001b[1;32m--> 340\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 341\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSpecifying the columns using strings is only supported for dataframes.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 342\u001b[0m )\n\u001b[0;32m 343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 344\u001b[0m columns \u001b[38;5;241m=\u001b[39m [key]\n",
|
||
"\u001b[1;31mValueError\u001b[0m: Specifying the columns using strings is only supported for dataframes."
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn import metrics\n",
|
||
"\n",
|
||
"for model_name in class_models.keys():\n",
|
||
"    print(f\"Model: {model_name}\")\n",
|
||
"    model = class_models[model_name][\"model\"]\n",
|
||
"\n",
|
||
"    # Chain the shared preprocessing pipeline with the current estimator.\n",
|
||
"    model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
|
||
"    # Fit on the DataFrame itself rather than on X_train.values: the preprocessing\n",
|
||
"    # selects columns by name, which scikit-learn only supports for DataFrames\n",
|
||
"    # (a bare ndarray fails with: Specifying the columns using strings is only\n",
|
||
"    # supported for dataframes).\n",
|
||
"    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
|
||
"\n",
|
||
"    y_train_predict = model_pipeline.predict(X_train)\n",
|
||
"    # Second column of predict_proba (class index 1), thresholded at 0.5\n",
|
||
"    # to obtain hard 0/1 predictions for the test set.\n",
|
||
"    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
|
||
"    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
|
||
"\n",
|
||
"    # Keep the fitted pipeline and its raw test outputs next to the model entry.\n",
|
||
"    class_models[model_name][\"pipeline\"] = model_pipeline\n",
|
||
"    class_models[model_name][\"probs\"] = y_test_probs\n",
|
||
"    class_models[model_name][\"preds\"] = y_test_predict\n",
|
||
"\n",
|
||
"    # Train metrics use predict() on X_train; test metrics use the thresholded\n",
|
||
"    # probabilities, except ROC AUC which is computed from the probabilities.\n",
|
||
"    class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
|
||
"        y_train, y_train_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
|
||
"        y_test, y_test_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
|
||
"        y_train, y_train_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
|
||
"        y_test, y_test_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
|
||
"        y_train, y_train_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
|
||
"        y_test, y_test_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
|
||
"        y_test, y_test_probs\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
|
||
"    class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
|
||
"    class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
|
||
"        y_test, y_test_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
|
||
"        y_test, y_test_predict\n",
|
||
"    )\n",
|
||
"    class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
|
||
"        y_test, y_test_predict\n",
|
||
"    )"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "kernel",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|