2615 lines
93 KiB
Plaintext
2615 lines
93 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 112,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>grade</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>7129300520</td>\n",
|
||
" <td>20141013T000000</td>\n",
|
||
" <td>221900.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>5650</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1955</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98178</td>\n",
|
||
" <td>47.5112</td>\n",
|
||
" <td>-122.257</td>\n",
|
||
" <td>1340</td>\n",
|
||
" <td>5650</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>6414100192</td>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>538000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>2570</td>\n",
|
||
" <td>7242</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>400</td>\n",
|
||
" <td>1951</td>\n",
|
||
" <td>1991</td>\n",
|
||
" <td>98125</td>\n",
|
||
" <td>47.7210</td>\n",
|
||
" <td>-122.319</td>\n",
|
||
" <td>1690</td>\n",
|
||
" <td>7639</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>5631500400</td>\n",
|
||
" <td>20150225T000000</td>\n",
|
||
" <td>180000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>10000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1933</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98028</td>\n",
|
||
" <td>47.7379</td>\n",
|
||
" <td>-122.233</td>\n",
|
||
" <td>2720</td>\n",
|
||
" <td>8062</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2487200875</td>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>604000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>1960</td>\n",
|
||
" <td>5000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>910</td>\n",
|
||
" <td>1965</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98136</td>\n",
|
||
" <td>47.5208</td>\n",
|
||
" <td>-122.393</td>\n",
|
||
" <td>1360</td>\n",
|
||
" <td>5000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1954400510</td>\n",
|
||
" <td>20150218T000000</td>\n",
|
||
" <td>510000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8080</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98074</td>\n",
|
||
" <td>47.6168</td>\n",
|
||
" <td>-122.045</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>7503</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9995</th>\n",
|
||
" <td>322059264</td>\n",
|
||
" <td>20140926T000000</td>\n",
|
||
" <td>279000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>47044</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1904</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>98042</td>\n",
|
||
" <td>47.4206</td>\n",
|
||
" <td>-122.155</td>\n",
|
||
" <td>1930</td>\n",
|
||
" <td>12139</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9996</th>\n",
|
||
" <td>5557500270</td>\n",
|
||
" <td>20150209T000000</td>\n",
|
||
" <td>262000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9579</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1100</td>\n",
|
||
" <td>600</td>\n",
|
||
" <td>1962</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98023</td>\n",
|
||
" <td>47.3209</td>\n",
|
||
" <td>-122.338</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9628</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9997</th>\n",
|
||
" <td>9164100125</td>\n",
|
||
" <td>20140807T000000</td>\n",
|
||
" <td>533000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>4750</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1919</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98117</td>\n",
|
||
" <td>47.6824</td>\n",
|
||
" <td>-122.389</td>\n",
|
||
" <td>1320</td>\n",
|
||
" <td>4750</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9998</th>\n",
|
||
" <td>7370600045</td>\n",
|
||
" <td>20150402T000000</td>\n",
|
||
" <td>640000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8100</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1950</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98177</td>\n",
|
||
" <td>47.7212</td>\n",
|
||
" <td>-122.364</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>7750</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9999</th>\n",
|
||
" <td>8594400060</td>\n",
|
||
" <td>20140609T000000</td>\n",
|
||
" <td>285000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>35127</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98092</td>\n",
|
||
" <td>47.3025</td>\n",
|
||
" <td>-122.067</td>\n",
|
||
" <td>1820</td>\n",
|
||
" <td>35166</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>10000 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
|
||
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
|
||
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
|
||
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
|
||
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"9995 322059264 20140926T000000 279000.0 2 1.00 1020 \n",
|
||
"9996 5557500270 20150209T000000 262000.0 3 1.50 1700 \n",
|
||
"9997 9164100125 20140807T000000 533000.0 4 1.00 1550 \n",
|
||
"9998 7370600045 20150402T000000 640000.0 3 1.75 1680 \n",
|
||
"9999 8594400060 20140609T000000 285000.0 3 2.25 1680 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view ... grade sqft_above \\\n",
|
||
"0 5650 1.0 0 0 ... 7 1180 \n",
|
||
"1 7242 2.0 0 0 ... 7 2170 \n",
|
||
"2 10000 1.0 0 0 ... 6 770 \n",
|
||
"3 5000 1.0 0 0 ... 7 1050 \n",
|
||
"4 8080 1.0 0 0 ... 8 1680 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"9995 47044 1.0 0 0 ... 7 1020 \n",
|
||
"9996 9579 1.0 0 0 ... 7 1100 \n",
|
||
"9997 4750 1.5 0 0 ... 7 1550 \n",
|
||
"9998 8100 1.0 0 2 ... 8 1680 \n",
|
||
"9999 35127 2.0 0 0 ... 7 1680 \n",
|
||
"\n",
|
||
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
||
"0 0 1955 0 98178 47.5112 -122.257 \n",
|
||
"1 400 1951 1991 98125 47.7210 -122.319 \n",
|
||
"2 0 1933 0 98028 47.7379 -122.233 \n",
|
||
"3 910 1965 0 98136 47.5208 -122.393 \n",
|
||
"4 0 1987 0 98074 47.6168 -122.045 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"9995 0 1904 1958 98042 47.4206 -122.155 \n",
|
||
"9996 600 1962 0 98023 47.3209 -122.338 \n",
|
||
"9997 0 1919 0 98117 47.6824 -122.389 \n",
|
||
"9998 0 1950 0 98177 47.7212 -122.364 \n",
|
||
"9999 0 1987 0 98092 47.3025 -122.067 \n",
|
||
"\n",
|
||
" sqft_living15 sqft_lot15 \n",
|
||
"0 1340 5650 \n",
|
||
"1 1690 7639 \n",
|
||
"2 2720 8062 \n",
|
||
"3 1360 5000 \n",
|
||
"4 1800 7503 \n",
|
||
"... ... ... \n",
|
||
"9995 1930 12139 \n",
|
||
"9996 1700 9628 \n",
|
||
"9997 1320 4750 \n",
|
||
"9998 1880 7750 \n",
|
||
"9999 1820 35166 \n",
|
||
"\n",
|
||
"[10000 rows x 21 columns]"
|
||
]
|
||
},
|
||
"execution_count": 112,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn import set_config\n",
|
||
"\n",
|
||
"df = pd.read_csv(\"data/house_data.csv\", sep=\",\", nrows=10000)\n",
|
||
"df.dropna()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Устраняем выбросы в колонке цены и добавляем колонку с категориями цены"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 113,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>9980</th>\n",
|
||
" <td>6840700036</td>\n",
|
||
" <td>20140728T000000</td>\n",
|
||
" <td>497000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>3325</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1918</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98122</td>\n",
|
||
" <td>47.6102</td>\n",
|
||
" <td>-122.299</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>4800</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9981</th>\n",
|
||
" <td>1824069083</td>\n",
|
||
" <td>20150429T000000</td>\n",
|
||
" <td>835000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>3060</td>\n",
|
||
" <td>30166</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3060</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1959</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98027</td>\n",
|
||
" <td>47.5656</td>\n",
|
||
" <td>-122.093</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>19602</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9982</th>\n",
|
||
" <td>1836980240</td>\n",
|
||
" <td>20141015T000000</td>\n",
|
||
" <td>730000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.75</td>\n",
|
||
" <td>2920</td>\n",
|
||
" <td>4500</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2920</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1999</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98006</td>\n",
|
||
" <td>47.5646</td>\n",
|
||
" <td>-122.124</td>\n",
|
||
" <td>2920</td>\n",
|
||
" <td>4505</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9983</th>\n",
|
||
" <td>3528900160</td>\n",
|
||
" <td>20141001T000000</td>\n",
|
||
" <td>655000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1370</td>\n",
|
||
" <td>5250</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1070</td>\n",
|
||
" <td>300</td>\n",
|
||
" <td>1939</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98109</td>\n",
|
||
" <td>47.6421</td>\n",
|
||
" <td>-122.348</td>\n",
|
||
" <td>2410</td>\n",
|
||
" <td>4200</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9984</th>\n",
|
||
" <td>1442800060</td>\n",
|
||
" <td>20141120T000000</td>\n",
|
||
" <td>205000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1870</td>\n",
|
||
" <td>3118</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1870</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1993</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98038</td>\n",
|
||
" <td>47.3739</td>\n",
|
||
" <td>-122.056</td>\n",
|
||
" <td>1580</td>\n",
|
||
" <td>3601</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9985</th>\n",
|
||
" <td>8722100030</td>\n",
|
||
" <td>20150407T000000</td>\n",
|
||
" <td>632750.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>4800</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1918</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98112</td>\n",
|
||
" <td>47.6388</td>\n",
|
||
" <td>-122.302</td>\n",
|
||
" <td>1950</td>\n",
|
||
" <td>4800</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9986</th>\n",
|
||
" <td>1723049624</td>\n",
|
||
" <td>20140512T000000</td>\n",
|
||
" <td>330000.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>2100</td>\n",
|
||
" <td>7715</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1250</td>\n",
|
||
" <td>850</td>\n",
|
||
" <td>2013</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98168</td>\n",
|
||
" <td>47.4866</td>\n",
|
||
" <td>-122.319</td>\n",
|
||
" <td>2100</td>\n",
|
||
" <td>7959</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9987</th>\n",
|
||
" <td>4040400200</td>\n",
|
||
" <td>20141007T000000</td>\n",
|
||
" <td>527500.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>2530</td>\n",
|
||
" <td>8250</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2530</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1961</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98007</td>\n",
|
||
" <td>47.6117</td>\n",
|
||
" <td>-122.134</td>\n",
|
||
" <td>2020</td>\n",
|
||
" <td>8250</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9988</th>\n",
|
||
" <td>8691391090</td>\n",
|
||
" <td>20140508T000000</td>\n",
|
||
" <td>716500.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>3290</td>\n",
|
||
" <td>6465</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3290</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2002</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98075</td>\n",
|
||
" <td>47.5981</td>\n",
|
||
" <td>-121.976</td>\n",
|
||
" <td>3100</td>\n",
|
||
" <td>5929</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9989</th>\n",
|
||
" <td>7853302190</td>\n",
|
||
" <td>20141217T000000</td>\n",
|
||
" <td>388500.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1890</td>\n",
|
||
" <td>5395</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1890</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98065</td>\n",
|
||
" <td>47.5415</td>\n",
|
||
" <td>-121.883</td>\n",
|
||
" <td>2060</td>\n",
|
||
" <td>5395</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9990</th>\n",
|
||
" <td>3260000700</td>\n",
|
||
" <td>20140904T000000</td>\n",
|
||
" <td>530000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>7770</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1967</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98005</td>\n",
|
||
" <td>47.6028</td>\n",
|
||
" <td>-122.167</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>7770</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9991</th>\n",
|
||
" <td>5126300510</td>\n",
|
||
" <td>20150108T000000</td>\n",
|
||
" <td>419000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>4517</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2002</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98059</td>\n",
|
||
" <td>47.4819</td>\n",
|
||
" <td>-122.140</td>\n",
|
||
" <td>2610</td>\n",
|
||
" <td>4770</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9992</th>\n",
|
||
" <td>7199330370</td>\n",
|
||
" <td>20150309T000000</td>\n",
|
||
" <td>385000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1200</td>\n",
|
||
" <td>7360</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1200</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1978</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98052</td>\n",
|
||
" <td>47.6979</td>\n",
|
||
" <td>-122.130</td>\n",
|
||
" <td>1200</td>\n",
|
||
" <td>7500</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9993</th>\n",
|
||
" <td>1854900240</td>\n",
|
||
" <td>20140528T000000</td>\n",
|
||
" <td>655000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2990</td>\n",
|
||
" <td>5669</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2990</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2003</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98074</td>\n",
|
||
" <td>47.6119</td>\n",
|
||
" <td>-122.011</td>\n",
|
||
" <td>3110</td>\n",
|
||
" <td>5058</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9994</th>\n",
|
||
" <td>6738700335</td>\n",
|
||
" <td>20140701T000000</td>\n",
|
||
" <td>1127312.5</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.75</td>\n",
|
||
" <td>3770</td>\n",
|
||
" <td>10900</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3070</td>\n",
|
||
" <td>700</td>\n",
|
||
" <td>1924</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98144</td>\n",
|
||
" <td>47.5849</td>\n",
|
||
" <td>-122.290</td>\n",
|
||
" <td>3000</td>\n",
|
||
" <td>5000</td>\n",
|
||
" <td>very_high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9995</th>\n",
|
||
" <td>322059264</td>\n",
|
||
" <td>20140926T000000</td>\n",
|
||
" <td>279000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>47044</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1904</td>\n",
|
||
" <td>1958</td>\n",
|
||
" <td>98042</td>\n",
|
||
" <td>47.4206</td>\n",
|
||
" <td>-122.155</td>\n",
|
||
" <td>1930</td>\n",
|
||
" <td>12139</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9996</th>\n",
|
||
" <td>5557500270</td>\n",
|
||
" <td>20150209T000000</td>\n",
|
||
" <td>262000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9579</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1100</td>\n",
|
||
" <td>600</td>\n",
|
||
" <td>1962</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98023</td>\n",
|
||
" <td>47.3209</td>\n",
|
||
" <td>-122.338</td>\n",
|
||
" <td>1700</td>\n",
|
||
" <td>9628</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9997</th>\n",
|
||
" <td>9164100125</td>\n",
|
||
" <td>20140807T000000</td>\n",
|
||
" <td>533000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>4750</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1919</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98117</td>\n",
|
||
" <td>47.6824</td>\n",
|
||
" <td>-122.389</td>\n",
|
||
" <td>1320</td>\n",
|
||
" <td>4750</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9998</th>\n",
|
||
" <td>7370600045</td>\n",
|
||
" <td>20150402T000000</td>\n",
|
||
" <td>640000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8100</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1950</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98177</td>\n",
|
||
" <td>47.7212</td>\n",
|
||
" <td>-122.364</td>\n",
|
||
" <td>1880</td>\n",
|
||
" <td>7750</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9999</th>\n",
|
||
" <td>8594400060</td>\n",
|
||
" <td>20140609T000000</td>\n",
|
||
" <td>285000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>35127</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98092</td>\n",
|
||
" <td>47.3025</td>\n",
|
||
" <td>-122.067</td>\n",
|
||
" <td>1820</td>\n",
|
||
" <td>35166</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>20 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms \\\n",
|
||
"9980 6840700036 20140728T000000 497000.0 2 1.00 \n",
|
||
"9981 1824069083 20150429T000000 835000.0 3 1.00 \n",
|
||
"9982 1836980240 20141015T000000 730000.0 4 2.75 \n",
|
||
"9983 3528900160 20141001T000000 655000.0 3 1.00 \n",
|
||
"9984 1442800060 20141120T000000 205000.0 3 2.50 \n",
|
||
"9985 8722100030 20150407T000000 632750.0 4 2.00 \n",
|
||
"9986 1723049624 20140512T000000 330000.0 5 3.00 \n",
|
||
"9987 4040400200 20141007T000000 527500.0 5 2.25 \n",
|
||
"9988 8691391090 20140508T000000 716500.0 4 2.50 \n",
|
||
"9989 7853302190 20141217T000000 388500.0 4 2.50 \n",
|
||
"9990 3260000700 20140904T000000 530000.0 3 1.75 \n",
|
||
"9991 5126300510 20150108T000000 419000.0 3 2.50 \n",
|
||
"9992 7199330370 20150309T000000 385000.0 3 1.75 \n",
|
||
"9993 1854900240 20140528T000000 655000.0 4 2.50 \n",
|
||
"9994 6738700335 20140701T000000 1127312.5 4 2.75 \n",
|
||
"9995 322059264 20140926T000000 279000.0 2 1.00 \n",
|
||
"9996 5557500270 20150209T000000 262000.0 3 1.50 \n",
|
||
"9997 9164100125 20140807T000000 533000.0 4 1.00 \n",
|
||
"9998 7370600045 20150402T000000 640000.0 3 1.75 \n",
|
||
"9999 8594400060 20140609T000000 285000.0 3 2.25 \n",
|
||
"\n",
|
||
" sqft_living sqft_lot floors waterfront view ... sqft_above \\\n",
|
||
"9980 770 3325 1.0 0 0 ... 770 \n",
|
||
"9981 3060 30166 1.0 0 0 ... 3060 \n",
|
||
"9982 2920 4500 2.0 0 0 ... 2920 \n",
|
||
"9983 1370 5250 1.0 0 0 ... 1070 \n",
|
||
"9984 1870 3118 2.0 0 0 ... 1870 \n",
|
||
"9985 1800 4800 1.5 0 0 ... 1800 \n",
|
||
"9986 2100 7715 1.0 0 0 ... 1250 \n",
|
||
"9987 2530 8250 2.0 0 0 ... 2530 \n",
|
||
"9988 3290 6465 2.0 0 0 ... 3290 \n",
|
||
"9989 1890 5395 2.0 0 0 ... 1890 \n",
|
||
"9990 1680 7770 1.0 0 0 ... 1680 \n",
|
||
"9991 2170 4517 2.0 0 0 ... 2170 \n",
|
||
"9992 1200 7360 1.0 0 0 ... 1200 \n",
|
||
"9993 2990 5669 2.0 0 0 ... 2990 \n",
|
||
"9994 3770 10900 2.0 0 2 ... 3070 \n",
|
||
"9995 1020 47044 1.0 0 0 ... 1020 \n",
|
||
"9996 1700 9579 1.0 0 0 ... 1100 \n",
|
||
"9997 1550 4750 1.5 0 0 ... 1550 \n",
|
||
"9998 1680 8100 1.0 0 2 ... 1680 \n",
|
||
"9999 1680 35127 2.0 0 0 ... 1680 \n",
|
||
"\n",
|
||
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
||
"9980 0 1918 0 98122 47.6102 -122.299 \n",
|
||
"9981 0 1959 0 98027 47.5656 -122.093 \n",
|
||
"9982 0 1999 0 98006 47.5646 -122.124 \n",
|
||
"9983 300 1939 0 98109 47.6421 -122.348 \n",
|
||
"9984 0 1993 0 98038 47.3739 -122.056 \n",
|
||
"9985 0 1918 0 98112 47.6388 -122.302 \n",
|
||
"9986 850 2013 0 98168 47.4866 -122.319 \n",
|
||
"9987 0 1961 0 98007 47.6117 -122.134 \n",
|
||
"9988 0 2002 0 98075 47.5981 -121.976 \n",
|
||
"9989 0 2006 0 98065 47.5415 -121.883 \n",
|
||
"9990 0 1967 0 98005 47.6028 -122.167 \n",
|
||
"9991 0 2002 0 98059 47.4819 -122.140 \n",
|
||
"9992 0 1978 0 98052 47.6979 -122.130 \n",
|
||
"9993 0 2003 0 98074 47.6119 -122.011 \n",
|
||
"9994 700 1924 0 98144 47.5849 -122.290 \n",
|
||
"9995 0 1904 1958 98042 47.4206 -122.155 \n",
|
||
"9996 600 1962 0 98023 47.3209 -122.338 \n",
|
||
"9997 0 1919 0 98117 47.6824 -122.389 \n",
|
||
"9998 0 1950 0 98177 47.7212 -122.364 \n",
|
||
"9999 0 1987 0 98092 47.3025 -122.067 \n",
|
||
"\n",
|
||
" sqft_living15 sqft_lot15 price_category \n",
|
||
"9980 960 4800 middle \n",
|
||
"9981 1880 19602 high \n",
|
||
"9982 2920 4505 high \n",
|
||
"9983 2410 4200 high \n",
|
||
"9984 1580 3601 low \n",
|
||
"9985 1950 4800 high \n",
|
||
"9986 2100 7959 low \n",
|
||
"9987 2020 8250 middle \n",
|
||
"9988 3100 5929 high \n",
|
||
"9989 2060 5395 middle \n",
|
||
"9990 1880 7770 middle \n",
|
||
"9991 2610 4770 middle \n",
|
||
"9992 1200 7500 middle \n",
|
||
"9993 3110 5058 high \n",
|
||
"9994 3000 5000 very_high \n",
|
||
"9995 1930 12139 low \n",
|
||
"9996 1700 9628 low \n",
|
||
"9997 1320 4750 middle \n",
|
||
"9998 1880 7750 high \n",
|
||
"9999 1820 35166 low \n",
|
||
"\n",
|
||
"[20 rows x 22 columns]"
|
||
]
|
||
},
|
||
"execution_count": 113,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"q1 = df['price'].quantile(0.25) # Находим 1-й квартиль (Q1)\n",
|
||
"q3 = df['price'].quantile(0.75) # Находим 3-й квартиль (Q3)\n",
|
||
"iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n",
|
||
"\n",
|
||
"# Определяем границы для выбросов\n",
|
||
"lower_bound = q1 - 1.5 * iqr # Нижняя граница\n",
|
||
"upper_bound = q3 + 1.5 * iqr # Верхняя граница\n",
|
||
"\n",
|
||
"# Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n",
|
||
"df['price'] = df['price'].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
|
||
"\n",
|
||
"# Добавляем столбец с категорями цены\n",
|
||
"df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)\n",
|
||
"df.tail(20)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Бизнес-цели\n",
|
||
"1. Прогноз класса цены недвижимости (Классификация)\n",
|
||
"2. Оценка состояния недвижимости (Регрессия)\n",
|
||
"\n",
|
||
"### Определение достижимого уровня качества модели для первой задачи\n",
|
||
"#### Разделение набора данных на обучающую и тестовую выборки (80/20) для задачи классификации (целевой признак — price)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 114,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'X_train'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>9843</th>\n",
|
||
" <td>3260000340</td>\n",
|
||
" <td>20140622T000000</td>\n",
|
||
" <td>732600.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2130</td>\n",
|
||
" <td>7300</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1230</td>\n",
|
||
" <td>900</td>\n",
|
||
" <td>1963</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98005</td>\n",
|
||
" <td>47.6050</td>\n",
|
||
" <td>-122.167</td>\n",
|
||
" <td>2130</td>\n",
|
||
" <td>7560</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9623</th>\n",
|
||
" <td>9828702055</td>\n",
|
||
" <td>20140508T000000</td>\n",
|
||
" <td>358000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>1808</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1993</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98122</td>\n",
|
||
" <td>47.6183</td>\n",
|
||
" <td>-122.298</td>\n",
|
||
" <td>1290</td>\n",
|
||
" <td>1668</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3095</th>\n",
|
||
" <td>3438500625</td>\n",
|
||
" <td>20140519T000000</td>\n",
|
||
" <td>210000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1080</td>\n",
|
||
" <td>21043</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1080</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1942</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98106</td>\n",
|
||
" <td>47.5515</td>\n",
|
||
" <td>-122.357</td>\n",
|
||
" <td>1380</td>\n",
|
||
" <td>7620</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>411</th>\n",
|
||
" <td>2422029094</td>\n",
|
||
" <td>20140716T000000</td>\n",
|
||
" <td>517534.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>833</td>\n",
|
||
" <td>143947</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>833</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98070</td>\n",
|
||
" <td>47.3889</td>\n",
|
||
" <td>-122.482</td>\n",
|
||
" <td>1380</td>\n",
|
||
" <td>143947</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3060</th>\n",
|
||
" <td>7462900015</td>\n",
|
||
" <td>20150108T000000</td>\n",
|
||
" <td>387000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>1760</td>\n",
|
||
" <td>45133</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1760</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1984</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98065</td>\n",
|
||
" <td>47.5124</td>\n",
|
||
" <td>-121.866</td>\n",
|
||
" <td>1910</td>\n",
|
||
" <td>51773</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1750</th>\n",
|
||
" <td>2787720140</td>\n",
|
||
" <td>20150407T000000</td>\n",
|
||
" <td>416000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>1790</td>\n",
|
||
" <td>11542</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1190</td>\n",
|
||
" <td>600</td>\n",
|
||
" <td>1969</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98059</td>\n",
|
||
" <td>47.5124</td>\n",
|
||
" <td>-122.160</td>\n",
|
||
" <td>1790</td>\n",
|
||
" <td>9131</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2354</th>\n",
|
||
" <td>6192400400</td>\n",
|
||
" <td>20140728T000000</td>\n",
|
||
" <td>775000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>3090</td>\n",
|
||
" <td>7112</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>3090</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98052</td>\n",
|
||
" <td>47.7050</td>\n",
|
||
" <td>-122.118</td>\n",
|
||
" <td>3050</td>\n",
|
||
" <td>6000</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>857</th>\n",
|
||
" <td>2296500036</td>\n",
|
||
" <td>20150310T000000</td>\n",
|
||
" <td>450000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.75</td>\n",
|
||
" <td>2980</td>\n",
|
||
" <td>13260</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>1979</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98056</td>\n",
|
||
" <td>47.5152</td>\n",
|
||
" <td>-122.197</td>\n",
|
||
" <td>1920</td>\n",
|
||
" <td>10731</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6181</th>\n",
|
||
" <td>2787310130</td>\n",
|
||
" <td>20141212T000000</td>\n",
|
||
" <td>289950.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2090</td>\n",
|
||
" <td>7416</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>1040</td>\n",
|
||
" <td>1970</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98031</td>\n",
|
||
" <td>47.4107</td>\n",
|
||
" <td>-122.179</td>\n",
|
||
" <td>1710</td>\n",
|
||
" <td>7527</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3141</th>\n",
|
||
" <td>8567300110</td>\n",
|
||
" <td>20140604T000000</td>\n",
|
||
" <td>485000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2340</td>\n",
|
||
" <td>59058</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2340</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1985</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98038</td>\n",
|
||
" <td>47.4052</td>\n",
|
||
" <td>-122.028</td>\n",
|
||
" <td>2700</td>\n",
|
||
" <td>37263</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8000 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||
"9843 3260000340 20140622T000000 732600.0 4 2.50 2130 \n",
|
||
"9623 9828702055 20140508T000000 358000.0 2 1.50 960 \n",
|
||
"3095 3438500625 20140519T000000 210000.0 3 1.00 1080 \n",
|
||
"411 2422029094 20140716T000000 517534.0 2 1.00 833 \n",
|
||
"3060 7462900015 20150108T000000 387000.0 3 2.25 1760 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"1750 2787720140 20150407T000000 416000.0 3 2.50 1790 \n",
|
||
"2354 6192400400 20140728T000000 775000.0 4 2.50 3090 \n",
|
||
"857 2296500036 20150310T000000 450000.0 4 2.75 2980 \n",
|
||
"6181 2787310130 20141212T000000 289950.0 4 1.75 2090 \n",
|
||
"3141 8567300110 20140604T000000 485000.0 3 2.50 2340 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
|
||
"9843 7300 1.0 0 0 ... 1230 900 \n",
|
||
"9623 1808 2.0 0 0 ... 960 0 \n",
|
||
"3095 21043 1.0 0 0 ... 1080 0 \n",
|
||
"411 143947 1.0 0 0 ... 833 0 \n",
|
||
"3060 45133 2.0 0 0 ... 1760 0 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"1750 11542 1.0 0 0 ... 1190 600 \n",
|
||
"2354 7112 2.0 0 0 ... 3090 0 \n",
|
||
"857 13260 1.0 0 0 ... 1800 1180 \n",
|
||
"6181 7416 1.0 0 0 ... 1050 1040 \n",
|
||
"3141 59058 1.0 0 0 ... 2340 0 \n",
|
||
"\n",
|
||
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
||
"9843 1963 0 98005 47.6050 -122.167 2130 \n",
|
||
"9623 1993 0 98122 47.6183 -122.298 1290 \n",
|
||
"3095 1942 0 98106 47.5515 -122.357 1380 \n",
|
||
"411 2006 0 98070 47.3889 -122.482 1380 \n",
|
||
"3060 1984 0 98065 47.5124 -121.866 1910 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"1750 1969 0 98059 47.5124 -122.160 1790 \n",
|
||
"2354 2001 0 98052 47.7050 -122.118 3050 \n",
|
||
"857 1979 0 98056 47.5152 -122.197 1920 \n",
|
||
"6181 1970 0 98031 47.4107 -122.179 1710 \n",
|
||
"3141 1985 0 98038 47.4052 -122.028 2700 \n",
|
||
"\n",
|
||
" sqft_lot15 price_category \n",
|
||
"9843 7560 high \n",
|
||
"9623 1668 middle \n",
|
||
"3095 7620 low \n",
|
||
"411 143947 middle \n",
|
||
"3060 51773 middle \n",
|
||
"... ... ... \n",
|
||
"1750 9131 middle \n",
|
||
"2354 6000 high \n",
|
||
"857 10731 middle \n",
|
||
"6181 7527 low \n",
|
||
"3141 37263 middle \n",
|
||
"\n",
|
||
"[8000 rows x 22 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'y_train'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>9843</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9623</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3095</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>411</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3060</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1750</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2354</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>857</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6181</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3141</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8000 rows × 1 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" price_category\n",
|
||
"9843 high\n",
|
||
"9623 middle\n",
|
||
"3095 low\n",
|
||
"411 middle\n",
|
||
"3060 middle\n",
|
||
"... ...\n",
|
||
"1750 middle\n",
|
||
"2354 high\n",
|
||
"857 middle\n",
|
||
"6181 low\n",
|
||
"3141 middle\n",
|
||
"\n",
|
||
"[8000 rows x 1 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'X_test'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>5341</th>\n",
|
||
" <td>6632900574</td>\n",
|
||
" <td>20150225T000000</td>\n",
|
||
" <td>595000.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>2980</td>\n",
|
||
" <td>10064</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>1300</td>\n",
|
||
" <td>1940</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98155</td>\n",
|
||
" <td>47.7372</td>\n",
|
||
" <td>-122.316</td>\n",
|
||
" <td>1590</td>\n",
|
||
" <td>7800</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4384</th>\n",
|
||
" <td>2423029245</td>\n",
|
||
" <td>20140617T000000</td>\n",
|
||
" <td>550000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2240</td>\n",
|
||
" <td>78225</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2240</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1976</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98070</td>\n",
|
||
" <td>47.4638</td>\n",
|
||
" <td>-122.484</td>\n",
|
||
" <td>2030</td>\n",
|
||
" <td>202554</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5795</th>\n",
|
||
" <td>2473370050</td>\n",
|
||
" <td>20140604T000000</td>\n",
|
||
" <td>327500.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>1650</td>\n",
|
||
" <td>7800</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1650</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1968</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98058</td>\n",
|
||
" <td>47.4507</td>\n",
|
||
" <td>-122.139</td>\n",
|
||
" <td>1750</td>\n",
|
||
" <td>10400</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4956</th>\n",
|
||
" <td>9528104985</td>\n",
|
||
" <td>20141104T000000</td>\n",
|
||
" <td>611000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1270</td>\n",
|
||
" <td>5100</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1100</td>\n",
|
||
" <td>170</td>\n",
|
||
" <td>1900</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98115</td>\n",
|
||
" <td>47.6771</td>\n",
|
||
" <td>-122.328</td>\n",
|
||
" <td>1670</td>\n",
|
||
" <td>3900</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7723</th>\n",
|
||
" <td>3972900025</td>\n",
|
||
" <td>20150313T000000</td>\n",
|
||
" <td>499000.0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2400</td>\n",
|
||
" <td>7500</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1400</td>\n",
|
||
" <td>1000</td>\n",
|
||
" <td>1975</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98155</td>\n",
|
||
" <td>47.7661</td>\n",
|
||
" <td>-122.313</td>\n",
|
||
" <td>1980</td>\n",
|
||
" <td>7500</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8517</th>\n",
|
||
" <td>3876600120</td>\n",
|
||
" <td>20150422T000000</td>\n",
|
||
" <td>265000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>1780</td>\n",
|
||
" <td>10196</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1270</td>\n",
|
||
" <td>510</td>\n",
|
||
" <td>1967</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98001</td>\n",
|
||
" <td>47.3375</td>\n",
|
||
" <td>-122.291</td>\n",
|
||
" <td>1320</td>\n",
|
||
" <td>7875</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6914</th>\n",
|
||
" <td>6821600005</td>\n",
|
||
" <td>20150403T000000</td>\n",
|
||
" <td>710000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2120</td>\n",
|
||
" <td>5400</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1060</td>\n",
|
||
" <td>1060</td>\n",
|
||
" <td>1941</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98199</td>\n",
|
||
" <td>47.6501</td>\n",
|
||
" <td>-122.395</td>\n",
|
||
" <td>2052</td>\n",
|
||
" <td>6000</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4499</th>\n",
|
||
" <td>2767603931</td>\n",
|
||
" <td>20140818T000000</td>\n",
|
||
" <td>469000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3.25</td>\n",
|
||
" <td>1370</td>\n",
|
||
" <td>1194</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1370</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2004</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98107</td>\n",
|
||
" <td>47.6718</td>\n",
|
||
" <td>-122.388</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>2678</td>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8651</th>\n",
|
||
" <td>8802400411</td>\n",
|
||
" <td>20140619T000000</td>\n",
|
||
" <td>249000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>8498</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1959</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98031</td>\n",
|
||
" <td>47.4043</td>\n",
|
||
" <td>-122.202</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>8498</td>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4234</th>\n",
|
||
" <td>5452800735</td>\n",
|
||
" <td>20140722T000000</td>\n",
|
||
" <td>780000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.50</td>\n",
|
||
" <td>2270</td>\n",
|
||
" <td>13449</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1310</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>1975</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98040</td>\n",
|
||
" <td>47.5416</td>\n",
|
||
" <td>-122.232</td>\n",
|
||
" <td>2810</td>\n",
|
||
" <td>13475</td>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2000 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||
"5341 6632900574 20150225T000000 595000.0 5 3.00 2980 \n",
|
||
"4384 2423029245 20140617T000000 550000.0 3 1.75 2240 \n",
|
||
"5795 2473370050 20140604T000000 327500.0 4 1.75 1650 \n",
|
||
"4956 9528104985 20141104T000000 611000.0 2 1.00 1270 \n",
|
||
"7723 3972900025 20150313T000000 499000.0 6 1.75 2400 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"8517 3876600120 20150422T000000 265000.0 3 1.50 1780 \n",
|
||
"6914 6821600005 20150403T000000 710000.0 4 1.75 2120 \n",
|
||
"4499 2767603931 20140818T000000 469000.0 3 3.25 1370 \n",
|
||
"8651 8802400411 20140619T000000 249000.0 3 1.00 1050 \n",
|
||
"4234 5452800735 20140722T000000 780000.0 4 2.50 2270 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view ... sqft_above sqft_basement \\\n",
|
||
"5341 10064 1.0 0 0 ... 1680 1300 \n",
|
||
"4384 78225 2.0 0 0 ... 2240 0 \n",
|
||
"5795 7800 1.0 0 0 ... 1650 0 \n",
|
||
"4956 5100 1.0 0 0 ... 1100 170 \n",
|
||
"7723 7500 1.5 0 0 ... 1400 1000 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"8517 10196 1.0 0 0 ... 1270 510 \n",
|
||
"6914 5400 1.0 0 0 ... 1060 1060 \n",
|
||
"4499 1194 3.0 0 0 ... 1370 0 \n",
|
||
"8651 8498 1.0 0 0 ... 1050 0 \n",
|
||
"4234 13449 1.0 0 0 ... 1310 960 \n",
|
||
"\n",
|
||
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
||
"5341 1940 0 98155 47.7372 -122.316 1590 \n",
|
||
"4384 1976 0 98070 47.4638 -122.484 2030 \n",
|
||
"5795 1968 0 98058 47.4507 -122.139 1750 \n",
|
||
"4956 1900 0 98115 47.6771 -122.328 1670 \n",
|
||
"7723 1975 0 98155 47.7661 -122.313 1980 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"8517 1967 0 98001 47.3375 -122.291 1320 \n",
|
||
"6914 1941 0 98199 47.6501 -122.395 2052 \n",
|
||
"4499 2004 0 98107 47.6718 -122.388 1800 \n",
|
||
"8651 1959 0 98031 47.4043 -122.202 1050 \n",
|
||
"4234 1975 0 98040 47.5416 -122.232 2810 \n",
|
||
"\n",
|
||
" sqft_lot15 price_category \n",
|
||
"5341 7800 middle \n",
|
||
"4384 202554 middle \n",
|
||
"5795 10400 low \n",
|
||
"4956 3900 high \n",
|
||
"7723 7500 middle \n",
|
||
"... ... ... \n",
|
||
"8517 7875 low \n",
|
||
"6914 6000 high \n",
|
||
"4499 2678 middle \n",
|
||
"8651 8498 low \n",
|
||
"4234 13475 high \n",
|
||
"\n",
|
||
"[2000 rows x 22 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'y_test'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>5341</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4384</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5795</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4956</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7723</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8517</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6914</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4499</th>\n",
|
||
" <td>middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8651</th>\n",
|
||
" <td>low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4234</th>\n",
|
||
" <td>high</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2000 rows × 1 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" price_category\n",
|
||
"5341 middle\n",
|
||
"4384 middle\n",
|
||
"5795 low\n",
|
||
"4956 high\n",
|
||
"7723 middle\n",
|
||
"... ...\n",
|
||
"8517 low\n",
|
||
"6914 high\n",
|
||
"4499 middle\n",
|
||
"8651 low\n",
|
||
"4234 high\n",
|
||
"\n",
|
||
"[2000 rows x 1 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"from typing import Tuple\n",
|
||
"import pandas as pd\n",
|
||
"from pandas import DataFrame\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"def split_stratified_into_train_val_test(\n",
|
||
" df_input,\n",
|
||
" stratify_colname=\"y\",\n",
|
||
" frac_train=0.6,\n",
|
||
" frac_val=0.15,\n",
|
||
" frac_test=0.25,\n",
|
||
" random_state=None,\n",
|
||
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
|
||
" \n",
|
||
" if frac_train + frac_val + frac_test != 1.0:\n",
|
||
" raise ValueError(\n",
|
||
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||
" % (frac_train, frac_val, frac_test)\n",
|
||
" )\n",
|
||
" if stratify_colname not in df_input.columns:\n",
|
||
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||
" X = df_input # Contains all columns.\n",
|
||
" y = df_input[\n",
|
||
" [stratify_colname]\n",
|
||
" ] # Dataframe of just the column on which to stratify.\n",
|
||
" # Split original dataframe into train and temp dataframes.\n",
|
||
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||
" )\n",
|
||
" if frac_val <= 0:\n",
|
||
" assert len(df_input) == len(df_train) + len(df_temp)\n",
|
||
" return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
|
||
" # Split the temp dataframe into val and test dataframes.\n",
|
||
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||
" df_temp,\n",
|
||
" y_temp,\n",
|
||
" stratify=y_temp,\n",
|
||
" test_size=relative_frac_test,\n",
|
||
" random_state=random_state,\n",
|
||
" )\n",
|
||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||
" return df_train, df_val, df_test, y_train, y_val, y_test\n",
|
||
"\n",
|
||
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||
" df, stratify_colname=\"price_category\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n",
|
||
")\n",
|
||
"\n",
|
||
"display(\"X_train\", X_train)\n",
|
||
"display(\"y_train\", y_train)\n",
|
||
"\n",
|
||
"display(\"X_test\", X_test)\n",
|
||
"display(\"y_test\", y_test)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Формирование конвейера\n",
|
||
"preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
|
||
"\n",
|
||
"preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
|
||
"\n",
|
||
"features_preprocessing -- трансформер для предобработки признаков\n",
|
||
"\n",
|
||
"features_engineering -- трансформер для конструирования признаков\n",
|
||
"\n",
|
||
"drop_columns -- трансформер для удаления колонок\n",
|
||
"\n",
|
||
"pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 191,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>condition</th>\n",
|
||
" <th>grade</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" <th>price_category</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3260000340</td>\n",
|
||
" <td>732600.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>2130</td>\n",
|
||
" <td>7300</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1230</td>\n",
|
||
" <td>900</td>\n",
|
||
" <td>1963</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98005</td>\n",
|
||
" <td>47.605</td>\n",
|
||
" <td>-122.167</td>\n",
|
||
" <td>2130</td>\n",
|
||
" <td>7560</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>9828702055</td>\n",
|
||
" <td>358000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.5</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>1808</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>960</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1993</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98122</td>\n",
|
||
" <td>47.6183</td>\n",
|
||
" <td>-122.298</td>\n",
|
||
" <td>1290</td>\n",
|
||
" <td>1668</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3438500625</td>\n",
|
||
" <td>210000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1080</td>\n",
|
||
" <td>21043</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>1080</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1942</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98106</td>\n",
|
||
" <td>47.5515</td>\n",
|
||
" <td>-122.357</td>\n",
|
||
" <td>1380</td>\n",
|
||
" <td>7620</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2422029094</td>\n",
|
||
" <td>517534.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>833</td>\n",
|
||
" <td>143947</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>833</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98070</td>\n",
|
||
" <td>47.3889</td>\n",
|
||
" <td>-122.482</td>\n",
|
||
" <td>1380</td>\n",
|
||
" <td>143947</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>7462900015</td>\n",
|
||
" <td>387000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>1760</td>\n",
|
||
" <td>45133</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1760</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1984</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98065</td>\n",
|
||
" <td>47.5124</td>\n",
|
||
" <td>-121.866</td>\n",
|
||
" <td>1910</td>\n",
|
||
" <td>51773</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7995</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2787720140</td>\n",
|
||
" <td>416000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>1790</td>\n",
|
||
" <td>11542</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1190</td>\n",
|
||
" <td>600</td>\n",
|
||
" <td>1969</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98059</td>\n",
|
||
" <td>47.5124</td>\n",
|
||
" <td>-122.16</td>\n",
|
||
" <td>1790</td>\n",
|
||
" <td>9131</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7996</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>6192400400</td>\n",
|
||
" <td>775000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>3090</td>\n",
|
||
" <td>7112</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>3090</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2001</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98052</td>\n",
|
||
" <td>47.705</td>\n",
|
||
" <td>-122.118</td>\n",
|
||
" <td>3050</td>\n",
|
||
" <td>6000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7997</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2296500036</td>\n",
|
||
" <td>450000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2.75</td>\n",
|
||
" <td>2980</td>\n",
|
||
" <td>13260</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>1979</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98056</td>\n",
|
||
" <td>47.5152</td>\n",
|
||
" <td>-122.197</td>\n",
|
||
" <td>1920</td>\n",
|
||
" <td>10731</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7998</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2787310130</td>\n",
|
||
" <td>289950.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.75</td>\n",
|
||
" <td>2090</td>\n",
|
||
" <td>7416</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>1040</td>\n",
|
||
" <td>1970</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98031</td>\n",
|
||
" <td>47.4107</td>\n",
|
||
" <td>-122.179</td>\n",
|
||
" <td>1710</td>\n",
|
||
" <td>7527</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7999</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>8567300110</td>\n",
|
||
" <td>485000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>2340</td>\n",
|
||
" <td>59058</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>2340</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1985</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98038</td>\n",
|
||
" <td>47.4052</td>\n",
|
||
" <td>-122.028</td>\n",
|
||
" <td>2700</td>\n",
|
||
" <td>37263</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8000 rows × 19 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
|
||
"0 3 3260000340 732600.0 4 2.5 2130 7300 \n",
|
||
"1 2 9828702055 358000.0 2 1.5 960 1808 \n",
|
||
"2 1 3438500625 210000.0 3 1.0 1080 21043 \n",
|
||
"3 2 2422029094 517534.0 2 1.0 833 143947 \n",
|
||
"4 2 7462900015 387000.0 3 2.25 1760 45133 \n",
|
||
"... .. ... ... ... ... ... ... \n",
|
||
"7995 2 2787720140 416000.0 3 2.5 1790 11542 \n",
|
||
"7996 3 6192400400 775000.0 4 2.5 3090 7112 \n",
|
||
"7997 2 2296500036 450000.0 4 2.75 2980 13260 \n",
|
||
"7998 1 2787310130 289950.0 4 1.75 2090 7416 \n",
|
||
"7999 2 8567300110 485000.0 3 2.5 2340 59058 \n",
|
||
"\n",
|
||
" condition grade sqft_above sqft_basement yr_built yr_renovated zipcode \\\n",
|
||
"0 1.0 4 7 1230 900 1963 0 \n",
|
||
"1 2.0 3 7 960 0 1993 0 \n",
|
||
"2 1.0 3 6 1080 0 1942 0 \n",
|
||
"3 1.0 3 5 833 0 2006 0 \n",
|
||
"4 2.0 3 7 1760 0 1984 0 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"7995 1.0 5 7 1190 600 1969 0 \n",
|
||
"7996 2.0 3 9 3090 0 2001 0 \n",
|
||
"7997 1.0 4 8 1800 1180 1979 0 \n",
|
||
"7998 1.0 4 7 1050 1040 1970 0 \n",
|
||
"7999 1.0 3 8 2340 0 1985 0 \n",
|
||
"\n",
|
||
" lat long sqft_living15 sqft_lot15 price_category \n",
|
||
"0 98005 47.605 -122.167 2130 7560 \n",
|
||
"1 98122 47.6183 -122.298 1290 1668 \n",
|
||
"2 98106 47.5515 -122.357 1380 7620 \n",
|
||
"3 98070 47.3889 -122.482 1380 143947 \n",
|
||
"4 98065 47.5124 -121.866 1910 51773 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"7995 98059 47.5124 -122.16 1790 9131 \n",
|
||
"7996 98052 47.705 -122.118 3050 6000 \n",
|
||
"7997 98056 47.5152 -122.197 1920 10731 \n",
|
||
"7998 98031 47.4107 -122.179 1710 7527 \n",
|
||
"7999 98038 47.4052 -122.028 2700 37263 \n",
|
||
"\n",
|
||
"[8000 rows x 19 columns]"
|
||
]
|
||
},
|
||
"execution_count": 191,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.discriminant_analysis import StandardScaler\n",
|
||
"from sklearn.impute import SimpleImputer\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.pipeline import make_pipeline\n",
|
||
"\n",
|
||
"class HousesFeatures(BaseEstimator, TransformerMixin):\n",
|
||
" def __init__(self):\n",
|
||
" pass\n",
|
||
"\n",
|
||
" def fit(self, X, y=None):\n",
|
||
" return self\n",
|
||
"\n",
|
||
"\n",
|
||
" def transform(self, X, y=None):\n",
|
||
"\n",
|
||
" def get_price_type(category) -> int:\n",
|
||
" if pd.isna(category):\n",
|
||
" return \"unknown\"\n",
|
||
" if category == 'low':\n",
|
||
" return 1\n",
|
||
" elif category == 'middle':\n",
|
||
" return 2\n",
|
||
" elif category == 'high':\n",
|
||
" return 3\n",
|
||
" elif category == 'very_high':\n",
|
||
" return 4\n",
|
||
"\n",
|
||
" # Преобразование категориальных столбцов в числовые 1/0\n",
|
||
" X[\"price_category\"] = [get_price_type(category) for category in X[\"price_category\"]]\n",
|
||
" return X\n",
|
||
"\n",
|
||
" def get_feature_names_out(self, features_in):\n",
|
||
" return np.append(features_in, [\"price_type\"], axis=0)\n",
|
||
"\n",
|
||
"# Указываем столбцы, которые нужно удалить и обрабатывать\n",
|
||
"columns_to_drop = [\"date\", \"view\", \"waterfront\"]\n",
|
||
"num_columns = [\n",
|
||
" column\n",
|
||
" for column in df.columns\n",
|
||
" if column not in columns_to_drop and df[column].dtype != \"object\" and df[column].dtype != \"category\"\n",
|
||
"]\n",
|
||
"cat_columns = [\n",
|
||
" column\n",
|
||
" for column in df.columns\n",
|
||
" if column not in columns_to_drop and df[column].dtype == \"object\" or df[column].dtype == \"category\"\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Определяем предобработку для численных данных\n",
|
||
"num_imputer = SimpleImputer(strategy=\"median\")\n",
|
||
"num_scaler = StandardScaler()\n",
|
||
"preprocessing_num = Pipeline(\n",
|
||
" [\n",
|
||
" (\"imputer\", num_imputer),\n",
|
||
" (\"scaler\", num_scaler),\n",
|
||
" ]\n",
|
||
")\n",
|
||
"\n",
|
||
"# Определяем предобработку для категориальных данных\n",
|
||
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
|
||
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
|
||
"preprocessing_cat = Pipeline(\n",
|
||
" [\n",
|
||
" (\"imputer\", cat_imputer),\n",
|
||
" (\"encoder\", cat_encoder),\n",
|
||
" ]\n",
|
||
")\n",
|
||
"\n",
|
||
"features_preprocessing = ColumnTransformer(\n",
|
||
" verbose_feature_names_out=False,\n",
|
||
" transformers=[\n",
|
||
" (\"prepocessing_num\", preprocessing_num, num_columns),\n",
|
||
" (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
|
||
" # (\"prepocessing_features\", cat_imputer, [\"price_category\"]),\n",
|
||
" ],\n",
|
||
" remainder=\"passthrough\"\n",
|
||
")\n",
|
||
"\n",
|
||
"features_engineering = ColumnTransformer(\n",
|
||
" verbose_feature_names_out=False,\n",
|
||
" transformers=[\n",
|
||
" (\"add_features\", HousesFeatures(), [\"price_category\"]),\n",
|
||
" ],\n",
|
||
" remainder=\"passthrough\",\n",
|
||
")\n",
|
||
"\n",
|
||
"drop_columns = ColumnTransformer(\n",
|
||
" verbose_feature_names_out=False,\n",
|
||
" transformers=[\n",
|
||
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
|
||
" ],\n",
|
||
" remainder=\"passthrough\",\n",
|
||
")\n",
|
||
"\n",
|
||
"features_postprocessing = ColumnTransformer(\n",
|
||
" verbose_feature_names_out=False,\n",
|
||
" transformers=[\n",
|
||
" (\"prepocessing_cat\", preprocessing_cat, [\"price_category\"]),\n",
|
||
" ],\n",
|
||
" remainder=\"passthrough\",\n",
|
||
")\n",
|
||
"\n",
|
||
"pipeline_end = Pipeline(\n",
|
||
" [\n",
|
||
" (\"features_preprocessing\", features_preprocessing),\n",
|
||
" (\"features_engineering\", features_engineering),\n",
|
||
" (\"drop_columns\", drop_columns),\n",
|
||
" (\"features_postprocessing\", features_postprocessing),\n",
|
||
" ]\n",
|
||
"\n",
|
||
")\n",
|
||
"cols = ['a', 'b']\n",
|
||
"preprocessing_result = drop_columns.fit_transform(X_train)\n",
|
||
"preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n",
|
||
"preprocessing_result = features_engineering.fit_transform(preprocessing_result)\n",
|
||
"preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n",
|
||
"preprocessing_result\n",
|
||
"# # preprocessing_result = features_preprocessing.fit_transform(preprocessing_result)\n",
|
||
"# # preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n",
|
||
"\n",
|
||
"# preprocessing_result = features_postprocessing.fit_transform(preprocessing_result)\n",
|
||
"\n",
|
||
"# preprocessing_result = pipeline_end.fit_transform(X_train)\n",
|
||
"# preprocessed_df = pd.DataFrame(\n",
|
||
"# preprocessing_result,\n",
|
||
"# columns=pipeline_end.get_feature_names_out(),\n",
|
||
"# )\n",
|
||
"# preprocessed_df"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "kernel",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|