4130 lines
148 KiB
Plaintext
4130 lines
148 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>sqft_above</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>20141013T000000</td>\n",
|
|||
|
" <td>221900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98178</td>\n",
|
|||
|
" <td>47.5112</td>\n",
|
|||
|
" <td>-122.257</td>\n",
|
|||
|
" <td>1340</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>538000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2570</td>\n",
|
|||
|
" <td>7242</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>2170</td>\n",
|
|||
|
" <td>400</td>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>98125</td>\n",
|
|||
|
" <td>47.7210</td>\n",
|
|||
|
" <td>-122.319</td>\n",
|
|||
|
" <td>1690</td>\n",
|
|||
|
" <td>7639</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>20150225T000000</td>\n",
|
|||
|
" <td>180000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>10000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98028</td>\n",
|
|||
|
" <td>47.7379</td>\n",
|
|||
|
" <td>-122.233</td>\n",
|
|||
|
" <td>2720</td>\n",
|
|||
|
" <td>8062</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>604000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1050</td>\n",
|
|||
|
" <td>910</td>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98136</td>\n",
|
|||
|
" <td>47.5208</td>\n",
|
|||
|
" <td>-122.393</td>\n",
|
|||
|
" <td>1360</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>20150218T000000</td>\n",
|
|||
|
" <td>510000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>8080</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98074</td>\n",
|
|||
|
" <td>47.6168</td>\n",
|
|||
|
" <td>-122.045</td>\n",
|
|||
|
" <td>1800</td>\n",
|
|||
|
" <td>7503</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21608</th>\n",
|
|||
|
" <td>20140521T000000</td>\n",
|
|||
|
" <td>360000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1131</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98103</td>\n",
|
|||
|
" <td>47.6993</td>\n",
|
|||
|
" <td>-122.346</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1509</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21609</th>\n",
|
|||
|
" <td>20150223T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>5813</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98146</td>\n",
|
|||
|
" <td>47.5107</td>\n",
|
|||
|
" <td>-122.362</td>\n",
|
|||
|
" <td>1830</td>\n",
|
|||
|
" <td>7200</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21610</th>\n",
|
|||
|
" <td>20140623T000000</td>\n",
|
|||
|
" <td>402101.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5944</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>2007</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21611</th>\n",
|
|||
|
" <td>20150116T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>2388</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2004</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98027</td>\n",
|
|||
|
" <td>47.5345</td>\n",
|
|||
|
" <td>-122.069</td>\n",
|
|||
|
" <td>1410</td>\n",
|
|||
|
" <td>1287</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21612</th>\n",
|
|||
|
" <td>20141015T000000</td>\n",
|
|||
|
" <td>325000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1076</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2008</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5941</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1357</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21613 rows × 20 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
|
|||
|
"0 20141013T000000 221900.0 3 1.00 1180 5650 \n",
|
|||
|
"1 20141209T000000 538000.0 3 2.25 2570 7242 \n",
|
|||
|
"2 20150225T000000 180000.0 2 1.00 770 10000 \n",
|
|||
|
"3 20141209T000000 604000.0 4 3.00 1960 5000 \n",
|
|||
|
"4 20150218T000000 510000.0 3 2.00 1680 8080 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 20140521T000000 360000.0 3 2.50 1530 1131 \n",
|
|||
|
"21609 20150223T000000 400000.0 4 2.50 2310 5813 \n",
|
|||
|
"21610 20140623T000000 402101.0 2 0.75 1020 1350 \n",
|
|||
|
"21611 20150116T000000 400000.0 3 2.50 1600 2388 \n",
|
|||
|
"21612 20141015T000000 325000.0 2 0.75 1020 1076 \n",
|
|||
|
"\n",
|
|||
|
" floors waterfront view condition grade sqft_above sqft_basement \\\n",
|
|||
|
"0 1.0 0 0 3 7 1180 0 \n",
|
|||
|
"1 2.0 0 0 3 7 2170 400 \n",
|
|||
|
"2 1.0 0 0 3 6 770 0 \n",
|
|||
|
"3 1.0 0 0 5 7 1050 910 \n",
|
|||
|
"4 1.0 0 0 3 8 1680 0 \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"21608 3.0 0 0 3 8 1530 0 \n",
|
|||
|
"21609 2.0 0 0 3 8 2310 0 \n",
|
|||
|
"21610 2.0 0 0 3 7 1020 0 \n",
|
|||
|
"21611 2.0 0 0 3 8 1600 0 \n",
|
|||
|
"21612 2.0 0 0 3 7 1020 0 \n",
|
|||
|
"\n",
|
|||
|
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
|||
|
"0 1955 0 98178 47.5112 -122.257 1340 \n",
|
|||
|
"1 1951 1991 98125 47.7210 -122.319 1690 \n",
|
|||
|
"2 1933 0 98028 47.7379 -122.233 2720 \n",
|
|||
|
"3 1965 0 98136 47.5208 -122.393 1360 \n",
|
|||
|
"4 1987 0 98074 47.6168 -122.045 1800 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 2009 0 98103 47.6993 -122.346 1530 \n",
|
|||
|
"21609 2014 0 98146 47.5107 -122.362 1830 \n",
|
|||
|
"21610 2009 0 98144 47.5944 -122.299 1020 \n",
|
|||
|
"21611 2004 0 98027 47.5345 -122.069 1410 \n",
|
|||
|
"21612 2008 0 98144 47.5941 -122.299 1020 \n",
|
|||
|
"\n",
|
|||
|
" sqft_lot15 \n",
|
|||
|
"0 5650 \n",
|
|||
|
"1 7639 \n",
|
|||
|
"2 8062 \n",
|
|||
|
"3 5000 \n",
|
|||
|
"4 7503 \n",
|
|||
|
"... ... \n",
|
|||
|
"21608 1509 \n",
|
|||
|
"21609 7200 \n",
|
|||
|
"21610 2007 \n",
|
|||
|
"21611 1287 \n",
|
|||
|
"21612 1357 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 20 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Read the CSV with \"id\" parsed as the index, then discard that index via\n",
|
|||
|
"# reset_index(drop=True): rows end up labelled 0..n-1 and \"id\" is not kept\n",
|
|||
|
"# as a column.\n",
|
|||
|
"house = (\n",
|
|||
|
"    pd.read_csv(\"data/kc_house_data.csv\", index_col=\"id\")\n",
|
|||
|
"    .reset_index(drop=True)\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Bare last expression so the frame is rendered as the cell's output.\n",
|
|||
|
"house"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built_1901</th>\n",
|
|||
|
" <th>yr_built_1902</th>\n",
|
|||
|
" <th>yr_built_1903</th>\n",
|
|||
|
" <th>yr_built_1904</th>\n",
|
|||
|
" <th>yr_built_1905</th>\n",
|
|||
|
" <th>yr_built_1906</th>\n",
|
|||
|
" <th>yr_built_1907</th>\n",
|
|||
|
" <th>yr_built_1908</th>\n",
|
|||
|
" <th>yr_built_1909</th>\n",
|
|||
|
" <th>yr_built_1910</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>price_4489000.0</th>\n",
|
|||
|
" <th>price_4500000.0</th>\n",
|
|||
|
" <th>price_4668000.0</th>\n",
|
|||
|
" <th>price_5110800.0</th>\n",
|
|||
|
" <th>price_5300000.0</th>\n",
|
|||
|
" <th>price_5350000.0</th>\n",
|
|||
|
" <th>price_5570000.0</th>\n",
|
|||
|
" <th>price_6885000.0</th>\n",
|
|||
|
" <th>price_7062500.0</th>\n",
|
|||
|
" <th>price_7700000.0</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21608</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21609</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21610</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21611</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21612</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21613 rows × 4142 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built_1901 yr_built_1902 yr_built_1903 yr_built_1904 \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" yr_built_1905 yr_built_1906 yr_built_1907 yr_built_1908 \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" yr_built_1909 yr_built_1910 ... price_4489000.0 price_4500000.0 \\\n",
|
|||
|
"0 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" price_4668000.0 price_5110800.0 price_5300000.0 price_5350000.0 \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" price_5570000.0 price_6885000.0 price_7062500.0 price_7700000.0 \n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 4142 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Columns expanded into indicator columns. Kept in one list so the fit call\n",
|
|||
|
"# and the feature-name lookup below cannot drift apart.\n",
|
|||
|
"# NOTE(review): `price` is a continuous amount, so it gets one indicator\n",
|
|||
|
"# column per distinct value (this cell's stored output reports 4142 columns\n",
|
|||
|
"# in total). Confirm that encoding it is intended.\n",
|
|||
|
"columns_to_encode = [\"yr_built\", \"price\"]\n",
|
|||
|
"\n",
|
|||
|
"# sparse_output=False returns a dense array; drop=\"first\" leaves out the\n",
|
|||
|
"# first category of every encoded column.\n",
|
|||
|
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"encoded_values = encoder.fit_transform(house[columns_to_encode])\n",
|
|||
|
"\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out(columns_to_encode)\n",
|
|||
|
"\n",
|
|||
|
"# Reuse house's index so every encoded row stays aligned with its source row\n",
|
|||
|
"# when the two frames are combined, even if house's index is not 0..n-1.\n",
|
|||
|
"encoded_values_df = pd.DataFrame(\n",
|
|||
|
"    encoded_values, columns=encoded_columns, index=house.index\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"encoded_values_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>waterfront</th>\n",
|
|||
|
" <th>view</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>price_4489000.0</th>\n",
|
|||
|
" <th>price_4500000.0</th>\n",
|
|||
|
" <th>price_4668000.0</th>\n",
|
|||
|
" <th>price_5110800.0</th>\n",
|
|||
|
" <th>price_5300000.0</th>\n",
|
|||
|
" <th>price_5350000.0</th>\n",
|
|||
|
" <th>price_5570000.0</th>\n",
|
|||
|
" <th>price_6885000.0</th>\n",
|
|||
|
" <th>price_7062500.0</th>\n",
|
|||
|
" <th>price_7700000.0</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>20141013T000000</td>\n",
|
|||
|
" <td>221900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>538000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2570</td>\n",
|
|||
|
" <td>7242</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>20150225T000000</td>\n",
|
|||
|
" <td>180000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>10000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>604000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>20150218T000000</td>\n",
|
|||
|
" <td>510000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>8080</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21608</th>\n",
|
|||
|
" <td>20140521T000000</td>\n",
|
|||
|
" <td>360000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1131</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21609</th>\n",
|
|||
|
" <td>20150223T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>5813</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21610</th>\n",
|
|||
|
" <td>20140623T000000</td>\n",
|
|||
|
" <td>402101.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21611</th>\n",
|
|||
|
" <td>20150116T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>2388</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21612</th>\n",
|
|||
|
" <td>20141015T000000</td>\n",
|
|||
|
" <td>325000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1076</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21613 rows × 4162 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
|
|||
|
"0 20141013T000000 221900.0 3 1.00 1180 5650 \n",
|
|||
|
"1 20141209T000000 538000.0 3 2.25 2570 7242 \n",
|
|||
|
"2 20150225T000000 180000.0 2 1.00 770 10000 \n",
|
|||
|
"3 20141209T000000 604000.0 4 3.00 1960 5000 \n",
|
|||
|
"4 20150218T000000 510000.0 3 2.00 1680 8080 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 20140521T000000 360000.0 3 2.50 1530 1131 \n",
|
|||
|
"21609 20150223T000000 400000.0 4 2.50 2310 5813 \n",
|
|||
|
"21610 20140623T000000 402101.0 2 0.75 1020 1350 \n",
|
|||
|
"21611 20150116T000000 400000.0 3 2.50 1600 2388 \n",
|
|||
|
"21612 20141015T000000 325000.0 2 0.75 1020 1076 \n",
|
|||
|
"\n",
|
|||
|
" floors waterfront view condition ... price_4489000.0 \\\n",
|
|||
|
"0 1.0 0 0 3 ... 0.0 \n",
|
|||
|
"1 2.0 0 0 3 ... 0.0 \n",
|
|||
|
"2 1.0 0 0 3 ... 0.0 \n",
|
|||
|
"3 1.0 0 0 5 ... 0.0 \n",
|
|||
|
"4 1.0 0 0 3 ... 0.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 3.0 0 0 3 ... 0.0 \n",
|
|||
|
"21609 2.0 0 0 3 ... 0.0 \n",
|
|||
|
"21610 2.0 0 0 3 ... 0.0 \n",
|
|||
|
"21611 2.0 0 0 3 ... 0.0 \n",
|
|||
|
"21612 2.0 0 0 3 ... 0.0 \n",
|
|||
|
"\n",
|
|||
|
" price_4500000.0 price_4668000.0 price_5110800.0 price_5300000.0 \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" price_5350000.0 price_5570000.0 price_6885000.0 price_7062500.0 \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"21608 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21609 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21610 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21611 0.0 0.0 0.0 0.0 \n",
|
|||
|
"21612 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" price_7700000.0 \n",
|
|||
|
"0 0.0 \n",
|
|||
|
"1 0.0 \n",
|
|||
|
"2 0.0 \n",
|
|||
|
"3 0.0 \n",
|
|||
|
"4 0.0 \n",
|
|||
|
"... ... \n",
|
|||
|
"21608 0.0 \n",
|
|||
|
"21609 0.0 \n",
|
|||
|
"21610 0.0 \n",
|
|||
|
"21611 0.0 \n",
|
|||
|
"21612 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 4162 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"house = pd.concat([house, encoded_values_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"house"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"labels = [\"old\", \"middle\", \"new\"]\n",
|
|||
|
"num_bins = 3"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"(array([1900. , 1938.33333333, 1976.66666667, 2015. ]),\n",
|
|||
|
" array([ 3067, 8120, 10426]))"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"hist1, bins1 = np.histogram(\n",
|
|||
|
" house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins=num_bins\n",
|
|||
|
")\n",
|
|||
|
"bins1, hist1"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>(1900.0, 1938.333]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>(1976.667, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>(1976.667, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>(1976.667, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>(1976.667, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1927</td>\n",
|
|||
|
" <td>(1900.0, 1938.333]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>1977</td>\n",
|
|||
|
" <td>(1976.667, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>1979</td>\n",
|
|||
|
" <td>(1976.667, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>1994</td>\n",
|
|||
|
" <td>(1976.667, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>(1900.0, 1938.333]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>(1900.0, 1938.333]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>1969</td>\n",
|
|||
|
" <td>(1938.333, 1976.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built yr_built\n",
|
|||
|
"0 1955 (1938.333, 1976.667]\n",
|
|||
|
"1 1951 (1938.333, 1976.667]\n",
|
|||
|
"2 1933 (1900.0, 1938.333]\n",
|
|||
|
"3 1965 (1938.333, 1976.667]\n",
|
|||
|
"4 1987 (1976.667, 2015.0]\n",
|
|||
|
"5 2001 (1976.667, 2015.0]\n",
|
|||
|
"6 1995 (1976.667, 2015.0]\n",
|
|||
|
"7 1963 (1938.333, 1976.667]\n",
|
|||
|
"8 1960 (1938.333, 1976.667]\n",
|
|||
|
"9 2003 (1976.667, 2015.0]\n",
|
|||
|
"10 1965 (1938.333, 1976.667]\n",
|
|||
|
"11 1942 (1938.333, 1976.667]\n",
|
|||
|
"12 1927 (1900.0, 1938.333]\n",
|
|||
|
"13 1977 (1976.667, 2015.0]\n",
|
|||
|
"14 1900 NaN\n",
|
|||
|
"15 1979 (1976.667, 2015.0]\n",
|
|||
|
"16 1994 (1976.667, 2015.0]\n",
|
|||
|
"17 1916 (1900.0, 1938.333]\n",
|
|||
|
"18 1921 (1900.0, 1938.333]\n",
|
|||
|
"19 1969 (1938.333, 1976.667]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat([house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins1))], axis=1).head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1927</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>1977</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>1979</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>1994</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>1969</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built yr_built\n",
|
|||
|
"0 1955 middle\n",
|
|||
|
"1 1951 middle\n",
|
|||
|
"2 1933 old\n",
|
|||
|
"3 1965 middle\n",
|
|||
|
"4 1987 new\n",
|
|||
|
"5 2001 new\n",
|
|||
|
"6 1995 new\n",
|
|||
|
"7 1963 middle\n",
|
|||
|
"8 1960 middle\n",
|
|||
|
"9 2003 new\n",
|
|||
|
"10 1965 middle\n",
|
|||
|
"11 1942 middle\n",
|
|||
|
"12 1927 old\n",
|
|||
|
"13 1977 new\n",
|
|||
|
"14 1900 NaN\n",
|
|||
|
"15 1979 new\n",
|
|||
|
"16 1994 new\n",
|
|||
|
"17 1916 old\n",
|
|||
|
"18 1921 old\n",
|
|||
|
"19 1969 middle"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins1), labels=labels)], axis=1\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"(array([1899., 1928., 1957., 1986., 2015.]),\n",
|
|||
|
" array([2403, 4230, 6914, 8028, 38]))"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"bins2 = np.linspace(1899, 2015, 5)\n",
|
|||
|
"tmp_bins2 = np.digitize(house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins2)\n",
|
|||
|
"hist2 = np.bincount(tmp_bins2 - 1)\n",
|
|||
|
"bins2, hist2"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>(1928.0, 1957.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>(1928.0, 1957.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>(1928.0, 1957.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>(1957.0, 1986.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>(1986.0, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>(1986.0, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>(1986.0, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>(1957.0, 1986.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>(1957.0, 1986.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>(1986.0, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>(1957.0, 1986.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>(1928.0, 1957.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1927</td>\n",
|
|||
|
" <td>(1899.0, 1928.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>1977</td>\n",
|
|||
|
" <td>(1957.0, 1986.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>(1899.0, 1928.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>1979</td>\n",
|
|||
|
" <td>(1957.0, 1986.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>1994</td>\n",
|
|||
|
" <td>(1986.0, 2015.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>(1899.0, 1928.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>(1899.0, 1928.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>1969</td>\n",
|
|||
|
" <td>(1957.0, 1986.0]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built yr_built\n",
|
|||
|
"0 1955 (1928.0, 1957.0]\n",
|
|||
|
"1 1951 (1928.0, 1957.0]\n",
|
|||
|
"2 1933 (1928.0, 1957.0]\n",
|
|||
|
"3 1965 (1957.0, 1986.0]\n",
|
|||
|
"4 1987 (1986.0, 2015.0]\n",
|
|||
|
"5 2001 (1986.0, 2015.0]\n",
|
|||
|
"6 1995 (1986.0, 2015.0]\n",
|
|||
|
"7 1963 (1957.0, 1986.0]\n",
|
|||
|
"8 1960 (1957.0, 1986.0]\n",
|
|||
|
"9 2003 (1986.0, 2015.0]\n",
|
|||
|
"10 1965 (1957.0, 1986.0]\n",
|
|||
|
"11 1942 (1928.0, 1957.0]\n",
|
|||
|
"12 1927 (1899.0, 1928.0]\n",
|
|||
|
"13 1977 (1957.0, 1986.0]\n",
|
|||
|
"14 1900 (1899.0, 1928.0]\n",
|
|||
|
"15 1979 (1957.0, 1986.0]\n",
|
|||
|
"16 1994 (1986.0, 2015.0]\n",
|
|||
|
"17 1916 (1899.0, 1928.0]\n",
|
|||
|
"18 1921 (1899.0, 1928.0]\n",
|
|||
|
"19 1969 (1957.0, 1986.0]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 22,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat([house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins2))], axis=1).head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"(array([1899, 1957, 2001, 2015]), array([ 6633, 10439, 4541]))"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"hist3, bins3 = np.histogram(\n",
|
|||
|
" house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins=[1899, 1957, 2001, 2015]\n",
|
|||
|
")\n",
|
|||
|
"bins3, hist3"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>(2001, 2015]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1927</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>1977</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>1979</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>1994</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>(1899, 1957]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>1969</td>\n",
|
|||
|
" <td>(1957, 2001]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built yr_built\n",
|
|||
|
"0 1955 (1899, 1957]\n",
|
|||
|
"1 1951 (1899, 1957]\n",
|
|||
|
"2 1933 (1899, 1957]\n",
|
|||
|
"3 1965 (1957, 2001]\n",
|
|||
|
"4 1987 (1957, 2001]\n",
|
|||
|
"5 2001 (1957, 2001]\n",
|
|||
|
"6 1995 (1957, 2001]\n",
|
|||
|
"7 1963 (1957, 2001]\n",
|
|||
|
"8 1960 (1957, 2001]\n",
|
|||
|
"9 2003 (2001, 2015]\n",
|
|||
|
"10 1965 (1957, 2001]\n",
|
|||
|
"11 1942 (1899, 1957]\n",
|
|||
|
"12 1927 (1899, 1957]\n",
|
|||
|
"13 1977 (1957, 2001]\n",
|
|||
|
"14 1900 (1899, 1957]\n",
|
|||
|
"15 1979 (1957, 2001]\n",
|
|||
|
"16 1994 (1957, 2001]\n",
|
|||
|
"17 1916 (1899, 1957]\n",
|
|||
|
"18 1921 (1899, 1957]\n",
|
|||
|
"19 1969 (1957, 2001]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat([house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins3))], axis=1).head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1927</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>1977</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>1979</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>1994</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>1969</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built yr_built\n",
|
|||
|
"0 1955 old\n",
|
|||
|
"1 1951 old\n",
|
|||
|
"2 1933 old\n",
|
|||
|
"3 1965 middle\n",
|
|||
|
"4 1987 middle\n",
|
|||
|
"5 2001 middle\n",
|
|||
|
"6 1995 middle\n",
|
|||
|
"7 1963 middle\n",
|
|||
|
"8 1960 middle\n",
|
|||
|
"9 2003 new\n",
|
|||
|
"10 1965 middle\n",
|
|||
|
"11 1942 old\n",
|
|||
|
"12 1927 old\n",
|
|||
|
"13 1977 middle\n",
|
|||
|
"14 1900 old\n",
|
|||
|
"15 1979 middle\n",
|
|||
|
"16 1994 middle\n",
|
|||
|
"17 1916 old\n",
|
|||
|
"18 1921 old\n",
|
|||
|
"19 1969 middle"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins3), labels=labels)],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1927</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>1977</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>1979</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>1994</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>1969</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built yr_built\n",
|
|||
|
"0 1955 0\n",
|
|||
|
"1 1951 0\n",
|
|||
|
"2 1933 0\n",
|
|||
|
"3 1965 1\n",
|
|||
|
"4 1987 1\n",
|
|||
|
"5 2001 2\n",
|
|||
|
"6 1995 2\n",
|
|||
|
"7 1963 1\n",
|
|||
|
"8 1960 1\n",
|
|||
|
"9 2003 2\n",
|
|||
|
"10 1965 1\n",
|
|||
|
"11 1942 0\n",
|
|||
|
"12 1927 0\n",
|
|||
|
"13 1977 1\n",
|
|||
|
"14 1900 0\n",
|
|||
|
"15 1979 1\n",
|
|||
|
"16 1994 2\n",
|
|||
|
"17 1916 0\n",
|
|||
|
"18 1921 0\n",
|
|||
|
"19 1969 1"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [house[\"yr_built\"], pd.qcut(house[\"yr_built\"], q=3, labels=False)], axis=1\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>2001</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>1995</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>1963</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>1942</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1927</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>1977</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>1900</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>1979</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>1994</td>\n",
|
|||
|
" <td>new</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>1916</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1921</td>\n",
|
|||
|
" <td>old</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>1969</td>\n",
|
|||
|
" <td>middle</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" yr_built yr_built\n",
|
|||
|
"0 1955 old\n",
|
|||
|
"1 1951 old\n",
|
|||
|
"2 1933 old\n",
|
|||
|
"3 1965 middle\n",
|
|||
|
"4 1987 middle\n",
|
|||
|
"5 2001 new\n",
|
|||
|
"6 1995 new\n",
|
|||
|
"7 1963 middle\n",
|
|||
|
"8 1960 middle\n",
|
|||
|
"9 2003 new\n",
|
|||
|
"10 1965 middle\n",
|
|||
|
"11 1942 old\n",
|
|||
|
"12 1927 old\n",
|
|||
|
"13 1977 middle\n",
|
|||
|
"14 1900 old\n",
|
|||
|
"15 1979 middle\n",
|
|||
|
"16 1994 new\n",
|
|||
|
"17 1916 old\n",
|
|||
|
"18 1921 old\n",
|
|||
|
"19 1969 middle"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [house[\"yr_built\"], pd.qcut(house[\"yr_built\"], q=3, labels=labels)], axis=1\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Пример конструирования признаков на основе существующих"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>bedrooms</th>\n",
|
|||
|
" <th>bathrooms</th>\n",
|
|||
|
" <th>sqft_living</th>\n",
|
|||
|
" <th>sqft_lot</th>\n",
|
|||
|
" <th>floors</th>\n",
|
|||
|
" <th>grade</th>\n",
|
|||
|
" <th>sqft_above</th>\n",
|
|||
|
" <th>sqft_basement</th>\n",
|
|||
|
" <th>yr_built</th>\n",
|
|||
|
" <th>yr_renovated</th>\n",
|
|||
|
" <th>zipcode</th>\n",
|
|||
|
" <th>lat</th>\n",
|
|||
|
" <th>long</th>\n",
|
|||
|
" <th>sqft_living15</th>\n",
|
|||
|
" <th>sqft_lot15</th>\n",
|
|||
|
" <th>Price_category</th>\n",
|
|||
|
" <th>Renovated_flag</th>\n",
|
|||
|
" <th>Zipcode_area</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>20141013T000000</td>\n",
|
|||
|
" <td>221900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1955</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98178</td>\n",
|
|||
|
" <td>47.5112</td>\n",
|
|||
|
" <td>-122.257</td>\n",
|
|||
|
" <td>1340</td>\n",
|
|||
|
" <td>5650</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>981</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>538000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.25</td>\n",
|
|||
|
" <td>2570</td>\n",
|
|||
|
" <td>7242</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>2170</td>\n",
|
|||
|
" <td>400</td>\n",
|
|||
|
" <td>1951</td>\n",
|
|||
|
" <td>1991</td>\n",
|
|||
|
" <td>98125</td>\n",
|
|||
|
" <td>47.7210</td>\n",
|
|||
|
" <td>-122.319</td>\n",
|
|||
|
" <td>1690</td>\n",
|
|||
|
" <td>7639</td>\n",
|
|||
|
" <td>Medium</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>981</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>20150225T000000</td>\n",
|
|||
|
" <td>180000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>10000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>770</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1933</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98028</td>\n",
|
|||
|
" <td>47.7379</td>\n",
|
|||
|
" <td>-122.233</td>\n",
|
|||
|
" <td>2720</td>\n",
|
|||
|
" <td>8062</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>980</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>20141209T000000</td>\n",
|
|||
|
" <td>604000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>1960</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1050</td>\n",
|
|||
|
" <td>910</td>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98136</td>\n",
|
|||
|
" <td>47.5208</td>\n",
|
|||
|
" <td>-122.393</td>\n",
|
|||
|
" <td>1360</td>\n",
|
|||
|
" <td>5000</td>\n",
|
|||
|
" <td>High</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>981</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>20150218T000000</td>\n",
|
|||
|
" <td>510000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>8080</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1680</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1987</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98074</td>\n",
|
|||
|
" <td>47.6168</td>\n",
|
|||
|
" <td>-122.045</td>\n",
|
|||
|
" <td>1800</td>\n",
|
|||
|
" <td>7503</td>\n",
|
|||
|
" <td>Medium</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>980</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21608</th>\n",
|
|||
|
" <td>20140521T000000</td>\n",
|
|||
|
" <td>360000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1131</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98103</td>\n",
|
|||
|
" <td>47.6993</td>\n",
|
|||
|
" <td>-122.346</td>\n",
|
|||
|
" <td>1530</td>\n",
|
|||
|
" <td>1509</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>981</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21609</th>\n",
|
|||
|
" <td>20150223T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>5813</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>2310</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98146</td>\n",
|
|||
|
" <td>47.5107</td>\n",
|
|||
|
" <td>-122.362</td>\n",
|
|||
|
" <td>1830</td>\n",
|
|||
|
" <td>7200</td>\n",
|
|||
|
" <td>Medium</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>981</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21610</th>\n",
|
|||
|
" <td>20140623T000000</td>\n",
|
|||
|
" <td>402101.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1350</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2009</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5944</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>2007</td>\n",
|
|||
|
" <td>Medium</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>981</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21611</th>\n",
|
|||
|
" <td>20150116T000000</td>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>2.50</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>2388</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>1600</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2004</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98027</td>\n",
|
|||
|
" <td>47.5345</td>\n",
|
|||
|
" <td>-122.069</td>\n",
|
|||
|
" <td>1410</td>\n",
|
|||
|
" <td>1287</td>\n",
|
|||
|
" <td>Medium</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>980</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21612</th>\n",
|
|||
|
" <td>20141015T000000</td>\n",
|
|||
|
" <td>325000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1076</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>2008</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>98144</td>\n",
|
|||
|
" <td>47.5941</td>\n",
|
|||
|
" <td>-122.299</td>\n",
|
|||
|
" <td>1020</td>\n",
|
|||
|
" <td>1357</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>981</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>21613 rows × 20 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
|
|||
|
"0 20141013T000000 221900.0 3 1.00 1180 5650 \n",
|
|||
|
"1 20141209T000000 538000.0 3 2.25 2570 7242 \n",
|
|||
|
"2 20150225T000000 180000.0 2 1.00 770 10000 \n",
|
|||
|
"3 20141209T000000 604000.0 4 3.00 1960 5000 \n",
|
|||
|
"4 20150218T000000 510000.0 3 2.00 1680 8080 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 20140521T000000 360000.0 3 2.50 1530 1131 \n",
|
|||
|
"21609 20150223T000000 400000.0 4 2.50 2310 5813 \n",
|
|||
|
"21610 20140623T000000 402101.0 2 0.75 1020 1350 \n",
|
|||
|
"21611 20150116T000000 400000.0 3 2.50 1600 2388 \n",
|
|||
|
"21612 20141015T000000 325000.0 2 0.75 1020 1076 \n",
|
|||
|
"\n",
|
|||
|
" floors grade sqft_above sqft_basement yr_built yr_renovated \\\n",
|
|||
|
"0 1.0 7 1180 0 1955 0 \n",
|
|||
|
"1 2.0 7 2170 400 1951 1991 \n",
|
|||
|
"2 1.0 6 770 0 1933 0 \n",
|
|||
|
"3 1.0 7 1050 910 1965 0 \n",
|
|||
|
"4 1.0 8 1680 0 1987 0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 3.0 8 1530 0 2009 0 \n",
|
|||
|
"21609 2.0 8 2310 0 2014 0 \n",
|
|||
|
"21610 2.0 7 1020 0 2009 0 \n",
|
|||
|
"21611 2.0 8 1600 0 2004 0 \n",
|
|||
|
"21612 2.0 7 1020 0 2008 0 \n",
|
|||
|
"\n",
|
|||
|
" zipcode lat long sqft_living15 sqft_lot15 Price_category \\\n",
|
|||
|
"0 98178 47.5112 -122.257 1340 5650 Low \n",
|
|||
|
"1 98125 47.7210 -122.319 1690 7639 Medium \n",
|
|||
|
"2 98028 47.7379 -122.233 2720 8062 Low \n",
|
|||
|
"3 98136 47.5208 -122.393 1360 5000 High \n",
|
|||
|
"4 98074 47.6168 -122.045 1800 7503 Medium \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"21608 98103 47.6993 -122.346 1530 1509 Low \n",
|
|||
|
"21609 98146 47.5107 -122.362 1830 7200 Medium \n",
|
|||
|
"21610 98144 47.5944 -122.299 1020 2007 Medium \n",
|
|||
|
"21611 98027 47.5345 -122.069 1410 1287 Medium \n",
|
|||
|
"21612 98144 47.5941 -122.299 1020 1357 Low \n",
|
|||
|
"\n",
|
|||
|
" Renovated_flag Zipcode_area \n",
|
|||
|
"0 0 981 \n",
|
|||
|
"1 1 981 \n",
|
|||
|
"2 0 980 \n",
|
|||
|
"3 0 981 \n",
|
|||
|
"4 0 980 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"21608 0 981 \n",
|
|||
|
"21609 0 981 \n",
|
|||
|
"21610 0 981 \n",
|
|||
|
"21611 0 980 \n",
|
|||
|
"21612 0 981 \n",
|
|||
|
"\n",
|
|||
|
"[21613 rows x 20 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"house_cleaned = house.drop([\"waterfront\", \"view\", \"condition\"], axis=1, errors=\"ignore\")\n",
|
|||
|
"\n",
|
|||
|
"house_cleaned = house_cleaned.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Признак \"Price_category\": разделение домов на категории цен\n",
|
|||
|
"house_cleaned[\"Price_category\"] = pd.qcut(\n",
|
|||
|
" house_cleaned[\"price\"], q=3, labels=[\"Low\", \"Medium\", \"High\"]\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Признак \"Renovated_flag\": 1, если дом был отремонтирован, иначе 0\n",
|
|||
|
"house_cleaned[\"Renovated_flag\"] = house_cleaned[\"yr_renovated\"].apply(\n",
|
|||
|
" lambda x: 1 if x > 0 else 0\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Признак \"Zipcode_area\": используем первые три цифры из почтового индекса\n",
|
|||
|
"house_cleaned[\"Zipcode_area\"] = house_cleaned[\"zipcode\"].apply(lambda x: str(x)[:3])\n",
|
|||
|
"\n",
|
|||
|
"house_cleaned"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" price bedrooms bathrooms sqft_living sqft_lot floors waterfront \\\n",
|
|||
|
"id \n",
|
|||
|
"0 221900.0 3 1.00 1180 5650 1.0 0 \n",
|
|||
|
"1 538000.0 3 2.25 2570 7242 2.0 0 \n",
|
|||
|
"2 180000.0 2 1.00 770 10000 1.0 0 \n",
|
|||
|
"3 604000.0 4 3.00 1960 5000 1.0 0 \n",
|
|||
|
"4 510000.0 3 2.00 1680 8080 1.0 0 \n",
|
|||
|
"\n",
|
|||
|
" view condition grade ... yr_renovated zipcode lat long \\\n",
|
|||
|
"id ... \n",
|
|||
|
"0 0 3 7 ... 0 98178 47.5112 -122.257 \n",
|
|||
|
"1 0 3 7 ... 1991 98125 47.7210 -122.319 \n",
|
|||
|
"2 0 3 6 ... 0 98028 47.7379 -122.233 \n",
|
|||
|
"3 0 5 7 ... 0 98136 47.5208 -122.393 \n",
|
|||
|
"4 0 3 8 ... 0 98074 47.6168 -122.045 \n",
|
|||
|
"\n",
|
|||
|
" sqft_living15 sqft_lot15 HOUR(date) MONTH(date) WEEKDAY(date) \\\n",
|
|||
|
"id \n",
|
|||
|
"0 1340 5650 0 10 0 \n",
|
|||
|
"1 1690 7639 0 12 1 \n",
|
|||
|
"2 2720 8062 0 2 2 \n",
|
|||
|
"3 1360 5000 0 12 1 \n",
|
|||
|
"4 1800 7503 0 2 2 \n",
|
|||
|
"\n",
|
|||
|
" YEAR(date) \n",
|
|||
|
"id \n",
|
|||
|
"0 2014 \n",
|
|||
|
"1 2014 \n",
|
|||
|
"2 2015 \n",
|
|||
|
"3 2014 \n",
|
|||
|
"4 2015 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 23 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Mai\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Mai\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
|
|||
|
" agg_primitives: ['count', 'mean', 'mode', 'sum']\n",
|
|||
|
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
|
|||
|
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from woodwork.logical_types import Categorical, Datetime\n",
|
|||
|
"\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from woodwork.logical_types import Categorical, Datetime\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"data/kc_house_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Убедимся, что есть уникальный идентификатор для каждой строки (если нет, создаем)\n",
|
|||
|
"df[\"id\"] = range(len(df))\n",
|
|||
|
"\n",
|
|||
|
"# Создаем EntitySet для данных о домах\n",
|
|||
|
"es = ft.EntitySet(id=\"house_sales\")\n",
|
|||
|
"\n",
|
|||
|
"# Добавляем основной DataFrame в EntitySet с указанием типов данных\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name=\"houses\",\n",
|
|||
|
" dataframe=df,\n",
|
|||
|
" index=\"id\", # Уникальный идентификатор для домов\n",
|
|||
|
" logical_types={\n",
|
|||
|
" \"date\": Datetime,\n",
|
|||
|
" \"zipcode\": Categorical,\n",
|
|||
|
" \"condition\": Categorical,\n",
|
|||
|
" \"grade\": Categorical,\n",
|
|||
|
" \"view\": Categorical,\n",
|
|||
|
" \"waterfront\": Categorical,\n",
|
|||
|
" },\n",
|
|||
|
")\n",
|
|||
|
"ft.primitives.list_primitives()\n",
|
|||
|
"# Автоматическое конструирование признаков с применением корректных примитивов\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es,\n",
|
|||
|
" target_dataframe_name=\"houses\", # Название основной таблицы\n",
|
|||
|
" agg_primitives=[\"mean\", \"count\", \"mode\", \"sum\"], # Агрегирующие примитивы\n",
|
|||
|
" trans_primitives=[\n",
|
|||
|
" \"year\",\n",
|
|||
|
" \"month\",\n",
|
|||
|
" \"weekday\",\n",
|
|||
|
" \"hour\",\n",
|
|||
|
" ], # Корректные трансформационные примитивы\n",
|
|||
|
" max_depth=2, # Максимальная глубина для генерации признаков\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Просмотр полученной feature_matrix\n",
|
|||
|
"print(feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Axes: >"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGdCAYAAADAAnMpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlr0lEQVR4nO3de3CU5aHH8d9CNpsEsuFmbhBoFAzhEiAgEi4Fi9yHMbZknNQx2IO2x4EeOFjspFcj6jJKDnKGFkFr4znTlBZa0hmLkogEpCE2QdIJFNMDVcLRJIhKliS6vJA9f1i27smF3WTxScL3M5OJ++zzvu+TzLzZr+8uuzav1+sVAACAIf1MLwAAANzciBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYFWZ6AYFobW3VBx98oOjoaNlsNtPLAQAAAfB6vbp06ZISExPVr1/H1z96RYx88MEHSkpKMr0MAADQBefOndOIESM6vL9XxEh0dLSkz38Yp9NpeDUAQsmyLBUXF2vhwoWy2+2mlwMghNxut5KSknyP4x3pFTFy7akZp9NJjAB9jGVZioqKktPpJEaAPup6L7HgBawAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMCooGLk8ccfl81m8/saO3Zsp9vs3r1bY8eOVUREhCZOnKh9+/Z1a8EAAKBvCfrKyPjx41VXV+f7OnLkSIdzy8rKlJ2drVWrVun48ePKzMxUZmamTpw40a1FAwCAviPoGAkLC1N8fLzva9iwYR3O3bp1qxYvXqwNGzYoNTVVGzduVHp6urZt29atRQMAgL4j6A/K+5//+R8lJiYqIiJCGRkZcrlcGjlyZLtzjx49qvXr1/uNLVq0SEVFRZ0ew+PxyOPx+G673W5Jn3+glmVZwS4ZQIi1tLSopqYmJPtq+tSjsuozih5UroGRjm7vLyUlRVFRUSFYGYDuCvQxO6gYufPOO1VQUKCUlBTV1dUpLy9Pc+bM0YkTJ9r9eOD6+nrFxcX5jcXFxam+vr7T47hcLuXl5bUZLy4u5o8M0AOcOXNGjz76aEj3+UyI9pOfn6/bbrstRHsD0B0tLS0BzQsqRpYsWeL777S0NN15550aNWqUfvvb32rVqlXBrbATubm5fldU3G63kpKStHDhQjmdzpAdB0DXtLS0aPbs2SHZ19/qGrVh71/17L3jdHtCTLf3x5URoOe49szG9QT9NM0XDRo0SLfffrtOnz7d7v3x8fFqaGjwG2toaFB8fHyn+3U4HHI42l6utdvtstvtXV8wgJCIiYnR9OnTQ7Kv8LMfyXH0siZMTtfkUUNDsk8APUOgj9ndep+RpqYmnTlzRgkJCe3en5GRoQMHDviNlZSUKCMjozuHBQAAfUhQMfK9731Phw4d0nvvvaeysjLde++96t+/v7KzsyVJOTk5ys3N9c1fu3atXnvtNeXn5+udd97R448/rsrKSq1Zsya0PwUAAOi1gnqa5n//93+VnZ2tjz76SLfccotmz56t8vJy3XLLLZKk2tpa9ev3z76ZOXOmCgsL9aMf/Ug/+MEPNGbMGBUVFWnChAmh/SkAAECvFVSM7Nq1q9P7S0tL24xlZWUpKysrqEUBAICbB59NAwAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKO6FSObNm2SzWbTunXrOpxTUFAgm83m9xUREdGdwwIAgD4krKsbVlRUaMeOHUpLS7vuXKfTqZqaGt9tm83W1cMCAIA+pk
tXRpqamnT//ffrhRde0ODBg68732azKT4+3vcVFxfXlcMCAIA+qEtXRlavXq1ly5bp7rvv1pNPPnnd+U1NTRo1apRaW1uVnp6up59+WuPHj+9wvsfjkcfj8d12u92SJMuyZFlWV5YMoIe6cuWK7zvnN9C3BHpOBx0ju3bt0ttvv62KioqA5qekpOill15SWlqaGhsbtXnzZs2cOVMnT57UiBEj2t3G5XIpLy+vzXhxcbGioqKCXTKAHuxckySFqby8XO+fML0aAKHU0tIS0Dyb1+v1BrrTc+fOadq0aSopKfG9VmTevHmaPHmynnvuuYD2YVmWUlNTlZ2drY0bN7Y7p70rI0lJSbpw4YKcTmegywXQC/yl9mOteKFSex6epkkjh5heDoAQcrvdGjZsmBobGzt9/A7qysixY8d0/vx5paen+8auXr2qw4cPa9u2bfJ4POrfv3+n+7Db7ZoyZYpOnz7d4RyHwyGHw9Hutna7PZglA+jhwsLCfN85v4G+JdBzOqgYmT9/vqqrq/3GvvWtb2ns2LH6/ve/f90QkT6Pl+rqai1dujSYQwMAgD4qqBiJjo7WhAkT/MYGDBigoUOH+sZzcnI0fPhwuVwuSdITTzyhGTNmaPTo0bp48aKeffZZnT17Vg899FCIfgQAANCbdfl9RjpSW1urfv3++S+GP/nkEz388MOqr6/X4MGDNXXqVJWVlWncuHGhPjQAAOiFgnoBqylut1sxMTHXfQEMgN6n6uxHytxerqJHZmjyqKGmlwMghAJ9/OazaQAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGBUt2Jk06ZNstlsWrduXafzdu/erbFjxyoiIkITJ07Uvn37unNYAADQh3Q5RioqKrRjxw6lpaV1Oq+srEzZ2dlatWqVjh8/rszMTGVmZurEiRNdPTQAAOhDuhQjTU1Nuv/++/XCCy9o8ODBnc7dunWrFi9erA0bNig1NVUbN25Uenq6tm3b1qUFAwCAviWsKxutXr1ay5Yt0913360nn3yy07lHjx7V+vXr/cYWLVqkoqKiDrfxeDzyeDy+2263W5JkWZYsy+rKkgH8w3sfNavZc9X0Mnz+Vt/o972nGODor68MHWB6GUCvFuhjdtAxsmvXLr399tuqqKgIaH59fb3i4uL8xuLi4lRfX9/hNi6XS3l5eW3Gi4uLFRUVFdyCAfic/1R6qqpL/w9ywz2295TpJbTxw8lXFBtpehVA79XS0hLQvKD+Kp07d05r165VSUmJIiIiurSwQOTm5vpdTXG73UpKStLChQvldDpv2HGBvu7kB26pqlybV0zU6Ft6xv/1N3/m0WtvVmjxnDs0IMJhejmSpNMfNut7e6p1R8ZsjU/kbw7QVdee2bieoGLk2LFjOn/+vNLT031jV69e1eHDh7Vt2zZ5PB7179/fb5v4+Hg1NDT4jTU0NCg+Pr7D4zgcDjkcbf8o2e122e32YJYM4AvCwj4/5ccmxGjC8BjDq/mcZVm68I40/dZbesz5fe33FBYW1mPWBPRGgZ4/Qb2Adf78+aqurlZVVZXva9q0abr//vtVVVXVJkQkKSMjQwcOHPAbKykpUUZGRjCHBgAAfVRQV0aio6M1YcIEv7EBAwZo6NChvvGcnBwNHz5cLpdLkrR27VrNnTtX+fn5WrZsmXbt2qXKykrt3LkzRD8CAADozUL+Dqy1tbWqq6vz3Z45c6YKCwu1c+dOTZo0SXv27FFRUVGbqAEAADenbr+svrS0tNPbkpSVlaWsrKzuHgoAAPRBfDYNAAAwihgBAABGESMAAM
AoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgA
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"house.boxplot(column=\"condition\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>ConditionClip</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>36</th>\n",
|
|||
|
" <td>20140528T000000</td>\n",
|
|||
|
" <td>550000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>380</th>\n",
|
|||
|
" <td>20140916T000000</td>\n",
|
|||
|
" <td>270000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>397</th>\n",
|
|||
|
" <td>20140623T000000</td>\n",
|
|||
|
" <td>365000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1442</th>\n",
|
|||
|
" <td>20141107T000000</td>\n",
|
|||
|
" <td>352950.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1734</th>\n",
|
|||
|
" <td>20150102T000000</td>\n",
|
|||
|
" <td>252000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2223</th>\n",
|
|||
|
" <td>20150316T000000</td>\n",
|
|||
|
" <td>535000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3004</th>\n",
|
|||
|
" <td>20141231T000000</td>\n",
|
|||
|
" <td>441000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3202</th>\n",
|
|||
|
" <td>20140509T000000</td>\n",
|
|||
|
" <td>255000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3975</th>\n",
|
|||
|
" <td>20150511T000000</td>\n",
|
|||
|
" <td>210000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4651</th>\n",
|
|||
|
" <td>20141002T000000</td>\n",
|
|||
|
" <td>125000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7376</th>\n",
|
|||
|
" <td>20141107T000000</td>\n",
|
|||
|
" <td>295000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7636</th>\n",
|
|||
|
" <td>20150120T000000</td>\n",
|
|||
|
" <td>190000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12306</th>\n",
|
|||
|
" <td>20150128T000000</td>\n",
|
|||
|
" <td>196000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12453</th>\n",
|
|||
|
" <td>20150402T000000</td>\n",
|
|||
|
" <td>305000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12668</th>\n",
|
|||
|
" <td>20140729T000000</td>\n",
|
|||
|
" <td>227000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13628</th>\n",
|
|||
|
" <td>20140716T000000</td>\n",
|
|||
|
" <td>105500.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13629</th>\n",
|
|||
|
" <td>20150316T000000</td>\n",
|
|||
|
" <td>445000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14987</th>\n",
|
|||
|
" <td>20141202T000000</td>\n",
|
|||
|
" <td>432500.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15293</th>\n",
|
|||
|
" <td>20140506T000000</td>\n",
|
|||
|
" <td>78000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15337</th>\n",
|
|||
|
" <td>20140630T000000</td>\n",
|
|||
|
" <td>235000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15371</th>\n",
|
|||
|
" <td>20150114T000000</td>\n",
|
|||
|
" <td>658000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15712</th>\n",
|
|||
|
" <td>20140724T000000</td>\n",
|
|||
|
" <td>150000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16198</th>\n",
|
|||
|
" <td>20150324T000000</td>\n",
|
|||
|
" <td>81000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16893</th>\n",
|
|||
|
" <td>20141210T000000</td>\n",
|
|||
|
" <td>125000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16942</th>\n",
|
|||
|
" <td>20140611T000000</td>\n",
|
|||
|
" <td>427000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17805</th>\n",
|
|||
|
" <td>20150501T000000</td>\n",
|
|||
|
" <td>380000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18332</th>\n",
|
|||
|
" <td>20140924T000000</td>\n",
|
|||
|
" <td>130000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18645</th>\n",
|
|||
|
" <td>20141216T000000</td>\n",
|
|||
|
" <td>575000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18876</th>\n",
|
|||
|
" <td>20150211T000000</td>\n",
|
|||
|
" <td>1500000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19452</th>\n",
|
|||
|
" <td>20140926T000000</td>\n",
|
|||
|
" <td>142000.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date price ConditionClip\n",
|
|||
|
"36 20140528T000000 550000.0 2\n",
|
|||
|
"380 20140916T000000 270000.0 2\n",
|
|||
|
"397 20140623T000000 365000.0 2\n",
|
|||
|
"1442 20141107T000000 352950.0 2\n",
|
|||
|
"1734 20150102T000000 252000.0 2\n",
|
|||
|
"2223 20150316T000000 535000.0 2\n",
|
|||
|
"3004 20141231T000000 441000.0 2\n",
|
|||
|
"3202 20140509T000000 255000.0 2\n",
|
|||
|
"3975 20150511T000000 210000.0 2\n",
|
|||
|
"4651 20141002T000000 125000.0 2\n",
|
|||
|
"7376 20141107T000000 295000.0 2\n",
|
|||
|
"7636 20150120T000000 190000.0 2\n",
|
|||
|
"12306 20150128T000000 196000.0 2\n",
|
|||
|
"12453 20150402T000000 305000.0 2\n",
|
|||
|
"12668 20140729T000000 227000.0 2\n",
|
|||
|
"13628 20140716T000000 105500.0 2\n",
|
|||
|
"13629 20150316T000000 445000.0 2\n",
|
|||
|
"14987 20141202T000000 432500.0 2\n",
|
|||
|
"15293 20140506T000000 78000.0 2\n",
|
|||
|
"15337 20140630T000000 235000.0 2\n",
|
|||
|
"15371 20150114T000000 658000.0 2\n",
|
|||
|
"15712 20140724T000000 150000.0 2\n",
|
|||
|
"16198 20150324T000000 81000.0 2\n",
|
|||
|
"16893 20141210T000000 125000.0 2\n",
|
|||
|
"16942 20140611T000000 427000.0 2\n",
|
|||
|
"17805 20150501T000000 380000.0 2\n",
|
|||
|
"18332 20140924T000000 130000.0 2\n",
|
|||
|
"18645 20141216T000000 575000.0 2\n",
|
|||
|
"18876 20150211T000000 1500000.0 2\n",
|
|||
|
"19452 20140926T000000 142000.0 2"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"house_norm = house.copy()\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionClip\"] = house[\"condition\"].clip(2, 5)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[house_norm[\"condition\"] < 2][[\"date\", \"price\", \"ConditionClip\"]]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"5.0\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>date</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>ConditionWinsorize</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>36</th>\n",
|
|||
|
" <td>20140528T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>380</th>\n",
|
|||
|
" <td>20140916T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>397</th>\n",
|
|||
|
" <td>20140623T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1442</th>\n",
|
|||
|
" <td>20141107T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1734</th>\n",
|
|||
|
" <td>20150102T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2223</th>\n",
|
|||
|
" <td>20150316T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3004</th>\n",
|
|||
|
" <td>20141231T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3202</th>\n",
|
|||
|
" <td>20140509T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3975</th>\n",
|
|||
|
" <td>20150511T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4651</th>\n",
|
|||
|
" <td>20141002T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7376</th>\n",
|
|||
|
" <td>20141107T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7636</th>\n",
|
|||
|
" <td>20150120T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12306</th>\n",
|
|||
|
" <td>20150128T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12453</th>\n",
|
|||
|
" <td>20150402T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12668</th>\n",
|
|||
|
" <td>20140729T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13628</th>\n",
|
|||
|
" <td>20140716T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13629</th>\n",
|
|||
|
" <td>20150316T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14987</th>\n",
|
|||
|
" <td>20141202T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15293</th>\n",
|
|||
|
" <td>20140506T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15337</th>\n",
|
|||
|
" <td>20140630T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15371</th>\n",
|
|||
|
" <td>20150114T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15712</th>\n",
|
|||
|
" <td>20140724T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16198</th>\n",
|
|||
|
" <td>20150324T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16893</th>\n",
|
|||
|
" <td>20141210T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16942</th>\n",
|
|||
|
" <td>20140611T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17805</th>\n",
|
|||
|
" <td>20150501T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18332</th>\n",
|
|||
|
" <td>20140924T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18645</th>\n",
|
|||
|
" <td>20141216T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18876</th>\n",
|
|||
|
" <td>20150211T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19452</th>\n",
|
|||
|
" <td>20140926T000000</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" date condition ConditionWinsorize\n",
|
|||
|
"36 20140528T000000 1 3\n",
|
|||
|
"380 20140916T000000 1 3\n",
|
|||
|
"397 20140623T000000 1 3\n",
|
|||
|
"1442 20141107T000000 1 3\n",
|
|||
|
"1734 20150102T000000 1 3\n",
|
|||
|
"2223 20150316T000000 1 3\n",
|
|||
|
"3004 20141231T000000 1 3\n",
|
|||
|
"3202 20140509T000000 1 3\n",
|
|||
|
"3975 20150511T000000 1 3\n",
|
|||
|
"4651 20141002T000000 1 3\n",
|
|||
|
"7376 20141107T000000 1 3\n",
|
|||
|
"7636 20150120T000000 1 3\n",
|
|||
|
"12306 20150128T000000 1 3\n",
|
|||
|
"12453 20150402T000000 1 3\n",
|
|||
|
"12668 20140729T000000 1 3\n",
|
|||
|
"13628 20140716T000000 1 3\n",
|
|||
|
"13629 20150316T000000 1 3\n",
|
|||
|
"14987 20141202T000000 1 3\n",
|
|||
|
"15293 20140506T000000 1 3\n",
|
|||
|
"15337 20140630T000000 1 3\n",
|
|||
|
"15371 20150114T000000 1 3\n",
|
|||
|
"15712 20140724T000000 1 3\n",
|
|||
|
"16198 20150324T000000 1 3\n",
|
|||
|
"16893 20141210T000000 1 3\n",
|
|||
|
"16942 20140611T000000 1 3\n",
|
|||
|
"17805 20150501T000000 1 3\n",
|
|||
|
"18332 20140924T000000 1 3\n",
|
|||
|
"18645 20141216T000000 1 3\n",
|
|||
|
"18876 20150211T000000 1 3\n",
|
|||
|
"19452 20140926T000000 1 3"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from scipy.stats.mstats import winsorize\n",
|
|||
|
"\n",
|
|||
|
"print(house_norm[\"condition\"].quantile(q=0.95))\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionWinsorize\"] = winsorize(\n",
|
|||
|
" house_norm[\"condition\"].fillna(house_norm[\"condition\"].mean()), (0.01, 0.05), inplace=False\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"house_norm[house_norm[\"condition\"] < 2][[\"date\", \"condition\", \"ConditionWinsorize\"]]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>ConditionNorm</th>\n",
|
|||
|
" <th>ConditionClipNorm</th>\n",
|
|||
|
" <th>ConditionWinsorizeNorm</th>\n",
|
|||
|
" <th>ConditionWinsorizeNorm2</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>221900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>538000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>180000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>604000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>510000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>1225000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>257500.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>291850.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>229500.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>323000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>662500.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>468000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>0.666667</td>\n",
|
|||
|
" <td>0.5</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>310000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>0.666667</td>\n",
|
|||
|
" <td>0.5</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>0.666667</td>\n",
|
|||
|
" <td>0.5</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>530000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>650000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>395000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>0.50</td>\n",
|
|||
|
" <td>0.333333</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>485000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>0.666667</td>\n",
|
|||
|
" <td>0.5</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>189000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>0.666667</td>\n",
|
|||
|
" <td>0.5</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>230000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.75</td>\n",
|
|||
|
" <td>0.666667</td>\n",
|
|||
|
" <td>0.5</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price condition ConditionNorm ConditionClipNorm \\\n",
|
|||
|
"0 221900.0 3 0.50 0.333333 \n",
|
|||
|
"1 538000.0 3 0.50 0.333333 \n",
|
|||
|
"2 180000.0 3 0.50 0.333333 \n",
|
|||
|
"3 604000.0 5 1.00 1.000000 \n",
|
|||
|
"4 510000.0 3 0.50 0.333333 \n",
|
|||
|
"5 1225000.0 3 0.50 0.333333 \n",
|
|||
|
"6 257500.0 3 0.50 0.333333 \n",
|
|||
|
"7 291850.0 3 0.50 0.333333 \n",
|
|||
|
"8 229500.0 3 0.50 0.333333 \n",
|
|||
|
"9 323000.0 3 0.50 0.333333 \n",
|
|||
|
"10 662500.0 3 0.50 0.333333 \n",
|
|||
|
"11 468000.0 4 0.75 0.666667 \n",
|
|||
|
"12 310000.0 4 0.75 0.666667 \n",
|
|||
|
"13 400000.0 4 0.75 0.666667 \n",
|
|||
|
"14 530000.0 3 0.50 0.333333 \n",
|
|||
|
"15 650000.0 3 0.50 0.333333 \n",
|
|||
|
"16 395000.0 3 0.50 0.333333 \n",
|
|||
|
"17 485000.0 4 0.75 0.666667 \n",
|
|||
|
"18 189000.0 4 0.75 0.666667 \n",
|
|||
|
"19 230000.0 4 0.75 0.666667 \n",
|
|||
|
"\n",
|
|||
|
" ConditionWinsorizeNorm ConditionWinsorizeNorm2 \n",
|
|||
|
"0 0.0 -1.0 \n",
|
|||
|
"1 0.0 -1.0 \n",
|
|||
|
"2 0.0 -1.0 \n",
|
|||
|
"3 1.0 1.0 \n",
|
|||
|
"4 0.0 -1.0 \n",
|
|||
|
"5 0.0 -1.0 \n",
|
|||
|
"6 0.0 -1.0 \n",
|
|||
|
"7 0.0 -1.0 \n",
|
|||
|
"8 0.0 -1.0 \n",
|
|||
|
"9 0.0 -1.0 \n",
|
|||
|
"10 0.0 -1.0 \n",
|
|||
|
"11 0.5 0.0 \n",
|
|||
|
"12 0.5 0.0 \n",
|
|||
|
"13 0.5 0.0 \n",
|
|||
|
"14 0.0 -1.0 \n",
|
|||
|
"15 0.0 -1.0 \n",
|
|||
|
"16 0.0 -1.0 \n",
|
|||
|
"17 0.5 0.0 \n",
|
|||
|
"18 0.5 0.0 \n",
|
|||
|
"19 0.5 0.0 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn import preprocessing\n",
|
|||
|
"\n",
|
|||
|
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
|||
|
"\n",
|
|||
|
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionNorm\"] = min_max_scaler.fit_transform(\n",
|
|||
|
" house_norm[\"condition\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(house_norm[\"condition\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionClipNorm\"] = min_max_scaler.fit_transform(\n",
|
|||
|
" house_norm[\"ConditionClip\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(house_norm[\"condition\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionWinsorizeNorm\"] = min_max_scaler.fit_transform(\n",
|
|||
|
" house_norm[\"ConditionWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(house_norm[\"condition\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionWinsorizeNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
|||
|
" house_norm[\"ConditionWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(house_norm[\"condition\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\n",
|
|||
|
" [\n",
|
|||
|
" \"price\",\n",
|
|||
|
" \"condition\",\n",
|
|||
|
" \"ConditionNorm\",\n",
|
|||
|
" \"ConditionClipNorm\",\n",
|
|||
|
" \"ConditionWinsorizeNorm\",\n",
|
|||
|
" \"ConditionWinsorizeNorm2\",\n",
|
|||
|
" ]\n",
|
|||
|
"].head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>condition</th>\n",
|
|||
|
" <th>ConditionStand</th>\n",
|
|||
|
" <th>ConditionClipStand</th>\n",
|
|||
|
" <th>ConditionWinsorizeStand</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>221900.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>538000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>180000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>604000.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>2.444294</td>\n",
|
|||
|
" <td>2.457597</td>\n",
|
|||
|
" <td>2.494726</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>510000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>1225000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>257500.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>291850.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>229500.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>323000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>662500.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>468000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.907554</td>\n",
|
|||
|
" <td>0.911143</td>\n",
|
|||
|
" <td>0.915622</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>310000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.907554</td>\n",
|
|||
|
" <td>0.911143</td>\n",
|
|||
|
" <td>0.915622</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>400000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.907554</td>\n",
|
|||
|
" <td>0.911143</td>\n",
|
|||
|
" <td>0.915622</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>530000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>650000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>395000.0</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>-0.629187</td>\n",
|
|||
|
" <td>-0.635310</td>\n",
|
|||
|
" <td>-0.663482</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>485000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.907554</td>\n",
|
|||
|
" <td>0.911143</td>\n",
|
|||
|
" <td>0.915622</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>189000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.907554</td>\n",
|
|||
|
" <td>0.911143</td>\n",
|
|||
|
" <td>0.915622</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>230000.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>0.907554</td>\n",
|
|||
|
" <td>0.911143</td>\n",
|
|||
|
" <td>0.915622</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price condition ConditionStand ConditionClipStand \\\n",
|
|||
|
"0 221900.0 3 -0.629187 -0.635310 \n",
|
|||
|
"1 538000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"2 180000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"3 604000.0 5 2.444294 2.457597 \n",
|
|||
|
"4 510000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"5 1225000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"6 257500.0 3 -0.629187 -0.635310 \n",
|
|||
|
"7 291850.0 3 -0.629187 -0.635310 \n",
|
|||
|
"8 229500.0 3 -0.629187 -0.635310 \n",
|
|||
|
"9 323000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"10 662500.0 3 -0.629187 -0.635310 \n",
|
|||
|
"11 468000.0 4 0.907554 0.911143 \n",
|
|||
|
"12 310000.0 4 0.907554 0.911143 \n",
|
|||
|
"13 400000.0 4 0.907554 0.911143 \n",
|
|||
|
"14 530000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"15 650000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"16 395000.0 3 -0.629187 -0.635310 \n",
|
|||
|
"17 485000.0 4 0.907554 0.911143 \n",
|
|||
|
"18 189000.0 4 0.907554 0.911143 \n",
|
|||
|
"19 230000.0 4 0.907554 0.911143 \n",
|
|||
|
"\n",
|
|||
|
" ConditionWinsorizeStand \n",
|
|||
|
"0 -0.663482 \n",
|
|||
|
"1 -0.663482 \n",
|
|||
|
"2 -0.663482 \n",
|
|||
|
"3 2.494726 \n",
|
|||
|
"4 -0.663482 \n",
|
|||
|
"5 -0.663482 \n",
|
|||
|
"6 -0.663482 \n",
|
|||
|
"7 -0.663482 \n",
|
|||
|
"8 -0.663482 \n",
|
|||
|
"9 -0.663482 \n",
|
|||
|
"10 -0.663482 \n",
|
|||
|
"11 0.915622 \n",
|
|||
|
"12 0.915622 \n",
|
|||
|
"13 0.915622 \n",
|
|||
|
"14 -0.663482 \n",
|
|||
|
"15 -0.663482 \n",
|
|||
|
"16 -0.663482 \n",
|
|||
|
"17 0.915622 \n",
|
|||
|
"18 0.915622 \n",
|
|||
|
"19 0.915622 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn import preprocessing\n",
|
|||
|
"\n",
|
|||
|
"stndart_scaler = preprocessing.StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionStand\"] = stndart_scaler.fit_transform(\n",
|
|||
|
" house_norm[\"condition\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(house_norm[\"condition\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionClipStand\"] = stndart_scaler.fit_transform(\n",
|
|||
|
" house_norm[\"ConditionClip\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(house_norm[\"condition\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\"ConditionWinsorizeStand\"] = stndart_scaler.fit_transform(\n",
|
|||
|
" house_norm[\"ConditionWinsorize\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(house_norm[\"condition\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"house_norm[\n",
|
|||
|
" [\n",
|
|||
|
" \"price\",\n",
|
|||
|
" \"condition\",\n",
|
|||
|
" \"ConditionStand\",\n",
|
|||
|
" \"ConditionClipStand\",\n",
|
|||
|
" \"ConditionWinsorizeStand\",\n",
|
|||
|
" ]\n",
|
|||
|
"].head(20)"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.2"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|