{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" price | \n",
" bedrooms | \n",
" bathrooms | \n",
" sqft_living | \n",
" sqft_lot | \n",
" floors | \n",
" waterfront | \n",
" view | \n",
" condition | \n",
" grade | \n",
" sqft_above | \n",
" sqft_basement | \n",
" yr_built | \n",
" yr_renovated | \n",
" zipcode | \n",
" lat | \n",
" long | \n",
" sqft_living15 | \n",
" sqft_lot15 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 20141013T000000 | \n",
" 221900.0 | \n",
" 3 | \n",
" 1.00 | \n",
" 1180 | \n",
" 5650 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 7 | \n",
" 1180 | \n",
" 0 | \n",
" 1955 | \n",
" 0 | \n",
" 98178 | \n",
" 47.5112 | \n",
" -122.257 | \n",
" 1340 | \n",
" 5650 | \n",
"
\n",
" \n",
" 1 | \n",
" 20141209T000000 | \n",
" 538000.0 | \n",
" 3 | \n",
" 2.25 | \n",
" 2570 | \n",
" 7242 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 7 | \n",
" 2170 | \n",
" 400 | \n",
" 1951 | \n",
" 1991 | \n",
" 98125 | \n",
" 47.7210 | \n",
" -122.319 | \n",
" 1690 | \n",
" 7639 | \n",
"
\n",
" \n",
" 2 | \n",
" 20150225T000000 | \n",
" 180000.0 | \n",
" 2 | \n",
" 1.00 | \n",
" 770 | \n",
" 10000 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 6 | \n",
" 770 | \n",
" 0 | \n",
" 1933 | \n",
" 0 | \n",
" 98028 | \n",
" 47.7379 | \n",
" -122.233 | \n",
" 2720 | \n",
" 8062 | \n",
"
\n",
" \n",
" 3 | \n",
" 20141209T000000 | \n",
" 604000.0 | \n",
" 4 | \n",
" 3.00 | \n",
" 1960 | \n",
" 5000 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 5 | \n",
" 7 | \n",
" 1050 | \n",
" 910 | \n",
" 1965 | \n",
" 0 | \n",
" 98136 | \n",
" 47.5208 | \n",
" -122.393 | \n",
" 1360 | \n",
" 5000 | \n",
"
\n",
" \n",
" 4 | \n",
" 20150218T000000 | \n",
" 510000.0 | \n",
" 3 | \n",
" 2.00 | \n",
" 1680 | \n",
" 8080 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 8 | \n",
" 1680 | \n",
" 0 | \n",
" 1987 | \n",
" 0 | \n",
" 98074 | \n",
" 47.6168 | \n",
" -122.045 | \n",
" 1800 | \n",
" 7503 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 21608 | \n",
" 20140521T000000 | \n",
" 360000.0 | \n",
" 3 | \n",
" 2.50 | \n",
" 1530 | \n",
" 1131 | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 8 | \n",
" 1530 | \n",
" 0 | \n",
" 2009 | \n",
" 0 | \n",
" 98103 | \n",
" 47.6993 | \n",
" -122.346 | \n",
" 1530 | \n",
" 1509 | \n",
"
\n",
" \n",
" 21609 | \n",
" 20150223T000000 | \n",
" 400000.0 | \n",
" 4 | \n",
" 2.50 | \n",
" 2310 | \n",
" 5813 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 8 | \n",
" 2310 | \n",
" 0 | \n",
" 2014 | \n",
" 0 | \n",
" 98146 | \n",
" 47.5107 | \n",
" -122.362 | \n",
" 1830 | \n",
" 7200 | \n",
"
\n",
" \n",
" 21610 | \n",
" 20140623T000000 | \n",
" 402101.0 | \n",
" 2 | \n",
" 0.75 | \n",
" 1020 | \n",
" 1350 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 7 | \n",
" 1020 | \n",
" 0 | \n",
" 2009 | \n",
" 0 | \n",
" 98144 | \n",
" 47.5944 | \n",
" -122.299 | \n",
" 1020 | \n",
" 2007 | \n",
"
\n",
" \n",
" 21611 | \n",
" 20150116T000000 | \n",
" 400000.0 | \n",
" 3 | \n",
" 2.50 | \n",
" 1600 | \n",
" 2388 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 8 | \n",
" 1600 | \n",
" 0 | \n",
" 2004 | \n",
" 0 | \n",
" 98027 | \n",
" 47.5345 | \n",
" -122.069 | \n",
" 1410 | \n",
" 1287 | \n",
"
\n",
" \n",
" 21612 | \n",
" 20141015T000000 | \n",
" 325000.0 | \n",
" 2 | \n",
" 0.75 | \n",
" 1020 | \n",
" 1076 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 7 | \n",
" 1020 | \n",
" 0 | \n",
" 2008 | \n",
" 0 | \n",
" 98144 | \n",
" 47.5941 | \n",
" -122.299 | \n",
" 1020 | \n",
" 1357 | \n",
"
\n",
" \n",
"
\n",
"
21613 rows × 20 columns
\n",
"
"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
"0 20141013T000000 221900.0 3 1.00 1180 5650 \n",
"1 20141209T000000 538000.0 3 2.25 2570 7242 \n",
"2 20150225T000000 180000.0 2 1.00 770 10000 \n",
"3 20141209T000000 604000.0 4 3.00 1960 5000 \n",
"4 20150218T000000 510000.0 3 2.00 1680 8080 \n",
"... ... ... ... ... ... ... \n",
"21608 20140521T000000 360000.0 3 2.50 1530 1131 \n",
"21609 20150223T000000 400000.0 4 2.50 2310 5813 \n",
"21610 20140623T000000 402101.0 2 0.75 1020 1350 \n",
"21611 20150116T000000 400000.0 3 2.50 1600 2388 \n",
"21612 20141015T000000 325000.0 2 0.75 1020 1076 \n",
"\n",
" floors waterfront view condition grade sqft_above sqft_basement \\\n",
"0 1.0 0 0 3 7 1180 0 \n",
"1 2.0 0 0 3 7 2170 400 \n",
"2 1.0 0 0 3 6 770 0 \n",
"3 1.0 0 0 5 7 1050 910 \n",
"4 1.0 0 0 3 8 1680 0 \n",
"... ... ... ... ... ... ... ... \n",
"21608 3.0 0 0 3 8 1530 0 \n",
"21609 2.0 0 0 3 8 2310 0 \n",
"21610 2.0 0 0 3 7 1020 0 \n",
"21611 2.0 0 0 3 8 1600 0 \n",
"21612 2.0 0 0 3 7 1020 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"0 1955 0 98178 47.5112 -122.257 1340 \n",
"1 1951 1991 98125 47.7210 -122.319 1690 \n",
"2 1933 0 98028 47.7379 -122.233 2720 \n",
"3 1965 0 98136 47.5208 -122.393 1360 \n",
"4 1987 0 98074 47.6168 -122.045 1800 \n",
"... ... ... ... ... ... ... \n",
"21608 2009 0 98103 47.6993 -122.346 1530 \n",
"21609 2014 0 98146 47.5107 -122.362 1830 \n",
"21610 2009 0 98144 47.5944 -122.299 1020 \n",
"21611 2004 0 98027 47.5345 -122.069 1410 \n",
"21612 2008 0 98144 47.5941 -122.299 1020 \n",
"\n",
" sqft_lot15 \n",
"0 5650 \n",
"1 7639 \n",
"2 8062 \n",
"3 5000 \n",
"4 7503 \n",
"... ... \n",
"21608 1509 \n",
"21609 7200 \n",
"21610 2007 \n",
"21611 1287 \n",
"21612 1357 \n",
"\n",
"[21613 rows x 20 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"house = pd.read_csv(\"data/kc_house_data.csv\", index_col=\"id\")\n",
"\n",
"house = house.reset_index(drop=True)\n",
"\n",
"house"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built_1901 | \n",
" yr_built_1902 | \n",
" yr_built_1903 | \n",
" yr_built_1904 | \n",
" yr_built_1905 | \n",
" yr_built_1906 | \n",
" yr_built_1907 | \n",
" yr_built_1908 | \n",
" yr_built_1909 | \n",
" yr_built_1910 | \n",
" ... | \n",
" price_4489000.0 | \n",
" price_4500000.0 | \n",
" price_4668000.0 | \n",
" price_5110800.0 | \n",
" price_5300000.0 | \n",
" price_5350000.0 | \n",
" price_5570000.0 | \n",
" price_6885000.0 | \n",
" price_7062500.0 | \n",
" price_7700000.0 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 21608 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21609 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21610 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21611 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21612 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
21613 rows × 4142 columns
\n",
"
"
],
"text/plain": [
" yr_built_1901 yr_built_1902 yr_built_1903 yr_built_1904 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"21608 0.0 0.0 0.0 0.0 \n",
"21609 0.0 0.0 0.0 0.0 \n",
"21610 0.0 0.0 0.0 0.0 \n",
"21611 0.0 0.0 0.0 0.0 \n",
"21612 0.0 0.0 0.0 0.0 \n",
"\n",
" yr_built_1905 yr_built_1906 yr_built_1907 yr_built_1908 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"21608 0.0 0.0 0.0 0.0 \n",
"21609 0.0 0.0 0.0 0.0 \n",
"21610 0.0 0.0 0.0 0.0 \n",
"21611 0.0 0.0 0.0 0.0 \n",
"21612 0.0 0.0 0.0 0.0 \n",
"\n",
" yr_built_1909 yr_built_1910 ... price_4489000.0 price_4500000.0 \\\n",
"0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 ... 0.0 0.0 \n",
"3 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 ... 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"21608 0.0 0.0 ... 0.0 0.0 \n",
"21609 0.0 0.0 ... 0.0 0.0 \n",
"21610 0.0 0.0 ... 0.0 0.0 \n",
"21611 0.0 0.0 ... 0.0 0.0 \n",
"21612 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" price_4668000.0 price_5110800.0 price_5300000.0 price_5350000.0 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"21608 0.0 0.0 0.0 0.0 \n",
"21609 0.0 0.0 0.0 0.0 \n",
"21610 0.0 0.0 0.0 0.0 \n",
"21611 0.0 0.0 0.0 0.0 \n",
"21612 0.0 0.0 0.0 0.0 \n",
"\n",
" price_5570000.0 price_6885000.0 price_7062500.0 price_7700000.0 \n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"21608 0.0 0.0 0.0 0.0 \n",
"21609 0.0 0.0 0.0 0.0 \n",
"21610 0.0 0.0 0.0 0.0 \n",
"21611 0.0 0.0 0.0 0.0 \n",
"21612 0.0 0.0 0.0 0.0 \n",
"\n",
"[21613 rows x 4142 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"import numpy as np\n",
"\n",
"\n",
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
"\n",
"encoded_values = encoder.fit_transform(house[[\"yr_built\", \"price\"]])\n",
"\n",
"encoded_columns = encoder.get_feature_names_out([\"yr_built\", \"price\"])\n",
"\n",
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
"\n",
"encoded_values_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" price | \n",
" bedrooms | \n",
" bathrooms | \n",
" sqft_living | \n",
" sqft_lot | \n",
" floors | \n",
" waterfront | \n",
" view | \n",
" condition | \n",
" ... | \n",
" price_4489000.0 | \n",
" price_4500000.0 | \n",
" price_4668000.0 | \n",
" price_5110800.0 | \n",
" price_5300000.0 | \n",
" price_5350000.0 | \n",
" price_5570000.0 | \n",
" price_6885000.0 | \n",
" price_7062500.0 | \n",
" price_7700000.0 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 20141013T000000 | \n",
" 221900.0 | \n",
" 3 | \n",
" 1.00 | \n",
" 1180 | \n",
" 5650 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 20141209T000000 | \n",
" 538000.0 | \n",
" 3 | \n",
" 2.25 | \n",
" 2570 | \n",
" 7242 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 20150225T000000 | \n",
" 180000.0 | \n",
" 2 | \n",
" 1.00 | \n",
" 770 | \n",
" 10000 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 20141209T000000 | \n",
" 604000.0 | \n",
" 4 | \n",
" 3.00 | \n",
" 1960 | \n",
" 5000 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 5 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 20150218T000000 | \n",
" 510000.0 | \n",
" 3 | \n",
" 2.00 | \n",
" 1680 | \n",
" 8080 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 21608 | \n",
" 20140521T000000 | \n",
" 360000.0 | \n",
" 3 | \n",
" 2.50 | \n",
" 1530 | \n",
" 1131 | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21609 | \n",
" 20150223T000000 | \n",
" 400000.0 | \n",
" 4 | \n",
" 2.50 | \n",
" 2310 | \n",
" 5813 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21610 | \n",
" 20140623T000000 | \n",
" 402101.0 | \n",
" 2 | \n",
" 0.75 | \n",
" 1020 | \n",
" 1350 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21611 | \n",
" 20150116T000000 | \n",
" 400000.0 | \n",
" 3 | \n",
" 2.50 | \n",
" 1600 | \n",
" 2388 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 21612 | \n",
" 20141015T000000 | \n",
" 325000.0 | \n",
" 2 | \n",
" 0.75 | \n",
" 1020 | \n",
" 1076 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
21613 rows × 4162 columns
\n",
"
"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
"0 20141013T000000 221900.0 3 1.00 1180 5650 \n",
"1 20141209T000000 538000.0 3 2.25 2570 7242 \n",
"2 20150225T000000 180000.0 2 1.00 770 10000 \n",
"3 20141209T000000 604000.0 4 3.00 1960 5000 \n",
"4 20150218T000000 510000.0 3 2.00 1680 8080 \n",
"... ... ... ... ... ... ... \n",
"21608 20140521T000000 360000.0 3 2.50 1530 1131 \n",
"21609 20150223T000000 400000.0 4 2.50 2310 5813 \n",
"21610 20140623T000000 402101.0 2 0.75 1020 1350 \n",
"21611 20150116T000000 400000.0 3 2.50 1600 2388 \n",
"21612 20141015T000000 325000.0 2 0.75 1020 1076 \n",
"\n",
" floors waterfront view condition ... price_4489000.0 \\\n",
"0 1.0 0 0 3 ... 0.0 \n",
"1 2.0 0 0 3 ... 0.0 \n",
"2 1.0 0 0 3 ... 0.0 \n",
"3 1.0 0 0 5 ... 0.0 \n",
"4 1.0 0 0 3 ... 0.0 \n",
"... ... ... ... ... ... ... \n",
"21608 3.0 0 0 3 ... 0.0 \n",
"21609 2.0 0 0 3 ... 0.0 \n",
"21610 2.0 0 0 3 ... 0.0 \n",
"21611 2.0 0 0 3 ... 0.0 \n",
"21612 2.0 0 0 3 ... 0.0 \n",
"\n",
" price_4500000.0 price_4668000.0 price_5110800.0 price_5300000.0 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"21608 0.0 0.0 0.0 0.0 \n",
"21609 0.0 0.0 0.0 0.0 \n",
"21610 0.0 0.0 0.0 0.0 \n",
"21611 0.0 0.0 0.0 0.0 \n",
"21612 0.0 0.0 0.0 0.0 \n",
"\n",
" price_5350000.0 price_5570000.0 price_6885000.0 price_7062500.0 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"21608 0.0 0.0 0.0 0.0 \n",
"21609 0.0 0.0 0.0 0.0 \n",
"21610 0.0 0.0 0.0 0.0 \n",
"21611 0.0 0.0 0.0 0.0 \n",
"21612 0.0 0.0 0.0 0.0 \n",
"\n",
" price_7700000.0 \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 \n",
"... ... \n",
"21608 0.0 \n",
"21609 0.0 \n",
"21610 0.0 \n",
"21611 0.0 \n",
"21612 0.0 \n",
"\n",
"[21613 rows x 4162 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"house = pd.concat([house, encoded_values_df], axis=1)\n",
"\n",
"house"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"labels = [\"old\", \"middle\", \"new\"]\n",
"num_bins = 3"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([1900. , 1938.33333333, 1976.66666667, 2015. ]),\n",
" array([ 3067, 8120, 10426]))"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hist1, bins1 = np.histogram(\n",
" house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins=num_bins\n",
")\n",
"bins1, hist1"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built | \n",
" yr_built | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1955 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
" 1 | \n",
" 1951 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
" 2 | \n",
" 1933 | \n",
" (1900.0, 1938.333] | \n",
"
\n",
" \n",
" 3 | \n",
" 1965 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
" 4 | \n",
" 1987 | \n",
" (1976.667, 2015.0] | \n",
"
\n",
" \n",
" 5 | \n",
" 2001 | \n",
" (1976.667, 2015.0] | \n",
"
\n",
" \n",
" 6 | \n",
" 1995 | \n",
" (1976.667, 2015.0] | \n",
"
\n",
" \n",
" 7 | \n",
" 1963 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
" 8 | \n",
" 1960 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
" 9 | \n",
" 2003 | \n",
" (1976.667, 2015.0] | \n",
"
\n",
" \n",
" 10 | \n",
" 1965 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
" 11 | \n",
" 1942 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
" 12 | \n",
" 1927 | \n",
" (1900.0, 1938.333] | \n",
"
\n",
" \n",
" 13 | \n",
" 1977 | \n",
" (1976.667, 2015.0] | \n",
"
\n",
" \n",
" 14 | \n",
" 1900 | \n",
" NaN | \n",
"
\n",
" \n",
" 15 | \n",
" 1979 | \n",
" (1976.667, 2015.0] | \n",
"
\n",
" \n",
" 16 | \n",
" 1994 | \n",
" (1976.667, 2015.0] | \n",
"
\n",
" \n",
" 17 | \n",
" 1916 | \n",
" (1900.0, 1938.333] | \n",
"
\n",
" \n",
" 18 | \n",
" 1921 | \n",
" (1900.0, 1938.333] | \n",
"
\n",
" \n",
" 19 | \n",
" 1969 | \n",
" (1938.333, 1976.667] | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" yr_built yr_built\n",
"0 1955 (1938.333, 1976.667]\n",
"1 1951 (1938.333, 1976.667]\n",
"2 1933 (1900.0, 1938.333]\n",
"3 1965 (1938.333, 1976.667]\n",
"4 1987 (1976.667, 2015.0]\n",
"5 2001 (1976.667, 2015.0]\n",
"6 1995 (1976.667, 2015.0]\n",
"7 1963 (1938.333, 1976.667]\n",
"8 1960 (1938.333, 1976.667]\n",
"9 2003 (1976.667, 2015.0]\n",
"10 1965 (1938.333, 1976.667]\n",
"11 1942 (1938.333, 1976.667]\n",
"12 1927 (1900.0, 1938.333]\n",
"13 1977 (1976.667, 2015.0]\n",
"14 1900 NaN\n",
"15 1979 (1976.667, 2015.0]\n",
"16 1994 (1976.667, 2015.0]\n",
"17 1916 (1900.0, 1938.333]\n",
"18 1921 (1900.0, 1938.333]\n",
"19 1969 (1938.333, 1976.667]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins1))], axis=1).head(20)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built | \n",
" yr_built | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1955 | \n",
" middle | \n",
"
\n",
" \n",
" 1 | \n",
" 1951 | \n",
" middle | \n",
"
\n",
" \n",
" 2 | \n",
" 1933 | \n",
" old | \n",
"
\n",
" \n",
" 3 | \n",
" 1965 | \n",
" middle | \n",
"
\n",
" \n",
" 4 | \n",
" 1987 | \n",
" new | \n",
"
\n",
" \n",
" 5 | \n",
" 2001 | \n",
" new | \n",
"
\n",
" \n",
" 6 | \n",
" 1995 | \n",
" new | \n",
"
\n",
" \n",
" 7 | \n",
" 1963 | \n",
" middle | \n",
"
\n",
" \n",
" 8 | \n",
" 1960 | \n",
" middle | \n",
"
\n",
" \n",
" 9 | \n",
" 2003 | \n",
" new | \n",
"
\n",
" \n",
" 10 | \n",
" 1965 | \n",
" middle | \n",
"
\n",
" \n",
" 11 | \n",
" 1942 | \n",
" middle | \n",
"
\n",
" \n",
" 12 | \n",
" 1927 | \n",
" old | \n",
"
\n",
" \n",
" 13 | \n",
" 1977 | \n",
" new | \n",
"
\n",
" \n",
" 14 | \n",
" 1900 | \n",
" NaN | \n",
"
\n",
" \n",
" 15 | \n",
" 1979 | \n",
" new | \n",
"
\n",
" \n",
" 16 | \n",
" 1994 | \n",
" new | \n",
"
\n",
" \n",
" 17 | \n",
" 1916 | \n",
" old | \n",
"
\n",
" \n",
" 18 | \n",
" 1921 | \n",
" old | \n",
"
\n",
" \n",
" 19 | \n",
" 1969 | \n",
" middle | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" yr_built yr_built\n",
"0 1955 middle\n",
"1 1951 middle\n",
"2 1933 old\n",
"3 1965 middle\n",
"4 1987 new\n",
"5 2001 new\n",
"6 1995 new\n",
"7 1963 middle\n",
"8 1960 middle\n",
"9 2003 new\n",
"10 1965 middle\n",
"11 1942 middle\n",
"12 1927 old\n",
"13 1977 new\n",
"14 1900 NaN\n",
"15 1979 new\n",
"16 1994 new\n",
"17 1916 old\n",
"18 1921 old\n",
"19 1969 middle"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat(\n",
" [house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins1), labels=labels)], axis=1\n",
").head(20)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([1899., 1928., 1957., 1986., 2015.]),\n",
" array([2403, 4230, 6914, 8028, 38]))"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins2 = np.linspace(1899, 2015, 5)\n",
"tmp_bins2 = np.digitize(house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins2)\n",
"hist2 = np.bincount(tmp_bins2 - 1)\n",
"bins2, hist2"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built | \n",
" yr_built | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1955 | \n",
" (1928.0, 1957.0] | \n",
"
\n",
" \n",
" 1 | \n",
" 1951 | \n",
" (1928.0, 1957.0] | \n",
"
\n",
" \n",
" 2 | \n",
" 1933 | \n",
" (1928.0, 1957.0] | \n",
"
\n",
" \n",
" 3 | \n",
" 1965 | \n",
" (1957.0, 1986.0] | \n",
"
\n",
" \n",
" 4 | \n",
" 1987 | \n",
" (1986.0, 2015.0] | \n",
"
\n",
" \n",
" 5 | \n",
" 2001 | \n",
" (1986.0, 2015.0] | \n",
"
\n",
" \n",
" 6 | \n",
" 1995 | \n",
" (1986.0, 2015.0] | \n",
"
\n",
" \n",
" 7 | \n",
" 1963 | \n",
" (1957.0, 1986.0] | \n",
"
\n",
" \n",
" 8 | \n",
" 1960 | \n",
" (1957.0, 1986.0] | \n",
"
\n",
" \n",
" 9 | \n",
" 2003 | \n",
" (1986.0, 2015.0] | \n",
"
\n",
" \n",
" 10 | \n",
" 1965 | \n",
" (1957.0, 1986.0] | \n",
"
\n",
" \n",
" 11 | \n",
" 1942 | \n",
" (1928.0, 1957.0] | \n",
"
\n",
" \n",
" 12 | \n",
" 1927 | \n",
" (1899.0, 1928.0] | \n",
"
\n",
" \n",
" 13 | \n",
" 1977 | \n",
" (1957.0, 1986.0] | \n",
"
\n",
" \n",
" 14 | \n",
" 1900 | \n",
" (1899.0, 1928.0] | \n",
"
\n",
" \n",
" 15 | \n",
" 1979 | \n",
" (1957.0, 1986.0] | \n",
"
\n",
" \n",
" 16 | \n",
" 1994 | \n",
" (1986.0, 2015.0] | \n",
"
\n",
" \n",
" 17 | \n",
" 1916 | \n",
" (1899.0, 1928.0] | \n",
"
\n",
" \n",
" 18 | \n",
" 1921 | \n",
" (1899.0, 1928.0] | \n",
"
\n",
" \n",
" 19 | \n",
" 1969 | \n",
" (1957.0, 1986.0] | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" yr_built yr_built\n",
"0 1955 (1928.0, 1957.0]\n",
"1 1951 (1928.0, 1957.0]\n",
"2 1933 (1928.0, 1957.0]\n",
"3 1965 (1957.0, 1986.0]\n",
"4 1987 (1986.0, 2015.0]\n",
"5 2001 (1986.0, 2015.0]\n",
"6 1995 (1986.0, 2015.0]\n",
"7 1963 (1957.0, 1986.0]\n",
"8 1960 (1957.0, 1986.0]\n",
"9 2003 (1986.0, 2015.0]\n",
"10 1965 (1957.0, 1986.0]\n",
"11 1942 (1928.0, 1957.0]\n",
"12 1927 (1899.0, 1928.0]\n",
"13 1977 (1957.0, 1986.0]\n",
"14 1900 (1899.0, 1928.0]\n",
"15 1979 (1957.0, 1986.0]\n",
"16 1994 (1986.0, 2015.0]\n",
"17 1916 (1899.0, 1928.0]\n",
"18 1921 (1899.0, 1928.0]\n",
"19 1969 (1957.0, 1986.0]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins2))], axis=1).head(20)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([1899, 1957, 2001, 2015]), array([ 6633, 10439, 4541]))"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hist3, bins3 = np.histogram(\n",
" house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins=[1899, 1957, 2001, 2015]\n",
")\n",
"bins3, hist3"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built | \n",
" yr_built | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1955 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 1 | \n",
" 1951 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 2 | \n",
" 1933 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 3 | \n",
" 1965 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 4 | \n",
" 1987 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 5 | \n",
" 2001 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 6 | \n",
" 1995 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 7 | \n",
" 1963 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 8 | \n",
" 1960 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 9 | \n",
" 2003 | \n",
" (2001, 2015] | \n",
"
\n",
" \n",
" 10 | \n",
" 1965 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 11 | \n",
" 1942 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 12 | \n",
" 1927 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 13 | \n",
" 1977 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 14 | \n",
" 1900 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 15 | \n",
" 1979 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 16 | \n",
" 1994 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
" 17 | \n",
" 1916 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 18 | \n",
" 1921 | \n",
" (1899, 1957] | \n",
"
\n",
" \n",
" 19 | \n",
" 1969 | \n",
" (1957, 2001] | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" yr_built yr_built\n",
"0 1955 (1899, 1957]\n",
"1 1951 (1899, 1957]\n",
"2 1933 (1899, 1957]\n",
"3 1965 (1957, 2001]\n",
"4 1987 (1957, 2001]\n",
"5 2001 (1957, 2001]\n",
"6 1995 (1957, 2001]\n",
"7 1963 (1957, 2001]\n",
"8 1960 (1957, 2001]\n",
"9 2003 (2001, 2015]\n",
"10 1965 (1957, 2001]\n",
"11 1942 (1899, 1957]\n",
"12 1927 (1899, 1957]\n",
"13 1977 (1957, 2001]\n",
"14 1900 (1899, 1957]\n",
"15 1979 (1957, 2001]\n",
"16 1994 (1957, 2001]\n",
"17 1916 (1899, 1957]\n",
"18 1921 (1899, 1957]\n",
"19 1969 (1957, 2001]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins3))], axis=1).head(20)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built | \n",
" yr_built | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1955 | \n",
" old | \n",
"
\n",
" \n",
" 1 | \n",
" 1951 | \n",
" old | \n",
"
\n",
" \n",
" 2 | \n",
" 1933 | \n",
" old | \n",
"
\n",
" \n",
" 3 | \n",
" 1965 | \n",
" middle | \n",
"
\n",
" \n",
" 4 | \n",
" 1987 | \n",
" middle | \n",
"
\n",
" \n",
" 5 | \n",
" 2001 | \n",
" middle | \n",
"
\n",
" \n",
" 6 | \n",
" 1995 | \n",
" middle | \n",
"
\n",
" \n",
" 7 | \n",
" 1963 | \n",
" middle | \n",
"
\n",
" \n",
" 8 | \n",
" 1960 | \n",
" middle | \n",
"
\n",
" \n",
" 9 | \n",
" 2003 | \n",
" new | \n",
"
\n",
" \n",
" 10 | \n",
" 1965 | \n",
" middle | \n",
"
\n",
" \n",
" 11 | \n",
" 1942 | \n",
" old | \n",
"
\n",
" \n",
" 12 | \n",
" 1927 | \n",
" old | \n",
"
\n",
" \n",
" 13 | \n",
" 1977 | \n",
" middle | \n",
"
\n",
" \n",
" 14 | \n",
" 1900 | \n",
" old | \n",
"
\n",
" \n",
" 15 | \n",
" 1979 | \n",
" middle | \n",
"
\n",
" \n",
" 16 | \n",
" 1994 | \n",
" middle | \n",
"
\n",
" \n",
" 17 | \n",
" 1916 | \n",
" old | \n",
"
\n",
" \n",
" 18 | \n",
" 1921 | \n",
" old | \n",
"
\n",
" \n",
" 19 | \n",
" 1969 | \n",
" middle | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" yr_built yr_built\n",
"0 1955 old\n",
"1 1951 old\n",
"2 1933 old\n",
"3 1965 middle\n",
"4 1987 middle\n",
"5 2001 middle\n",
"6 1995 middle\n",
"7 1963 middle\n",
"8 1960 middle\n",
"9 2003 new\n",
"10 1965 middle\n",
"11 1942 old\n",
"12 1927 old\n",
"13 1977 middle\n",
"14 1900 old\n",
"15 1979 middle\n",
"16 1994 middle\n",
"17 1916 old\n",
"18 1921 old\n",
"19 1969 middle"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat(\n",
" [house[\"yr_built\"], pd.cut(house[\"yr_built\"], list(bins3), labels=labels)],\n",
" axis=1,\n",
").head(20)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built | \n",
" yr_built | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1955 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1951 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1933 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1965 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 1987 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 2001 | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 1995 | \n",
" 2 | \n",
"
\n",
" \n",
" 7 | \n",
" 1963 | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" 1960 | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" 2003 | \n",
" 2 | \n",
"
\n",
" \n",
" 10 | \n",
" 1965 | \n",
" 1 | \n",
"
\n",
" \n",
" 11 | \n",
" 1942 | \n",
" 0 | \n",
"
\n",
" \n",
" 12 | \n",
" 1927 | \n",
" 0 | \n",
"
\n",
" \n",
" 13 | \n",
" 1977 | \n",
" 1 | \n",
"
\n",
" \n",
" 14 | \n",
" 1900 | \n",
" 0 | \n",
"
\n",
" \n",
" 15 | \n",
" 1979 | \n",
" 1 | \n",
"
\n",
" \n",
" 16 | \n",
" 1994 | \n",
" 2 | \n",
"
\n",
" \n",
" 17 | \n",
" 1916 | \n",
" 0 | \n",
"
\n",
" \n",
" 18 | \n",
" 1921 | \n",
" 0 | \n",
"
\n",
" \n",
" 19 | \n",
" 1969 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" yr_built yr_built\n",
"0 1955 0\n",
"1 1951 0\n",
"2 1933 0\n",
"3 1965 1\n",
"4 1987 1\n",
"5 2001 2\n",
"6 1995 2\n",
"7 1963 1\n",
"8 1960 1\n",
"9 2003 2\n",
"10 1965 1\n",
"11 1942 0\n",
"12 1927 0\n",
"13 1977 1\n",
"14 1900 0\n",
"15 1979 1\n",
"16 1994 2\n",
"17 1916 0\n",
"18 1921 0\n",
"19 1969 1"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show each construction year next to its quantile-bin number (three equal-frequency bins).\n",
"pd.concat(\n",
"    objs=[\n",
"        house[\"yr_built\"],\n",
"        pd.qcut(house[\"yr_built\"], 3, labels=False),\n",
"    ],\n",
"    axis=\"columns\",\n",
").iloc[:20]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" yr_built | \n",
" yr_built | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1955 | \n",
" old | \n",
"
\n",
" \n",
" 1 | \n",
" 1951 | \n",
" old | \n",
"
\n",
" \n",
" 2 | \n",
" 1933 | \n",
" old | \n",
"
\n",
" \n",
" 3 | \n",
" 1965 | \n",
" middle | \n",
"
\n",
" \n",
" 4 | \n",
" 1987 | \n",
" middle | \n",
"
\n",
" \n",
" 5 | \n",
" 2001 | \n",
" new | \n",
"
\n",
" \n",
" 6 | \n",
" 1995 | \n",
" new | \n",
"
\n",
" \n",
" 7 | \n",
" 1963 | \n",
" middle | \n",
"
\n",
" \n",
" 8 | \n",
" 1960 | \n",
" middle | \n",
"
\n",
" \n",
" 9 | \n",
" 2003 | \n",
" new | \n",
"
\n",
" \n",
" 10 | \n",
" 1965 | \n",
" middle | \n",
"
\n",
" \n",
" 11 | \n",
" 1942 | \n",
" old | \n",
"
\n",
" \n",
" 12 | \n",
" 1927 | \n",
" old | \n",
"
\n",
" \n",
" 13 | \n",
" 1977 | \n",
" middle | \n",
"
\n",
" \n",
" 14 | \n",
" 1900 | \n",
" old | \n",
"
\n",
" \n",
" 15 | \n",
" 1979 | \n",
" middle | \n",
"
\n",
" \n",
" 16 | \n",
" 1994 | \n",
" new | \n",
"
\n",
" \n",
" 17 | \n",
" 1916 | \n",
" old | \n",
"
\n",
" \n",
" 18 | \n",
" 1921 | \n",
" old | \n",
"
\n",
" \n",
" 19 | \n",
" 1969 | \n",
" middle | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" yr_built yr_built\n",
"0 1955 old\n",
"1 1951 old\n",
"2 1933 old\n",
"3 1965 middle\n",
"4 1987 middle\n",
"5 2001 new\n",
"6 1995 new\n",
"7 1963 middle\n",
"8 1960 middle\n",
"9 2003 new\n",
"10 1965 middle\n",
"11 1942 old\n",
"12 1927 old\n",
"13 1977 middle\n",
"14 1900 old\n",
"15 1979 middle\n",
"16 1994 new\n",
"17 1916 old\n",
"18 1921 old\n",
"19 1969 middle"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same three equal-frequency bins as before, but rendered with the names from `labels`.\n",
"pd.concat(\n",
"    objs=[\n",
"        house[\"yr_built\"],\n",
"        pd.qcut(house[\"yr_built\"], 3, labels=labels),\n",
"    ],\n",
"    axis=\"columns\",\n",
").iloc[:20]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Пример конструирования признаков на основе существующих"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" price | \n",
" bedrooms | \n",
" bathrooms | \n",
" sqft_living | \n",
" sqft_lot | \n",
" floors | \n",
" grade | \n",
" sqft_above | \n",
" sqft_basement | \n",
" yr_built | \n",
" yr_renovated | \n",
" zipcode | \n",
" lat | \n",
" long | \n",
" sqft_living15 | \n",
" sqft_lot15 | \n",
" Price_category | \n",
" Renovated_flag | \n",
" Zipcode_area | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 20141013T000000 | \n",
" 221900.0 | \n",
" 3 | \n",
" 1.00 | \n",
" 1180 | \n",
" 5650 | \n",
" 1.0 | \n",
" 7 | \n",
" 1180 | \n",
" 0 | \n",
" 1955 | \n",
" 0 | \n",
" 98178 | \n",
" 47.5112 | \n",
" -122.257 | \n",
" 1340 | \n",
" 5650 | \n",
" Low | \n",
" 0 | \n",
" 981 | \n",
"
\n",
" \n",
" 1 | \n",
" 20141209T000000 | \n",
" 538000.0 | \n",
" 3 | \n",
" 2.25 | \n",
" 2570 | \n",
" 7242 | \n",
" 2.0 | \n",
" 7 | \n",
" 2170 | \n",
" 400 | \n",
" 1951 | \n",
" 1991 | \n",
" 98125 | \n",
" 47.7210 | \n",
" -122.319 | \n",
" 1690 | \n",
" 7639 | \n",
" Medium | \n",
" 1 | \n",
" 981 | \n",
"
\n",
" \n",
" 2 | \n",
" 20150225T000000 | \n",
" 180000.0 | \n",
" 2 | \n",
" 1.00 | \n",
" 770 | \n",
" 10000 | \n",
" 1.0 | \n",
" 6 | \n",
" 770 | \n",
" 0 | \n",
" 1933 | \n",
" 0 | \n",
" 98028 | \n",
" 47.7379 | \n",
" -122.233 | \n",
" 2720 | \n",
" 8062 | \n",
" Low | \n",
" 0 | \n",
" 980 | \n",
"
\n",
" \n",
" 3 | \n",
" 20141209T000000 | \n",
" 604000.0 | \n",
" 4 | \n",
" 3.00 | \n",
" 1960 | \n",
" 5000 | \n",
" 1.0 | \n",
" 7 | \n",
" 1050 | \n",
" 910 | \n",
" 1965 | \n",
" 0 | \n",
" 98136 | \n",
" 47.5208 | \n",
" -122.393 | \n",
" 1360 | \n",
" 5000 | \n",
" High | \n",
" 0 | \n",
" 981 | \n",
"
\n",
" \n",
" 4 | \n",
" 20150218T000000 | \n",
" 510000.0 | \n",
" 3 | \n",
" 2.00 | \n",
" 1680 | \n",
" 8080 | \n",
" 1.0 | \n",
" 8 | \n",
" 1680 | \n",
" 0 | \n",
" 1987 | \n",
" 0 | \n",
" 98074 | \n",
" 47.6168 | \n",
" -122.045 | \n",
" 1800 | \n",
" 7503 | \n",
" Medium | \n",
" 0 | \n",
" 980 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 21608 | \n",
" 20140521T000000 | \n",
" 360000.0 | \n",
" 3 | \n",
" 2.50 | \n",
" 1530 | \n",
" 1131 | \n",
" 3.0 | \n",
" 8 | \n",
" 1530 | \n",
" 0 | \n",
" 2009 | \n",
" 0 | \n",
" 98103 | \n",
" 47.6993 | \n",
" -122.346 | \n",
" 1530 | \n",
" 1509 | \n",
" Low | \n",
" 0 | \n",
" 981 | \n",
"
\n",
" \n",
" 21609 | \n",
" 20150223T000000 | \n",
" 400000.0 | \n",
" 4 | \n",
" 2.50 | \n",
" 2310 | \n",
" 5813 | \n",
" 2.0 | \n",
" 8 | \n",
" 2310 | \n",
" 0 | \n",
" 2014 | \n",
" 0 | \n",
" 98146 | \n",
" 47.5107 | \n",
" -122.362 | \n",
" 1830 | \n",
" 7200 | \n",
" Medium | \n",
" 0 | \n",
" 981 | \n",
"
\n",
" \n",
" 21610 | \n",
" 20140623T000000 | \n",
" 402101.0 | \n",
" 2 | \n",
" 0.75 | \n",
" 1020 | \n",
" 1350 | \n",
" 2.0 | \n",
" 7 | \n",
" 1020 | \n",
" 0 | \n",
" 2009 | \n",
" 0 | \n",
" 98144 | \n",
" 47.5944 | \n",
" -122.299 | \n",
" 1020 | \n",
" 2007 | \n",
" Medium | \n",
" 0 | \n",
" 981 | \n",
"
\n",
" \n",
" 21611 | \n",
" 20150116T000000 | \n",
" 400000.0 | \n",
" 3 | \n",
" 2.50 | \n",
" 1600 | \n",
" 2388 | \n",
" 2.0 | \n",
" 8 | \n",
" 1600 | \n",
" 0 | \n",
" 2004 | \n",
" 0 | \n",
" 98027 | \n",
" 47.5345 | \n",
" -122.069 | \n",
" 1410 | \n",
" 1287 | \n",
" Medium | \n",
" 0 | \n",
" 980 | \n",
"
\n",
" \n",
" 21612 | \n",
" 20141015T000000 | \n",
" 325000.0 | \n",
" 2 | \n",
" 0.75 | \n",
" 1020 | \n",
" 1076 | \n",
" 2.0 | \n",
" 7 | \n",
" 1020 | \n",
" 0 | \n",
" 2008 | \n",
" 0 | \n",
" 98144 | \n",
" 47.5941 | \n",
" -122.299 | \n",
" 1020 | \n",
" 1357 | \n",
" Low | \n",
" 0 | \n",
" 981 | \n",
"
\n",
" \n",
"
\n",
"
21613 rows × 20 columns
\n",
"
"
],
"text/plain": [
" date price bedrooms bathrooms sqft_living sqft_lot \\\n",
"0 20141013T000000 221900.0 3 1.00 1180 5650 \n",
"1 20141209T000000 538000.0 3 2.25 2570 7242 \n",
"2 20150225T000000 180000.0 2 1.00 770 10000 \n",
"3 20141209T000000 604000.0 4 3.00 1960 5000 \n",
"4 20150218T000000 510000.0 3 2.00 1680 8080 \n",
"... ... ... ... ... ... ... \n",
"21608 20140521T000000 360000.0 3 2.50 1530 1131 \n",
"21609 20150223T000000 400000.0 4 2.50 2310 5813 \n",
"21610 20140623T000000 402101.0 2 0.75 1020 1350 \n",
"21611 20150116T000000 400000.0 3 2.50 1600 2388 \n",
"21612 20141015T000000 325000.0 2 0.75 1020 1076 \n",
"\n",
" floors grade sqft_above sqft_basement yr_built yr_renovated \\\n",
"0 1.0 7 1180 0 1955 0 \n",
"1 2.0 7 2170 400 1951 1991 \n",
"2 1.0 6 770 0 1933 0 \n",
"3 1.0 7 1050 910 1965 0 \n",
"4 1.0 8 1680 0 1987 0 \n",
"... ... ... ... ... ... ... \n",
"21608 3.0 8 1530 0 2009 0 \n",
"21609 2.0 8 2310 0 2014 0 \n",
"21610 2.0 7 1020 0 2009 0 \n",
"21611 2.0 8 1600 0 2004 0 \n",
"21612 2.0 7 1020 0 2008 0 \n",
"\n",
" zipcode lat long sqft_living15 sqft_lot15 Price_category \\\n",
"0 98178 47.5112 -122.257 1340 5650 Low \n",
"1 98125 47.7210 -122.319 1690 7639 Medium \n",
"2 98028 47.7379 -122.233 2720 8062 Low \n",
"3 98136 47.5208 -122.393 1360 5000 High \n",
"4 98074 47.6168 -122.045 1800 7503 Medium \n",
"... ... ... ... ... ... ... \n",
"21608 98103 47.6993 -122.346 1530 1509 Low \n",
"21609 98146 47.5107 -122.362 1830 7200 Medium \n",
"21610 98144 47.5944 -122.299 1020 2007 Medium \n",
"21611 98027 47.5345 -122.069 1410 1287 Medium \n",
"21612 98144 47.5941 -122.299 1020 1357 Low \n",
"\n",
" Renovated_flag Zipcode_area \n",
"0 0 981 \n",
"1 1 981 \n",
"2 0 980 \n",
"3 0 981 \n",
"4 0 980 \n",
"... ... ... \n",
"21608 0 981 \n",
"21609 0 981 \n",
"21610 0 981 \n",
"21611 0 980 \n",
"21612 0 981 \n",
"\n",
"[21613 rows x 20 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remove columns not used in this example (errors=\"ignore\" tolerates columns that are\n",
"# already absent), then drop every row that still contains a missing value.\n",
"house_cleaned = house.drop(\n",
"    columns=[\"waterfront\", \"view\", \"condition\"], errors=\"ignore\"\n",
").dropna()\n",
"\n",
"# \"Price_category\": split the houses into three equal-frequency price bands.\n",
"house_cleaned[\"Price_category\"] = pd.qcut(\n",
"    house_cleaned[\"price\"], q=3, labels=[\"Low\", \"Medium\", \"High\"]\n",
")\n",
"\n",
"# \"Renovated_flag\": 1 if yr_renovated is greater than 0, otherwise 0.\n",
"# A vectorized comparison replaces the per-row Python lambda.\n",
"house_cleaned[\"Renovated_flag\"] = (house_cleaned[\"yr_renovated\"] > 0).astype(\"int64\")\n",
"\n",
"# \"Zipcode_area\": first three characters of the zip code, taken with the .str accessor.\n",
"house_cleaned[\"Zipcode_area\"] = house_cleaned[\"zipcode\"].astype(str).str[:3]\n",
"\n",
"house_cleaned"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" price bedrooms bathrooms sqft_living sqft_lot floors waterfront \\\n",
"id \n",
"0 221900.0 3 1.00 1180 5650 1.0 0 \n",
"1 538000.0 3 2.25 2570 7242 2.0 0 \n",
"2 180000.0 2 1.00 770 10000 1.0 0 \n",
"3 604000.0 4 3.00 1960 5000 1.0 0 \n",
"4 510000.0 3 2.00 1680 8080 1.0 0 \n",
"\n",
" view condition grade ... yr_renovated zipcode lat long \\\n",
"id ... \n",
"0 0 3 7 ... 0 98178 47.5112 -122.257 \n",
"1 0 3 7 ... 1991 98125 47.7210 -122.319 \n",
"2 0 3 6 ... 0 98028 47.7379 -122.233 \n",
"3 0 5 7 ... 0 98136 47.5208 -122.393 \n",
"4 0 3 8 ... 0 98074 47.6168 -122.045 \n",
"\n",
" sqft_living15 sqft_lot15 HOUR(date) MONTH(date) WEEKDAY(date) \\\n",
"id \n",
"0 1340 5650 0 10 0 \n",
"1 1690 7639 0 12 1 \n",
"2 2720 8062 0 2 2 \n",
"3 1360 5000 0 12 1 \n",
"4 1800 7503 0 2 2 \n",
"\n",
" YEAR(date) \n",
"id \n",
"0 2014 \n",
"1 2014 \n",
"2 2015 \n",
"3 2014 \n",
"4 2015 \n",
"\n",
"[5 rows x 23 columns]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Mai\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"c:\\Mai\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
" agg_primitives: ['count', 'mean', 'mode', 'sum']\n",
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n"
]
}
],
"source": [
"import featuretools as ft\n",
"import pandas as pd\n",
"from woodwork.logical_types import Categorical, Datetime\n",
"\n",
"# Load the raw data.\n",
"df = pd.read_csv(\"data/kc_house_data.csv\")\n",
"\n",
"# Use the row position as the unique index for the EntitySet.\n",
"# NOTE: this assignment is unconditional, so an \"id\" column already present in the CSV is overwritten.\n",
"df[\"id\"] = range(len(df))\n",
"\n",
"# Create an EntitySet for the house sales data.\n",
"es = ft.EntitySet(id=\"house_sales\")\n",
"\n",
"# Register the DataFrame in the EntitySet with explicit logical types.\n",
"es = es.add_dataframe(\n",
"    dataframe_name=\"houses\",\n",
"    dataframe=df,\n",
"    index=\"id\",  # unique identifier of each row\n",
"    logical_types={\n",
"        \"date\": Datetime,\n",
"        \"zipcode\": Categorical,\n",
"        \"condition\": Categorical,\n",
"        \"grade\": Categorical,\n",
"        \"view\": Categorical,\n",
"        \"waterfront\": Categorical,\n",
"    },\n",
")\n",
"\n",
"# Automatic feature construction.\n",
"# The EntitySet holds a single dataframe, so aggregation primitives have nothing to\n",
"# aggregate over and features deeper than 1 cannot be built; requesting them only\n",
"# triggered UnusedPrimitiveWarning and the max_depth warning.\n",
"feature_matrix, feature_defs = ft.dfs(\n",
"    entityset=es,\n",
"    target_dataframe_name=\"houses\",  # name of the target dataframe\n",
"    agg_primitives=[],\n",
"    trans_primitives=[\"year\", \"month\", \"weekday\", \"hour\"],  # applied to the date column\n",
"    max_depth=1,\n",
")\n",
"\n",
"# Preview the generated feature matrix (rich display instead of print).\n",
"feature_matrix.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Box plot of the \"condition\" column.\n",
"house[[\"condition\"]].boxplot()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" price | \n",
" ConditionClip | \n",
"
\n",
" \n",
" \n",
" \n",
" 36 | \n",
" 20140528T000000 | \n",
" 550000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 380 | \n",
" 20140916T000000 | \n",
" 270000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 397 | \n",
" 20140623T000000 | \n",
" 365000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 1442 | \n",
" 20141107T000000 | \n",
" 352950.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 1734 | \n",
" 20150102T000000 | \n",
" 252000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 2223 | \n",
" 20150316T000000 | \n",
" 535000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 3004 | \n",
" 20141231T000000 | \n",
" 441000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 3202 | \n",
" 20140509T000000 | \n",
" 255000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 3975 | \n",
" 20150511T000000 | \n",
" 210000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 4651 | \n",
" 20141002T000000 | \n",
" 125000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 7376 | \n",
" 20141107T000000 | \n",
" 295000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 7636 | \n",
" 20150120T000000 | \n",
" 190000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 12306 | \n",
" 20150128T000000 | \n",
" 196000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 12453 | \n",
" 20150402T000000 | \n",
" 305000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 12668 | \n",
" 20140729T000000 | \n",
" 227000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 13628 | \n",
" 20140716T000000 | \n",
" 105500.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 13629 | \n",
" 20150316T000000 | \n",
" 445000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 14987 | \n",
" 20141202T000000 | \n",
" 432500.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 15293 | \n",
" 20140506T000000 | \n",
" 78000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 15337 | \n",
" 20140630T000000 | \n",
" 235000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 15371 | \n",
" 20150114T000000 | \n",
" 658000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 15712 | \n",
" 20140724T000000 | \n",
" 150000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 16198 | \n",
" 20150324T000000 | \n",
" 81000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 16893 | \n",
" 20141210T000000 | \n",
" 125000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 16942 | \n",
" 20140611T000000 | \n",
" 427000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 17805 | \n",
" 20150501T000000 | \n",
" 380000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 18332 | \n",
" 20140924T000000 | \n",
" 130000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 18645 | \n",
" 20141216T000000 | \n",
" 575000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 18876 | \n",
" 20150211T000000 | \n",
" 1500000.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 19452 | \n",
" 20140926T000000 | \n",
" 142000.0 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date price ConditionClip\n",
"36 20140528T000000 550000.0 2\n",
"380 20140916T000000 270000.0 2\n",
"397 20140623T000000 365000.0 2\n",
"1442 20141107T000000 352950.0 2\n",
"1734 20150102T000000 252000.0 2\n",
"2223 20150316T000000 535000.0 2\n",
"3004 20141231T000000 441000.0 2\n",
"3202 20140509T000000 255000.0 2\n",
"3975 20150511T000000 210000.0 2\n",
"4651 20141002T000000 125000.0 2\n",
"7376 20141107T000000 295000.0 2\n",
"7636 20150120T000000 190000.0 2\n",
"12306 20150128T000000 196000.0 2\n",
"12453 20150402T000000 305000.0 2\n",
"12668 20140729T000000 227000.0 2\n",
"13628 20140716T000000 105500.0 2\n",
"13629 20150316T000000 445000.0 2\n",
"14987 20141202T000000 432500.0 2\n",
"15293 20140506T000000 78000.0 2\n",
"15337 20140630T000000 235000.0 2\n",
"15371 20150114T000000 658000.0 2\n",
"15712 20140724T000000 150000.0 2\n",
"16198 20150324T000000 81000.0 2\n",
"16893 20141210T000000 125000.0 2\n",
"16942 20140611T000000 427000.0 2\n",
"17805 20150501T000000 380000.0 2\n",
"18332 20140924T000000 130000.0 2\n",
"18645 20141216T000000 575000.0 2\n",
"18876 20150211T000000 1500000.0 2\n",
"19452 20140926T000000 142000.0 2"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work on a copy so the original `house` frame is left untouched.\n",
"house_norm = house.copy()\n",
"\n",
"# Clamp the condition score to the closed range [2, 5].\n",
"house_norm[\"ConditionClip\"] = house_norm[\"condition\"].clip(lower=2, upper=5)\n",
"\n",
"# Rows whose original condition was below the lower bound, with their clipped value.\n",
"house_norm.loc[house_norm[\"condition\"] < 2, [\"date\", \"price\", \"ConditionClip\"]]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5.0\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" condition | \n",
" ConditionWinsorize | \n",
"
\n",
" \n",
" \n",
" \n",
" 36 | \n",
" 20140528T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 380 | \n",
" 20140916T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 397 | \n",
" 20140623T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 1442 | \n",
" 20141107T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 1734 | \n",
" 20150102T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 2223 | \n",
" 20150316T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 3004 | \n",
" 20141231T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 3202 | \n",
" 20140509T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 3975 | \n",
" 20150511T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 4651 | \n",
" 20141002T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 7376 | \n",
" 20141107T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 7636 | \n",
" 20150120T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 12306 | \n",
" 20150128T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 12453 | \n",
" 20150402T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 12668 | \n",
" 20140729T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 13628 | \n",
" 20140716T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 13629 | \n",
" 20150316T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 14987 | \n",
" 20141202T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 15293 | \n",
" 20140506T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 15337 | \n",
" 20140630T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 15371 | \n",
" 20150114T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 15712 | \n",
" 20140724T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 16198 | \n",
" 20150324T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 16893 | \n",
" 20141210T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 16942 | \n",
" 20140611T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 17805 | \n",
" 20150501T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 18332 | \n",
" 20140924T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 18645 | \n",
" 20141216T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 18876 | \n",
" 20150211T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 19452 | \n",
" 20140926T000000 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date condition ConditionWinsorize\n",
"36 20140528T000000 1 3\n",
"380 20140916T000000 1 3\n",
"397 20140623T000000 1 3\n",
"1442 20141107T000000 1 3\n",
"1734 20150102T000000 1 3\n",
"2223 20150316T000000 1 3\n",
"3004 20141231T000000 1 3\n",
"3202 20140509T000000 1 3\n",
"3975 20150511T000000 1 3\n",
"4651 20141002T000000 1 3\n",
"7376 20141107T000000 1 3\n",
"7636 20150120T000000 1 3\n",
"12306 20150128T000000 1 3\n",
"12453 20150402T000000 1 3\n",
"12668 20140729T000000 1 3\n",
"13628 20140716T000000 1 3\n",
"13629 20150316T000000 1 3\n",
"14987 20141202T000000 1 3\n",
"15293 20140506T000000 1 3\n",
"15337 20140630T000000 1 3\n",
"15371 20150114T000000 1 3\n",
"15712 20140724T000000 1 3\n",
"16198 20150324T000000 1 3\n",
"16893 20141210T000000 1 3\n",
"16942 20140611T000000 1 3\n",
"17805 20150501T000000 1 3\n",
"18332 20140924T000000 1 3\n",
"18645 20141216T000000 1 3\n",
"18876 20150211T000000 1 3\n",
"19452 20140926T000000 1 3"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.stats.mstats import winsorize\n",
"\n",
"# 95th percentile of the raw condition score, printed for reference.\n",
"print(house_norm[\"condition\"].quantile(0.95))\n",
"\n",
"# Fill missing values with the column mean, then winsorize with limits of\n",
"# 1% on the low end and 5% on the high end.\n",
"house_norm[\"ConditionWinsorize\"] = winsorize(\n",
"    house_norm[\"condition\"].fillna(house_norm[\"condition\"].mean()),\n",
"    limits=(0.01, 0.05),\n",
"    inplace=False,\n",
")\n",
"\n",
"# Rows whose original condition is below 2, next to their winsorized value.\n",
"house_norm.loc[house_norm[\"condition\"] < 2, [\"date\", \"condition\", \"ConditionWinsorize\"]]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" condition | \n",
" ConditionNorm | \n",
" ConditionClipNorm | \n",
" ConditionWinsorizeNorm | \n",
" ConditionWinsorizeNorm2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 221900.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 538000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 180000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 604000.0 | \n",
" 5 | \n",
" 1.00 | \n",
" 1.000000 | \n",
" 1.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 510000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 1225000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 257500.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 291850.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 8 | \n",
" 229500.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 9 | \n",
" 323000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 10 | \n",
" 662500.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 11 | \n",
" 468000.0 | \n",
" 4 | \n",
" 0.75 | \n",
" 0.666667 | \n",
" 0.5 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 12 | \n",
" 310000.0 | \n",
" 4 | \n",
" 0.75 | \n",
" 0.666667 | \n",
" 0.5 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 13 | \n",
" 400000.0 | \n",
" 4 | \n",
" 0.75 | \n",
" 0.666667 | \n",
" 0.5 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 14 | \n",
" 530000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 15 | \n",
" 650000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 16 | \n",
" 395000.0 | \n",
" 3 | \n",
" 0.50 | \n",
" 0.333333 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 17 | \n",
" 485000.0 | \n",
" 4 | \n",
" 0.75 | \n",
" 0.666667 | \n",
" 0.5 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 18 | \n",
" 189000.0 | \n",
" 4 | \n",
" 0.75 | \n",
" 0.666667 | \n",
" 0.5 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 19 | \n",
" 230000.0 | \n",
" 4 | \n",
" 0.75 | \n",
" 0.666667 | \n",
" 0.5 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" price condition ConditionNorm ConditionClipNorm \\\n",
"0 221900.0 3 0.50 0.333333 \n",
"1 538000.0 3 0.50 0.333333 \n",
"2 180000.0 3 0.50 0.333333 \n",
"3 604000.0 5 1.00 1.000000 \n",
"4 510000.0 3 0.50 0.333333 \n",
"5 1225000.0 3 0.50 0.333333 \n",
"6 257500.0 3 0.50 0.333333 \n",
"7 291850.0 3 0.50 0.333333 \n",
"8 229500.0 3 0.50 0.333333 \n",
"9 323000.0 3 0.50 0.333333 \n",
"10 662500.0 3 0.50 0.333333 \n",
"11 468000.0 4 0.75 0.666667 \n",
"12 310000.0 4 0.75 0.666667 \n",
"13 400000.0 4 0.75 0.666667 \n",
"14 530000.0 3 0.50 0.333333 \n",
"15 650000.0 3 0.50 0.333333 \n",
"16 395000.0 3 0.50 0.333333 \n",
"17 485000.0 4 0.75 0.666667 \n",
"18 189000.0 4 0.75 0.666667 \n",
"19 230000.0 4 0.75 0.666667 \n",
"\n",
" ConditionWinsorizeNorm ConditionWinsorizeNorm2 \n",
"0 0.0 -1.0 \n",
"1 0.0 -1.0 \n",
"2 0.0 -1.0 \n",
"3 1.0 1.0 \n",
"4 0.0 -1.0 \n",
"5 0.0 -1.0 \n",
"6 0.0 -1.0 \n",
"7 0.0 -1.0 \n",
"8 0.0 -1.0 \n",
"9 0.0 -1.0 \n",
"10 0.0 -1.0 \n",
"11 0.5 0.0 \n",
"12 0.5 0.0 \n",
"13 0.5 0.0 \n",
"14 0.0 -1.0 \n",
"15 0.0 -1.0 \n",
"16 0.0 -1.0 \n",
"17 0.5 0.0 \n",
"18 0.5 0.0 \n",
"19 0.5 0.0 "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"# One scaler with the default feature range and one mapping to [-1, 1].\n",
"min_max_scaler = preprocessing.MinMaxScaler()\n",
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
"\n",
"# The scaler is refit for every column: each column is passed as an (n, 1) array\n",
"# and the scaled result is flattened back to one dimension before it is stored.\n",
"house_norm[\"ConditionNorm\"] = min_max_scaler.fit_transform(\n",
"    house_norm[[\"condition\"]].to_numpy()\n",
").ravel()\n",
"\n",
"house_norm[\"ConditionClipNorm\"] = min_max_scaler.fit_transform(\n",
"    house_norm[[\"ConditionClip\"]].to_numpy()\n",
").ravel()\n",
"\n",
"house_norm[\"ConditionWinsorizeNorm\"] = min_max_scaler.fit_transform(\n",
"    house_norm[[\"ConditionWinsorize\"]].to_numpy()\n",
").ravel()\n",
"\n",
"house_norm[\"ConditionWinsorizeNorm2\"] = min_max_scaler_2.fit_transform(\n",
"    house_norm[[\"ConditionWinsorize\"]].to_numpy()\n",
").ravel()\n",
"\n",
"# Compare the raw score with each normalized variant.\n",
"house_norm.loc[\n",
"    :,\n",
"    [\n",
"        \"price\",\n",
"        \"condition\",\n",
"        \"ConditionNorm\",\n",
"        \"ConditionClipNorm\",\n",
"        \"ConditionWinsorizeNorm\",\n",
"        \"ConditionWinsorizeNorm2\",\n",
"    ],\n",
"].iloc[:20]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" condition | \n",
" ConditionStand | \n",
" ConditionClipStand | \n",
" ConditionWinsorizeStand | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 221900.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 1 | \n",
" 538000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 2 | \n",
" 180000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 3 | \n",
" 604000.0 | \n",
" 5 | \n",
" 2.444294 | \n",
" 2.457597 | \n",
" 2.494726 | \n",
"
\n",
" \n",
" 4 | \n",
" 510000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 5 | \n",
" 1225000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 6 | \n",
" 257500.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 7 | \n",
" 291850.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 8 | \n",
" 229500.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 9 | \n",
" 323000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 10 | \n",
" 662500.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 11 | \n",
" 468000.0 | \n",
" 4 | \n",
" 0.907554 | \n",
" 0.911143 | \n",
" 0.915622 | \n",
"
\n",
" \n",
" 12 | \n",
" 310000.0 | \n",
" 4 | \n",
" 0.907554 | \n",
" 0.911143 | \n",
" 0.915622 | \n",
"
\n",
" \n",
" 13 | \n",
" 400000.0 | \n",
" 4 | \n",
" 0.907554 | \n",
" 0.911143 | \n",
" 0.915622 | \n",
"
\n",
" \n",
" 14 | \n",
" 530000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 15 | \n",
" 650000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 16 | \n",
" 395000.0 | \n",
" 3 | \n",
" -0.629187 | \n",
" -0.635310 | \n",
" -0.663482 | \n",
"
\n",
" \n",
" 17 | \n",
" 485000.0 | \n",
" 4 | \n",
" 0.907554 | \n",
" 0.911143 | \n",
" 0.915622 | \n",
"
\n",
" \n",
" 18 | \n",
" 189000.0 | \n",
" 4 | \n",
" 0.907554 | \n",
" 0.911143 | \n",
" 0.915622 | \n",
"
\n",
" \n",
" 19 | \n",
" 230000.0 | \n",
" 4 | \n",
" 0.907554 | \n",
" 0.911143 | \n",
" 0.915622 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" price condition ConditionStand ConditionClipStand \\\n",
"0 221900.0 3 -0.629187 -0.635310 \n",
"1 538000.0 3 -0.629187 -0.635310 \n",
"2 180000.0 3 -0.629187 -0.635310 \n",
"3 604000.0 5 2.444294 2.457597 \n",
"4 510000.0 3 -0.629187 -0.635310 \n",
"5 1225000.0 3 -0.629187 -0.635310 \n",
"6 257500.0 3 -0.629187 -0.635310 \n",
"7 291850.0 3 -0.629187 -0.635310 \n",
"8 229500.0 3 -0.629187 -0.635310 \n",
"9 323000.0 3 -0.629187 -0.635310 \n",
"10 662500.0 3 -0.629187 -0.635310 \n",
"11 468000.0 4 0.907554 0.911143 \n",
"12 310000.0 4 0.907554 0.911143 \n",
"13 400000.0 4 0.907554 0.911143 \n",
"14 530000.0 3 -0.629187 -0.635310 \n",
"15 650000.0 3 -0.629187 -0.635310 \n",
"16 395000.0 3 -0.629187 -0.635310 \n",
"17 485000.0 4 0.907554 0.911143 \n",
"18 189000.0 4 0.907554 0.911143 \n",
"19 230000.0 4 0.907554 0.911143 \n",
"\n",
" ConditionWinsorizeStand \n",
"0 -0.663482 \n",
"1 -0.663482 \n",
"2 -0.663482 \n",
"3 2.494726 \n",
"4 -0.663482 \n",
"5 -0.663482 \n",
"6 -0.663482 \n",
"7 -0.663482 \n",
"8 -0.663482 \n",
"9 -0.663482 \n",
"10 -0.663482 \n",
"11 0.915622 \n",
"12 0.915622 \n",
"13 0.915622 \n",
"14 -0.663482 \n",
"15 -0.663482 \n",
"16 -0.663482 \n",
"17 0.915622 \n",
"18 0.915622 \n",
"19 0.915622 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"# Standardization scaler; the name was previously misspelled as `stndart_scaler`.\n",
"standard_scaler = preprocessing.StandardScaler()\n",
"\n",
"# The scaler is refit for every column: each column is passed as an (n, 1) array\n",
"# and the scaled result is flattened back to one dimension before it is stored.\n",
"house_norm[\"ConditionStand\"] = standard_scaler.fit_transform(\n",
"    house_norm[[\"condition\"]].to_numpy()\n",
").ravel()\n",
"\n",
"house_norm[\"ConditionClipStand\"] = standard_scaler.fit_transform(\n",
"    house_norm[[\"ConditionClip\"]].to_numpy()\n",
").ravel()\n",
"\n",
"house_norm[\"ConditionWinsorizeStand\"] = standard_scaler.fit_transform(\n",
"    house_norm[[\"ConditionWinsorize\"]].to_numpy()\n",
").ravel()\n",
"\n",
"# Compare the raw score with each standardized variant.\n",
"house_norm[\n",
"    [\n",
"        \"price\",\n",
"        \"condition\",\n",
"        \"ConditionStand\",\n",
"        \"ConditionClipStand\",\n",
"        \"ConditionWinsorizeStand\",\n",
"    ]\n",
"].head(20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}