{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
020141013T000000221900.031.00118056501.0003711800195509817847.5112-122.25713405650
120141209T000000538000.032.25257072422.000372170400195119919812547.7210-122.31916907639
220150225T000000180000.021.00770100001.000367700193309802847.7379-122.23327208062
320141209T000000604000.043.00196050001.000571050910196509813647.5208-122.39313605000
420150218T000000510000.032.00168080801.0003816800198709807447.6168-122.04518007503
...............................................................
2160820140521T000000360000.032.50153011313.0003815300200909810347.6993-122.34615301509
2160920150223T000000400000.042.50231058132.0003823100201409814647.5107-122.36218307200
2161020140623T000000402101.020.75102013502.0003710200200909814447.5944-122.29910202007
2161120150116T000000400000.032.50160023882.0003816000200409802747.5345-122.06914101287
2161220141015T000000325000.020.75102010762.0003710200200809814447.5941-122.29910201357
\n", "

21613 rows × 20 columns

\n", "
" ], "text/plain": [ " date price bedrooms bathrooms sqft_living sqft_lot \\\n", "0 20141013T000000 221900.0 3 1.00 1180 5650 \n", "1 20141209T000000 538000.0 3 2.25 2570 7242 \n", "2 20150225T000000 180000.0 2 1.00 770 10000 \n", "3 20141209T000000 604000.0 4 3.00 1960 5000 \n", "4 20150218T000000 510000.0 3 2.00 1680 8080 \n", "... ... ... ... ... ... ... \n", "21608 20140521T000000 360000.0 3 2.50 1530 1131 \n", "21609 20150223T000000 400000.0 4 2.50 2310 5813 \n", "21610 20140623T000000 402101.0 2 0.75 1020 1350 \n", "21611 20150116T000000 400000.0 3 2.50 1600 2388 \n", "21612 20141015T000000 325000.0 2 0.75 1020 1076 \n", "\n", " floors waterfront view condition grade sqft_above sqft_basement \\\n", "0 1.0 0 0 3 7 1180 0 \n", "1 2.0 0 0 3 7 2170 400 \n", "2 1.0 0 0 3 6 770 0 \n", "3 1.0 0 0 5 7 1050 910 \n", "4 1.0 0 0 3 8 1680 0 \n", "... ... ... ... ... ... ... ... \n", "21608 3.0 0 0 3 8 1530 0 \n", "21609 2.0 0 0 3 8 2310 0 \n", "21610 2.0 0 0 3 7 1020 0 \n", "21611 2.0 0 0 3 8 1600 0 \n", "21612 2.0 0 0 3 7 1020 0 \n", "\n", " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", "0 1955 0 98178 47.5112 -122.257 1340 \n", "1 1951 1991 98125 47.7210 -122.319 1690 \n", "2 1933 0 98028 47.7379 -122.233 2720 \n", "3 1965 0 98136 47.5208 -122.393 1360 \n", "4 1987 0 98074 47.6168 -122.045 1800 \n", "... ... ... ... ... ... ... \n", "21608 2009 0 98103 47.6993 -122.346 1530 \n", "21609 2014 0 98146 47.5107 -122.362 1830 \n", "21610 2009 0 98144 47.5944 -122.299 1020 \n", "21611 2004 0 98027 47.5345 -122.069 1410 \n", "21612 2008 0 98144 47.5941 -122.299 1020 \n", "\n", " sqft_lot15 \n", "0 5650 \n", "1 7639 \n", "2 8062 \n", "3 5000 \n", "4 7503 \n", "... ... 
\n", "21608 1509 \n", "21609 7200 \n", "21610 2007 \n", "21611 1287 \n", "21612 1357 \n", "\n", "[21613 rows x 20 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "# Read the CSV with `id` as the index, then replace that index with a fresh\n", "# default integer index so the `id` values are discarded.\n", "house = pd.read_csv(\"data/kc_house_data.csv\", index_col=\"id\").reset_index(drop=True)\n", "\n", "house" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_built_1901yr_built_1902yr_built_1903yr_built_1904yr_built_1905yr_built_1906yr_built_1907yr_built_1908yr_built_1909yr_built_1910...price_4489000.0price_4500000.0price_4668000.0price_5110800.0price_5300000.0price_5350000.0price_5570000.0price_6885000.0price_7062500.0price_7700000.0
00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
216080.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
216090.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
216100.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
216110.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
216120.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

21613 rows × 4142 columns

\n", "
" ], "text/plain": [ " yr_built_1901 yr_built_1902 yr_built_1903 yr_built_1904 \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", " yr_built_1905 yr_built_1906 yr_built_1907 yr_built_1908 \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", " yr_built_1909 yr_built_1910 ... price_4489000.0 price_4500000.0 \\\n", "0 0.0 0.0 ... 0.0 0.0 \n", "1 0.0 0.0 ... 0.0 0.0 \n", "2 0.0 0.0 ... 0.0 0.0 \n", "3 0.0 0.0 ... 0.0 0.0 \n", "4 0.0 0.0 ... 0.0 0.0 \n", "... ... ... ... ... ... \n", "21608 0.0 0.0 ... 0.0 0.0 \n", "21609 0.0 0.0 ... 0.0 0.0 \n", "21610 0.0 0.0 ... 0.0 0.0 \n", "21611 0.0 0.0 ... 0.0 0.0 \n", "21612 0.0 0.0 ... 0.0 0.0 \n", "\n", " price_4668000.0 price_5110800.0 price_5300000.0 price_5350000.0 \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", " price_5570000.0 price_6885000.0 price_7062500.0 price_7700000.0 \n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... 
\n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", "[21613 rows x 4142 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "import numpy as np\n", "\n", "# One-hot encode the selected columns, dropping the first category of each\n", "# column and returning a dense (non-sparse) array.\n", "# NOTE(review): `price` is numeric with thousands of distinct values, so this\n", "# yields 4142 indicator columns - confirm that encoding it is intended.\n", "columns_to_encode = [\"yr_built\", \"price\"]\n", "encoder = OneHotEncoder(drop=\"first\", sparse_output=False)\n", "encoded_values = encoder.fit_transform(house[columns_to_encode])\n", "encoded_columns = encoder.get_feature_names_out(columns_to_encode)\n", "\n", "# Wrap the encoded array in a DataFrame labelled with the generated names.\n", "encoded_values_df = pd.DataFrame(data=encoded_values, columns=encoded_columns)\n", "\n", "encoded_values_df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewcondition...price_4489000.0price_4500000.0price_4668000.0price_5110800.0price_5300000.0price_5350000.0price_5570000.0price_6885000.0price_7062500.0price_7700000.0
020141013T000000221900.031.00118056501.0003...0.00.00.00.00.00.00.00.00.00.0
120141209T000000538000.032.25257072422.0003...0.00.00.00.00.00.00.00.00.00.0
220150225T000000180000.021.00770100001.0003...0.00.00.00.00.00.00.00.00.00.0
320141209T000000604000.043.00196050001.0005...0.00.00.00.00.00.00.00.00.00.0
420150218T000000510000.032.00168080801.0003...0.00.00.00.00.00.00.00.00.00.0
..................................................................
2160820140521T000000360000.032.50153011313.0003...0.00.00.00.00.00.00.00.00.00.0
2160920150223T000000400000.042.50231058132.0003...0.00.00.00.00.00.00.00.00.00.0
2161020140623T000000402101.020.75102013502.0003...0.00.00.00.00.00.00.00.00.00.0
2161120150116T000000400000.032.50160023882.0003...0.00.00.00.00.00.00.00.00.00.0
2161220141015T000000325000.020.75102010762.0003...0.00.00.00.00.00.00.00.00.00.0
\n", "

21613 rows × 4162 columns

\n", "
" ], "text/plain": [ " date price bedrooms bathrooms sqft_living sqft_lot \\\n", "0 20141013T000000 221900.0 3 1.00 1180 5650 \n", "1 20141209T000000 538000.0 3 2.25 2570 7242 \n", "2 20150225T000000 180000.0 2 1.00 770 10000 \n", "3 20141209T000000 604000.0 4 3.00 1960 5000 \n", "4 20150218T000000 510000.0 3 2.00 1680 8080 \n", "... ... ... ... ... ... ... \n", "21608 20140521T000000 360000.0 3 2.50 1530 1131 \n", "21609 20150223T000000 400000.0 4 2.50 2310 5813 \n", "21610 20140623T000000 402101.0 2 0.75 1020 1350 \n", "21611 20150116T000000 400000.0 3 2.50 1600 2388 \n", "21612 20141015T000000 325000.0 2 0.75 1020 1076 \n", "\n", " floors waterfront view condition ... price_4489000.0 \\\n", "0 1.0 0 0 3 ... 0.0 \n", "1 2.0 0 0 3 ... 0.0 \n", "2 1.0 0 0 3 ... 0.0 \n", "3 1.0 0 0 5 ... 0.0 \n", "4 1.0 0 0 3 ... 0.0 \n", "... ... ... ... ... ... ... \n", "21608 3.0 0 0 3 ... 0.0 \n", "21609 2.0 0 0 3 ... 0.0 \n", "21610 2.0 0 0 3 ... 0.0 \n", "21611 2.0 0 0 3 ... 0.0 \n", "21612 2.0 0 0 3 ... 0.0 \n", "\n", " price_4500000.0 price_4668000.0 price_5110800.0 price_5300000.0 \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", " price_5350000.0 price_5570000.0 price_6885000.0 price_7062500.0 \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "21608 0.0 0.0 0.0 0.0 \n", "21609 0.0 0.0 0.0 0.0 \n", "21610 0.0 0.0 0.0 0.0 \n", "21611 0.0 0.0 0.0 0.0 \n", "21612 0.0 0.0 0.0 0.0 \n", "\n", " price_7700000.0 \n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 \n", "... ... 
\n", "21608 0.0 \n", "21609 0.0 \n", "21610 0.0 \n", "21611 0.0 \n", "21612 0.0 \n", "\n", "[21613 rows x 4162 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop any indicator columns left over from a previous run of this cell before\n", "# concatenating, so that re-running it does not append the same columns twice.\n", "house = pd.concat(\n", "    [house.drop(columns=encoded_values_df.columns, errors=\"ignore\"), encoded_values_df],\n", "    axis=1,\n", ")\n", "\n", "house" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Category names and bin count shared by the discretisation cells below.\n", "labels = [\"old\", \"middle\", \"new\"]\n", "num_bins = 3" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([1900. , 1938.33333333, 1976.66666667, 2015. ]),\n", " array([ 3067, 8120, 10426]))" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Equal-width histogram of yr_built with num_bins bins; missing values, if any,\n", "# are replaced by the column median first.\n", "hist1, bins1 = np.histogram(\n", " house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins=num_bins\n", ")\n", "bins1, hist1" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_builtyr_built
01955(1938.333, 1976.667]
11951(1938.333, 1976.667]
21933(1900.0, 1938.333]
31965(1938.333, 1976.667]
41987(1976.667, 2015.0]
52001(1976.667, 2015.0]
61995(1976.667, 2015.0]
71963(1938.333, 1976.667]
81960(1938.333, 1976.667]
92003(1976.667, 2015.0]
101965(1938.333, 1976.667]
111942(1938.333, 1976.667]
121927(1900.0, 1938.333]
131977(1976.667, 2015.0]
141900NaN
151979(1976.667, 2015.0]
161994(1976.667, 2015.0]
171916(1900.0, 1938.333]
181921(1900.0, 1938.333]
191969(1938.333, 1976.667]
\n", "
" ], "text/plain": [ " yr_built yr_built\n", "0 1955 (1938.333, 1976.667]\n", "1 1951 (1938.333, 1976.667]\n", "2 1933 (1900.0, 1938.333]\n", "3 1965 (1938.333, 1976.667]\n", "4 1987 (1976.667, 2015.0]\n", "5 2001 (1976.667, 2015.0]\n", "6 1995 (1976.667, 2015.0]\n", "7 1963 (1938.333, 1976.667]\n", "8 1960 (1938.333, 1976.667]\n", "9 2003 (1976.667, 2015.0]\n", "10 1965 (1938.333, 1976.667]\n", "11 1942 (1938.333, 1976.667]\n", "12 1927 (1900.0, 1938.333]\n", "13 1977 (1976.667, 2015.0]\n", "14 1900 NaN\n", "15 1979 (1976.667, 2015.0]\n", "16 1994 (1976.667, 2015.0]\n", "17 1916 (1900.0, 1938.333]\n", "18 1921 (1900.0, 1938.333]\n", "19 1969 (1938.333, 1976.667]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show the first 20 yr_built values next to the interval pd.cut assigns them\n", "# using the bins1 edges. include_lowest=True closes the first interval on the\n", "# left, so the minimum year (the first edge of bins1) is binned instead of\n", "# becoming NaN.\n", "pd.concat(\n", "    [\n", "        house[\"yr_built\"],\n", "        pd.cut(house[\"yr_built\"], list(bins1), include_lowest=True),\n", "    ],\n", "    axis=1,\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_builtyr_built
01955middle
11951middle
21933old
31965middle
41987new
52001new
61995new
71963middle
81960middle
92003new
101965middle
111942middle
121927old
131977new
141900NaN
151979new
161994new
171916old
181921old
191969middle
\n", "
" ], "text/plain": [ " yr_built yr_built\n", "0 1955 middle\n", "1 1951 middle\n", "2 1933 old\n", "3 1965 middle\n", "4 1987 new\n", "5 2001 new\n", "6 1995 new\n", "7 1963 middle\n", "8 1960 middle\n", "9 2003 new\n", "10 1965 middle\n", "11 1942 middle\n", "12 1927 old\n", "13 1977 new\n", "14 1900 NaN\n", "15 1979 new\n", "16 1994 new\n", "17 1916 old\n", "18 1921 old\n", "19 1969 middle" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same binning as the bins1 intervals, but each interval is replaced by its\n", "# name from labels. include_lowest=True keeps the minimum year (the first edge\n", "# of bins1) in the first category instead of turning it into NaN.\n", "pd.concat(\n", "    [\n", "        house[\"yr_built\"],\n", "        pd.cut(house[\"yr_built\"], list(bins1), labels=labels, include_lowest=True),\n", "    ],\n", "    axis=1,\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([1899., 1928., 1957., 1986., 2015.]),\n", " array([2403, 4230, 6914, 8028, 38]))" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Five evenly spaced edges from 1899 to 2015, then per-bin counts obtained by\n", "# digitizing yr_built against those edges.\n", "# NOTE(review): np.digitize defaults to half-open [a, b) bins, so values equal to\n", "# the last edge (2015) land in an extra fifth bucket of hist2, whereas pd.cut\n", "# uses (a, b] intervals - pass right=True if the counts should match pd.cut.\n", "bins2 = np.linspace(1899, 2015, 5)\n", "tmp_bins2 = np.digitize(house[\"yr_built\"].fillna(house[\"yr_built\"].median()), bins2)\n", "hist2 = np.bincount(tmp_bins2 - 1)\n", "bins2, hist2" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_builtyr_built
01955(1928.0, 1957.0]
11951(1928.0, 1957.0]
21933(1928.0, 1957.0]
31965(1957.0, 1986.0]
41987(1986.0, 2015.0]
52001(1986.0, 2015.0]
61995(1986.0, 2015.0]
71963(1957.0, 1986.0]
81960(1957.0, 1986.0]
92003(1986.0, 2015.0]
101965(1957.0, 1986.0]
111942(1928.0, 1957.0]
121927(1899.0, 1928.0]
131977(1957.0, 1986.0]
141900(1899.0, 1928.0]
151979(1957.0, 1986.0]
161994(1986.0, 2015.0]
171916(1899.0, 1928.0]
181921(1899.0, 1928.0]
191969(1957.0, 1986.0]
\n", "
" ], "text/plain": [ " yr_built yr_built\n", "0 1955 (1928.0, 1957.0]\n", "1 1951 (1928.0, 1957.0]\n", "2 1933 (1928.0, 1957.0]\n", "3 1965 (1957.0, 1986.0]\n", "4 1987 (1986.0, 2015.0]\n", "5 2001 (1986.0, 2015.0]\n", "6 1995 (1986.0, 2015.0]\n", "7 1963 (1957.0, 1986.0]\n", "8 1960 (1957.0, 1986.0]\n", "9 2003 (1986.0, 2015.0]\n", "10 1965 (1957.0, 1986.0]\n", "11 1942 (1928.0, 1957.0]\n", "12 1927 (1899.0, 1928.0]\n", "13 1977 (1957.0, 1986.0]\n", "14 1900 (1899.0, 1928.0]\n", "15 1979 (1957.0, 1986.0]\n", "16 1994 (1986.0, 2015.0]\n", "17 1916 (1899.0, 1928.0]\n", "18 1921 (1899.0, 1928.0]\n", "19 1969 (1957.0, 1986.0]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First 20 yr_built values side by side with the bins2 interval each falls in.\n", "pd.concat(\n", "    [\n", "        house[\"yr_built\"],\n", "        pd.cut(house[\"yr_built\"], bins=list(bins2)),\n", "    ],\n", "    axis=\"columns\",\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([1899, 1957, 2001, 2015]), array([ 6633, 10439, 4541]))" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Histogram of yr_built over three hand-picked, unequal-width bins; missing\n", "# values, if any, are replaced by the column median first.\n", "hist3, bins3 = np.histogram(\n", "    a=house[\"yr_built\"].fillna(house[\"yr_built\"].median()),\n", "    bins=[1899, 1957, 2001, 2015],\n", ")\n", "bins3, hist3" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_builtyr_built
01955(1899, 1957]
11951(1899, 1957]
21933(1899, 1957]
31965(1957, 2001]
41987(1957, 2001]
52001(1957, 2001]
61995(1957, 2001]
71963(1957, 2001]
81960(1957, 2001]
92003(2001, 2015]
101965(1957, 2001]
111942(1899, 1957]
121927(1899, 1957]
131977(1957, 2001]
141900(1899, 1957]
151979(1957, 2001]
161994(1957, 2001]
171916(1899, 1957]
181921(1899, 1957]
191969(1957, 2001]
\n", "
" ], "text/plain": [ " yr_built yr_built\n", "0 1955 (1899, 1957]\n", "1 1951 (1899, 1957]\n", "2 1933 (1899, 1957]\n", "3 1965 (1957, 2001]\n", "4 1987 (1957, 2001]\n", "5 2001 (1957, 2001]\n", "6 1995 (1957, 2001]\n", "7 1963 (1957, 2001]\n", "8 1960 (1957, 2001]\n", "9 2003 (2001, 2015]\n", "10 1965 (1957, 2001]\n", "11 1942 (1899, 1957]\n", "12 1927 (1899, 1957]\n", "13 1977 (1957, 2001]\n", "14 1900 (1899, 1957]\n", "15 1979 (1957, 2001]\n", "16 1994 (1957, 2001]\n", "17 1916 (1899, 1957]\n", "18 1921 (1899, 1957]\n", "19 1969 (1957, 2001]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First 20 yr_built values side by side with the bins3 interval each falls in.\n", "pd.concat(\n", "    [\n", "        house[\"yr_built\"],\n", "        pd.cut(house[\"yr_built\"], bins=list(bins3)),\n", "    ],\n", "    axis=\"columns\",\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_builtyr_built
01955old
11951old
21933old
31965middle
41987middle
52001middle
61995middle
71963middle
81960middle
92003new
101965middle
111942old
121927old
131977middle
141900old
151979middle
161994middle
171916old
181921old
191969middle
\n", "
" ], "text/plain": [ " yr_built yr_built\n", "0 1955 old\n", "1 1951 old\n", "2 1933 old\n", "3 1965 middle\n", "4 1987 middle\n", "5 2001 middle\n", "6 1995 middle\n", "7 1963 middle\n", "8 1960 middle\n", "9 2003 new\n", "10 1965 middle\n", "11 1942 old\n", "12 1927 old\n", "13 1977 middle\n", "14 1900 old\n", "15 1979 middle\n", "16 1994 middle\n", "17 1916 old\n", "18 1921 old\n", "19 1969 middle" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same bins3 binning, with each interval replaced by its name from labels.\n", "pd.concat(\n", "    [\n", "        house[\"yr_built\"],\n", "        pd.cut(house[\"yr_built\"], bins=list(bins3), labels=labels),\n", "    ],\n", "    axis=\"columns\",\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_builtyr_built
019550
119510
219330
319651
419871
520012
619952
719631
819601
920032
1019651
1119420
1219270
1319771
1419000
1519791
1619942
1719160
1819210
1919691
\n", "
" ], "text/plain": [ " yr_built yr_built\n", "0 1955 0\n", "1 1951 0\n", "2 1933 0\n", "3 1965 1\n", "4 1987 1\n", "5 2001 2\n", "6 1995 2\n", "7 1963 1\n", "8 1960 1\n", "9 2003 2\n", "10 1965 1\n", "11 1942 0\n", "12 1927 0\n", "13 1977 1\n", "14 1900 0\n", "15 1979 1\n", "16 1994 2\n", "17 1916 0\n", "18 1921 0\n", "19 1969 1" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Quantile-based binning into three groups; labels=False returns the integer\n", "# bin number instead of an interval.\n", "pd.concat(\n", "    [\n", "        house[\"yr_built\"],\n", "        pd.qcut(x=house[\"yr_built\"], q=3, labels=False),\n", "    ],\n", "    axis=\"columns\",\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yr_builtyr_built
01955old
11951old
21933old
31965middle
41987middle
52001new
61995new
71963middle
81960middle
92003new
101965middle
111942old
121927old
131977middle
141900old
151979middle
161994new
171916old
181921old
191969middle
\n", "
" ], "text/plain": [ " yr_built yr_built\n", "0 1955 old\n", "1 1951 old\n", "2 1933 old\n", "3 1965 middle\n", "4 1987 middle\n", "5 2001 new\n", "6 1995 new\n", "7 1963 middle\n", "8 1960 middle\n", "9 2003 new\n", "10 1965 middle\n", "11 1942 old\n", "12 1927 old\n", "13 1977 middle\n", "14 1900 old\n", "15 1979 middle\n", "16 1994 new\n", "17 1916 old\n", "18 1921 old\n", "19 1969 middle" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Quantile-based binning into three groups, named with the entries of labels.\n", "pd.concat(\n", "    [\n", "        house[\"yr_built\"],\n", "        pd.qcut(x=house[\"yr_built\"], q=3, labels=labels),\n", "    ],\n", "    axis=\"columns\",\n", ").head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Пример конструирования признаков на основе существующих" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepricebedroomsbathroomssqft_livingsqft_lotfloorsgradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15Price_categoryRenovated_flagZipcode_area
020141013T000000221900.031.00118056501.0711800195509817847.5112-122.25713405650Low0981
120141209T000000538000.032.25257072422.072170400195119919812547.7210-122.31916907639Medium1981
220150225T000000180000.021.00770100001.067700193309802847.7379-122.23327208062Low0980
320141209T000000604000.043.00196050001.071050910196509813647.5208-122.39313605000High0981
420150218T000000510000.032.00168080801.0816800198709807447.6168-122.04518007503Medium0980
...............................................................
2160820140521T000000360000.032.50153011313.0815300200909810347.6993-122.34615301509Low0981
2160920150223T000000400000.042.50231058132.0823100201409814647.5107-122.36218307200Medium0981
2161020140623T000000402101.020.75102013502.0710200200909814447.5944-122.29910202007Medium0981
2161120150116T000000400000.032.50160023882.0816000200409802747.5345-122.06914101287Medium0980
2161220141015T000000325000.020.75102010762.0710200200809814447.5941-122.29910201357Low0981
\n", "

21613 rows × 20 columns

\n", "
" ], "text/plain": [ " date price bedrooms bathrooms sqft_living sqft_lot \\\n", "0 20141013T000000 221900.0 3 1.00 1180 5650 \n", "1 20141209T000000 538000.0 3 2.25 2570 7242 \n", "2 20150225T000000 180000.0 2 1.00 770 10000 \n", "3 20141209T000000 604000.0 4 3.00 1960 5000 \n", "4 20150218T000000 510000.0 3 2.00 1680 8080 \n", "... ... ... ... ... ... ... \n", "21608 20140521T000000 360000.0 3 2.50 1530 1131 \n", "21609 20150223T000000 400000.0 4 2.50 2310 5813 \n", "21610 20140623T000000 402101.0 2 0.75 1020 1350 \n", "21611 20150116T000000 400000.0 3 2.50 1600 2388 \n", "21612 20141015T000000 325000.0 2 0.75 1020 1076 \n", "\n", " floors grade sqft_above sqft_basement yr_built yr_renovated \\\n", "0 1.0 7 1180 0 1955 0 \n", "1 2.0 7 2170 400 1951 1991 \n", "2 1.0 6 770 0 1933 0 \n", "3 1.0 7 1050 910 1965 0 \n", "4 1.0 8 1680 0 1987 0 \n", "... ... ... ... ... ... ... \n", "21608 3.0 8 1530 0 2009 0 \n", "21609 2.0 8 2310 0 2014 0 \n", "21610 2.0 7 1020 0 2009 0 \n", "21611 2.0 8 1600 0 2004 0 \n", "21612 2.0 7 1020 0 2008 0 \n", "\n", " zipcode lat long sqft_living15 sqft_lot15 Price_category \\\n", "0 98178 47.5112 -122.257 1340 5650 Low \n", "1 98125 47.7210 -122.319 1690 7639 Medium \n", "2 98028 47.7379 -122.233 2720 8062 Low \n", "3 98136 47.5208 -122.393 1360 5000 High \n", "4 98074 47.6168 -122.045 1800 7503 Medium \n", "... ... ... ... ... ... ... \n", "21608 98103 47.6993 -122.346 1530 1509 Low \n", "21609 98146 47.5107 -122.362 1830 7200 Medium \n", "21610 98144 47.5944 -122.299 1020 2007 Medium \n", "21611 98027 47.5345 -122.069 1410 1287 Medium \n", "21612 98144 47.5941 -122.299 1020 1357 Low \n", "\n", " Renovated_flag Zipcode_area \n", "0 0 981 \n", "1 1 981 \n", "2 0 980 \n", "3 0 981 \n", "4 0 980 \n", "... ... ... 
\n", "21608 0 981 \n", "21609 0 981 \n", "21610 0 981 \n", "21611 0 980 \n", "21612 0 981 \n", "\n", "[21613 rows x 20 columns]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop the columns that are not needed here, then drop rows with missing values\n", "house_cleaned = house.drop(\n", "    [\"waterfront\", \"view\", \"condition\"], axis=1, errors=\"ignore\"\n", ").dropna()\n", "\n", "# \"Price_category\" feature: split houses into three quantile-based price categories\n", "house_cleaned[\"Price_category\"] = pd.qcut(\n", "    house_cleaned[\"price\"], q=3, labels=[\"Low\", \"Medium\", \"High\"]\n", ")\n", "\n", "# \"Renovated_flag\" feature: 1 if the house was renovated, otherwise 0\n", "# (vectorized comparison instead of a per-row Python lambda)\n", "house_cleaned[\"Renovated_flag\"] = (house_cleaned[\"yr_renovated\"] > 0).astype(\"int64\")\n", "\n", "# \"Zipcode_area\" feature: the first three characters of the zip code\n", "house_cleaned[\"Zipcode_area\"] = house_cleaned[\"zipcode\"].astype(str).str[:3]\n", "\n", "house_cleaned" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " price bedrooms bathrooms sqft_living sqft_lot floors waterfront \\\n", "id \n", "0 221900.0 3 1.00 1180 5650 1.0 0 \n", "1 538000.0 3 2.25 2570 7242 2.0 0 \n", "2 180000.0 2 1.00 770 10000 1.0 0 \n", "3 604000.0 4 3.00 1960 5000 1.0 0 \n", "4 510000.0 3 2.00 1680 8080 1.0 0 \n", "\n", " view condition grade ... yr_renovated zipcode lat long \\\n", "id ... \n", "0 0 3 7 ... 0 98178 47.5112 -122.257 \n", "1 0 3 7 ... 1991 98125 47.7210 -122.319 \n", "2 0 3 6 ... 0 98028 47.7379 -122.233 \n", "3 0 5 7 ... 0 98136 47.5208 -122.393 \n", "4 0 3 8 ... 
0 98074 47.6168 -122.045 \n", "\n", " sqft_living15 sqft_lot15 HOUR(date) MONTH(date) WEEKDAY(date) \\\n", "id \n", "0 1340 5650 0 10 0 \n", "1 1690 7639 0 12 1 \n", "2 2720 8062 0 2 2 \n", "3 1360 5000 0 12 1 \n", "4 1800 7503 0 2 2 \n", "\n", " YEAR(date) \n", "id \n", "0 2014 \n", "1 2014 \n", "2 2015 \n", "3 2014 \n", "4 2015 \n", "\n", "[5 rows x 23 columns]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Mai\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", " warnings.warn(\n", "c:\\Mai\\mai\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n", " agg_primitives: ['count', 'mean', 'mode', 'sum']\n", "This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. 
If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n", " warnings.warn(warning_msg, UnusedPrimitiveWarning)\n" ] } ], "source": [ "import featuretools as ft\n", "import pandas as pd\n", "from woodwork.logical_types import Categorical, Datetime\n", "\n", "# Load the data\n", "df = pd.read_csv(\"data/kc_house_data.csv\")\n", "\n", "# Assign a unique row identifier (unconditionally replaces any existing \"id\" column)\n", "df[\"id\"] = range(len(df))\n", "\n", "# Create an EntitySet for the house sales data\n", "es = ft.EntitySet(id=\"house_sales\")\n", "\n", "# Add the main DataFrame to the EntitySet with explicit logical types\n", "es = es.add_dataframe(\n", "    dataframe_name=\"houses\",\n", "    dataframe=df,\n", "    index=\"id\",  # unique identifier of each row\n", "    logical_types={\n", "        \"date\": Datetime,\n", "        \"zipcode\": Categorical,\n", "        \"condition\": Categorical,\n", "        \"grade\": Categorical,\n", "        \"view\": Categorical,\n", "        \"waterfront\": Categorical,\n", "    },\n", ")\n", "\n", "# Automatic feature construction with transform primitives only.\n", "# The EntitySet holds a single dataframe, so aggregation primitives cannot be\n", "# applied and DFS limits the depth to 1; requesting them only produced the\n", "# UnusedPrimitiveWarning / max_depth warnings.\n", "feature_matrix, feature_defs = ft.dfs(\n", "    entityset=es,\n", "    target_dataframe_name=\"houses\",  # name of the target table\n", "    agg_primitives=[],  # nothing to aggregate over without related dataframes\n", "    trans_primitives=[\"year\", \"month\", \"weekday\", \"hour\"],  # date-part features\n", "    max_depth=1,\n", ")\n", "\n", "# Preview the resulting feature matrix\n", "print(feature_matrix.head())" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAiMAAAGdCAYAAADAAnMpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlr0lEQVR4nO3de3CU5aHH8d9CNpsEsuFmbhBoFAzhEiAgEi4Fi9yHMbZknNQx2IO2x4EeOFjspFcj6jJKDnKGFkFr4znTlBZa0hmLkogEpCE2QdIJFNMDVcLRJIhKliS6vJA9f1i27smF3WTxScL3M5OJ++zzvu+TzLzZr+8uuzav1+sVAACAIf1MLwAAANzciBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYFWZ6AYFobW3VBx98oOjoaNlsNtPLAQAAAfB6vbp06ZISExPVr1/H1z96RYx88MEHSkpKMr0MAADQBefOndOIESM6vL9XxEh0dLSkz38Yp9NpeDUAQsmyLBUXF2vhwoWy2+2mlwMghNxut5KSknyP4x3pFTFy7akZp9NJjAB9jGVZioqKktPpJEaAPup6L7HgBawAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMCooGLk8ccfl81m8/saO3Zsp9vs3r1bY8eOVUREhCZOnKh9+/Z1a8EAAKBvCfrKyPjx41VXV+f7OnLkSIdzy8rKlJ2drVWrVun48ePKzMxUZmamTpw40a1FAwCAviPoGAkLC1N8fLzva9iwYR3O3bp1qxYvXqwNGzYoNTVVGzduVHp6urZt29atRQMAgL4j6A/K+5//+R8lJiYqIiJCGRkZcrlcGjlyZLtzjx49qvXr1/uNLVq0SEVFRZ0ew+PxyOPx+G673W5Jn3+glmVZwS4ZQIi1tLSopqYmJPtq+tSjsuozih5UroGRjm7vLyUlRVFRUSFYGYDuCvQxO6gYufPOO1VQUKCUlBTV1dUpLy9Pc+bM0YkTJ9r9eOD6+nrFxcX5jcXFxam+vr7T47hcLuXl5bUZLy4u5o8M0AOcOXNGjz76aEj3+UyI9pOfn6/bbrstRHsD0B0tLS0BzQsqRpYsWeL777S0NN15550aNWqUfvvb32rVqlXBrbATubm5fldU3G63kpKStHDhQjmdzpAdB0DXtLS0aPbs2SHZ19/qGrVh71/17L3jdHtCTLf3x5URoOe49szG9QT9NM0XDRo0SLfffrtOnz7d7v3x8fFqaGjwG2toaFB8fHyn+3U4HHI42l6utdvtstvtXV8wgJCIiYnR9OnTQ7Kv8LMfyXH0siZMTtfkUUNDsk8APUOgj9ndep+RpqYmnTlzRgkJCe3en5GRoQMHDviNlZSUKCMjozuHBQAAfUhQMfK9731Phw4d0nvvvaeysjLde++96t+/v7KzsyVJOTk5ys3N9c1fu3atXnvtNeXn5+udd97R448/rsrKSq1Zsya0PwUAAOi1gnqa5n//93+VnZ2tjz76SLfccotmz56t8vJy3XLLLZKk2tpa9ev3z76ZOXOmCgsL9aMf/Ug/+MEPNGbMGBUVFWnChAmh/SkAAECvFVSM7Nq1q9P7S0tL24xlZWUpKysrqEUBAICbB59NAwAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKO6FSObNm2SzWbTunXrOpxTUFAgm83m9xUREdGdwwIAgD4krKsbVlRUaMeOHUpLS7vuXKfTqZqaGt9tm83W1cMCAIA+pktXRpqamnT//ff
rhRde0ODBg68732azKT4+3vcVFxfXlcMCAIA+qEtXRlavXq1ly5bp7rvv1pNPPnnd+U1NTRo1apRaW1uVnp6up59+WuPHj+9wvsfjkcfj8d12u92SJMuyZFlWV5YMoIe6cuWK7zvnN9C3BHpOBx0ju3bt0ttvv62KioqA5qekpOill15SWlqaGhsbtXnzZs2cOVMnT57UiBEj2t3G5XIpLy+vzXhxcbGioqKCXTKAHuxckySFqby8XO+fML0aAKHU0tIS0Dyb1+v1BrrTc+fOadq0aSopKfG9VmTevHmaPHmynnvuuYD2YVmWUlNTlZ2drY0bN7Y7p70rI0lJSbpw4YKcTmegywXQC/yl9mOteKFSex6epkkjh5heDoAQcrvdGjZsmBobGzt9/A7qysixY8d0/vx5paen+8auXr2qw4cPa9u2bfJ4POrfv3+n+7Db7ZoyZYpOnz7d4RyHwyGHw9Hutna7PZglA+jhwsLCfN85v4G+JdBzOqgYmT9/vqqrq/3GvvWtb2ns2LH6/ve/f90QkT6Pl+rqai1dujSYQwMAgD4qqBiJjo7WhAkT/MYGDBigoUOH+sZzcnI0fPhwuVwuSdITTzyhGTNmaPTo0bp48aKeffZZnT17Vg899FCIfgQAANCbdfl9RjpSW1urfv3++S+GP/nkEz388MOqr6/X4MGDNXXqVJWVlWncuHGhPjQAAOiFgnoBqylut1sxMTHXfQEMgN6n6uxHytxerqJHZmjyqKGmlwMghAJ9/OazaQAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGBUt2Jk06ZNstlsWrduXafzdu/erbFjxyoiIkITJ07Uvn37unNYAADQh3Q5RioqKrRjxw6lpaV1Oq+srEzZ2dlatWqVjh8/rszMTGVmZurEiRNdPTQAAOhDuhQjTU1Nuv/++/XCCy9o8ODBnc7dunWrFi9erA0bNig1NVUbN25Uenq6tm3b1qUFAwCAviWsKxutXr1ay5Yt0913360nn3yy07lHjx7V+vXr/cYWLVqkoqKiDrfxeDzyeDy+2263W5JkWZYsy+rKkgH8w3sfNavZc9X0Mnz+Vt/o972nGODor68MHWB6GUCvFuhjdtAxsmvXLr399tuqqKgIaH59fb3i4uL8xuLi4lRfX9/hNi6XS3l5eW3Gi4uLFRUVFdyCAfic/1R6qqpL/w9ywz2295TpJbTxw8lXFBtpehVA79XS0hLQvKD+Kp07d05r165VSUmJIiIiurSwQOTm5vpdTXG73UpKStLChQvldDpv2HGBvu7kB26pqlybV0zU6Ft6xv/1N3/m0WtvVmjxnDs0IMJhejmSpNMfNut7e6p1R8ZsjU/kbw7QVdee2bieoGLk2LFjOn/+vNLT031jV69e1eHDh7Vt2zZ5PB7179/fb5v4+Hg1NDT4jTU0NCg+Pr7D4zgcDjkcbf8o2e122e32YJYM4AvCwj4/5ccmxGjC8BjDq/mcZVm68I40/dZbesz5fe33FBYW1mPWBPRGgZ4/Qb2Adf78+aqurlZVVZXva9q0abr//vtVVVXVJkQkKSMjQwcOHPAbKykpUUZGRjCHBgAAfVRQV0aio6M1YcIEv7EBAwZo6NChvvGcnBwNHz5cLpdLkrR27VrNnTtX+fn5WrZsmXbt2qXKykrt3LkzRD8CAADozUL+Dqy1tbWqq6vz3Z45c6YKCwu1c+dOTZo0SXv27FFRUVGbqAEAADenbr+svrS0tNPbkpSVlaWsrKzuHgoAAPRBfDYNAAAwihgBAABGESMAAMAoYgQAABhFjAA
AAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjAoqRrZv3660tDQ5nU45nU5lZGTo1Vdf7XB+QUGBbDab31dERES3Fw0AAPqOsGAmjxgxQps2bdKYMWPk9Xr18ssv65577tHx48c1fvz4drdxOp2qqanx3bbZbN1bMQAA6FOCipHly5f73X7qqae0fft2lZeXdxgjNptN8fHxXV8hAADo04KKkS+6evWqdu/erebmZmVkZHQ4r6mpSaNGjVJra6vS09P19NNPdxgu13g8Hnk8Ht9tt9stSbIsS5ZldXXJwE3vypUrvu895Vy6to6esh6pZ/6egN4o0PMn6Biprq5WRkaGPvvsMw0cOFB79+7VuHHj2p2bkpKil156SWlpaWpsbNTmzZs1c+ZMnTx5UiNGjOjwGC6XS3l5eW3Gi4uLFRUVFeySAfzDuSZJCtORI0d0dqDp1fgrKSkxvQSfnvx7AnqTlpaWgObZvF6vN5gdX758WbW1tWpsbNSePXv04osv6tChQx0GyRdZlqXU1FRlZ2dr48aNHc5r78pIUlKSLly4IKfTGcxyAXzByQ/cytxerqJHZmh8Ys84lyzLUklJiRYsWCC73W56OZJ65u8J6I3cbreGDRumxsbGTh+/g74yEh4ertGjR0uSpk6dqoqKCm3dulU7duy47rZ2u11TpkzR6dOnO53ncDjkcDja3b6n/LECeqOwsDDf9552LvWk87sn/56A3iTQ86fb7zPS2trqdxWjM1evXlV1dbUSEhK6e1gAANBHBHVlJDc3V0uWLNHIkSN16dIlFRYWqrS0VPv375ck5eTkaPjw4XK5XJKkJ554QjNmzNDo0aN18eJFPfvsszp79qweeuih0P8kAACgVwoqRs6fP6+cnBzV1dUpJiZGaWlp2r9/vxYsWCBJqq2tVb9+/7zY8sknn+jhhx9WfX29Bg8erKlTp6qsrCyg15cAAICbQ1Ax8otf/KLT+0tLS/1ub9myRVu2bAl6UQAA4ObBZ9MAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAqKBiZPv27UpLS5PT6ZTT6VRGRoZeffXVTrfZvXu3xo4dq4iICE2cOFH79u3r1oIBAEDfElSMjBgxQps2bdKxY8dUWVmpr33ta7rnnnt08uTJdueXlZUpOztbq1at0vHjx5WZmanMzEydOHEiJIsHAAC9X1Axsnz5ci1dulRjxozR7bffrqeeekoDBw5UeXl5u/O3bt2qxYsXa8OGDUpNTdXGjRuVnp6ubdu2hWTxAACg9wvr6oZXr17V7t271dzcrIyMjHbnHD16VOvXr/cbW7RokYqKijrdt8fjkcfj8d12u92SJMuyZFlWV5cM3PQufeqRLcyt/X+r0OlPBphejiTp08uXVfbhB2qpLlNkeLjp5UiSzn3yqWxhbl25coW/OUA3BHr+BB0j1dXVysjI0GeffaaBAwdq7969GjduXLtz6+vrFRcX5zcWFxen+vr6To/hcrmUl5fXZry4uFhRUVHBLhnAPxxtsMk+6C0V1B6Qak2v5gvsUmm16UX4sw+ar4qjUTobaXolQO/V0tIS0LygYyQlJUVVVVVqbGzUnj17tHLlSh06dKjDIOmK3NxcvysqbrdbSUlJWrhwoZxOZ8i
OA9xsZjRfVnL1cMVEr1BEWH/Ty5Eknf2oSVsPvqu1dyVr1NCBppfjkxB9i9ITR5leBtCrXXtm43qCjpHw8HCNHj1akjR16lRVVFRo69at2rFjR5u58fHxamho8BtraGhQfHx8p8dwOBxyOBxtxu12u+x2e7BLBvAPcYPs+s6cqaaX4afq7Efa8tllzf1KuiaPGmp6OQBCKNDH7G6/z0hra6vf6zu+KCMjQwcOHPAbKykp6fA1JgAA4OYT1JWR3NxcLVmyRCNHjtSlS5dUWFio0tJS7d+/X5KUk5Oj4cOHy+VySZLWrl2ruXPnKj8/X8uWLdOuXbtUWVmpnTt3hv4nAQAAvVJQMXL+/Hnl5OSorq5OMTExSktL0/79+7VgwQJJUm1trfr1++fFlpkzZ6qwsFA/+tGP9IMf/EBjxoxRUVGRJkyYENqfAgAA9FpBxcgvfvGLTu8vLS1tM5aVlaWsrKygFgUAAG4efDYNAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjAoqRlwul+644w5FR0crNjZWmZmZqqmp6XSbgoIC2Ww2v6+IiIhuLRoAAPQdQcXIoUOHtHr1apWXl6ukpESWZWnhwoVqbm7udDun06m6ujrf19mzZ7u1aAAA0HeEBTP5tdde87tdUFCg2NhYHTt2TF/96lc73M5msyk+Pr5rKwQAAH1aUDHy/zU2NkqShgwZ0um8pqYmjRo1Sq2trUpPT9fTTz+t8ePHdzjf4/HI4/H4brvdbkmSZVmyLKs7SwbQw1y5csX3nfMb6FsCPae7HCOtra1at26dZs2apQkTJnQ4LyUlRS+99JLS0tLU2NiozZs3a+bMmTp58qRGjBjR7jYul0t5eXltxouLixUVFdXVJQPogc41SVKYysvL9f4J06sBEEotLS0BzbN5vV5vVw7wyCOP6NVXX9WRI0c6jIr2WJal1NRUZWdna+PGje3Oae/KSFJSki5cuCCn09mV5QLoof5S+7FWvFCpPQ9P06SRnV9lBdC7uN1uDRs2TI2NjZ0+fnfpysiaNWv0yiuv6PDhw0GFiCTZ7XZNmTJFp0+f7nCOw+GQw+Fod1u73R70egH0XGFhYb7vnN9A3xLoOR3Uv6bxer1as2aN9u7dqzfeeEPJyclBL+zq1auqrq5WQkJC0NsCAIC+J6grI6tXr1ZhYaH+8Ic/KDo6WvX19ZKkmJgYRUZGSpJycnI0fPhwuVwuSdITTzyhGTNmaPTo0bp48aKeffZZnT17Vg899FCIfxQAANAbBRUj27dvlyTNmzfPb/yXv/ylHnzwQUlSbW2t+vX75wWXTz75RA8//LDq6+s1ePBgTZ06VWVlZRo3blz3Vg4AAPqEoGIkkNe6lpaW+t3esmWLtmzZEtSiAADAzYPPpgEAAEYRIwAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRxAgAADCKGAEAAEYRIwAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRxAgAADCKGAEAAEYRIwAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRxAgAADCKGAEAAEYRIwAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRQcWIy+XSHXfcoejoaMXGxiozM1M1NTXX3W737t0aO3asIiIiNHHiRO3bt6/LCwYAAH1LUDFy6NAhrV69WuXl5SopKZFlWVq4cKGam5s73KasrEzZ2dlatWqVjh8/rszMTGVmZurEiRPdXjwAAOj9bF6v19vVjT/88EPFxsbq0KFD+up
Xv9runPvuu0/Nzc165ZVXfGMzZszQ5MmT9fzzzwd0HLfbrZiYGDU2NsrpdHZ1uQB6oKqzHylze7mKHpmhyaOGml4OgBAK9PE7rDsHaWxslCQNGTKkwzlHjx7V+vXr/cYWLVqkoqKiDrfxeDzyeDy+2263W5JkWZYsy+rGigGEQktLS0BP0Qbib3WN8tSf1omqcF1uiOn2/lJSUhQVFRWClQHorkAfs7scI62trVq3bp1mzZqlCRMmdDivvr5ecXFxfmNxcXGqr6/vcBuXy6W8vLw248XFxfyRAXqAM2fO6NFHHw3pPh94OTT7yc/P12233RaanQHolpaWloDmdTlGVq9erRMnTujIkSNd3UWHcnNz/a6muN1uJSUlaeHChTxNA/QALS0tmj17dkj21fSpR/vfrNCiOXdoYKSj2/vjygjQc1x7ZuN6uhQja9as0SuvvKLDhw9rxIgRnc6Nj49XQ0OD31hDQ4Pi4+M73MbhcMjhaPtHyW63y263d2XJAEIoJiZG06dPD8m+LMvSpYsfa87MGZzfQB8T6Dkd1L+m8Xq9WrNmjfbu3as33nhDycnJ190mIyNDBw4c8BsrKSlRRkZGMIcGAAB9VFBXRlavXq3CwkL94Q9/UHR0tO91HzExMYqMjJQk5eTkaPjw4XK5XJKktWvXau7cucrPz9eyZcu0a9cuVVZWaufOnSH+UQAAQG8U1JWR7du3q7GxUfPmzVNCQoLv6ze/+Y1vTm1trerq6ny3Z86cqcLCQu3cuVOTJk3Snj17VFRU1OmLXgEAwM0jqCsjgbwlSWlpaZuxrKwsZWVlBXMoAABwk+CzaQAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGAUMQIAAIwiRgAAgFHECAAAMIoYAQAARhEjAADAKGIEAAAYRYwAAACjiBEAAGBU0DFy+PBhLV++XImJibLZbCoqKup0fmlpqWw2W5uv+vr6rq4ZAAD0IUHHSHNzsyZNmqSf/exnQW1XU1Ojuro631dsbGywhwYAAH1QWLAbLFmyREuWLAn6QLGxsRo0aFDQ2wEAgL4t6BjpqsmTJ8vj8WjChAl6/PHHNWvWrA7nejweeTwe32232y1JsixLlmXd8LUC+PJcO6c5t4G+J9Dz+obHSEJCgp5//nlNmzZNHo9HL774oubNm6e33npL6enp7W7jcrmUl5fXZry4uFhRUVE3eskADCgpKTG9BAAh1tLSEtA8m9fr9Xb1IDabTXv37lVmZmZQ282dO1cjR47Uf//3f7d7f3tXRpKSknThwgU5nc6uLhdAD2RZlkpKSrRgwQLZ7XbTywEQQm63W8OGDVNjY2Onj99f2tM0XzR9+nQdOXKkw/sdDoccDkebcbvdzh8roI/i/Ab6nkDPaSPvM1JVVaWEhAQThwYAAD1M0FdGmpqadPr0ad/td999V1VVVRoyZIhGjhyp3Nxcvf/++/qv//ovSdJzzz2n5ORkjR8/Xp999plefPFFvfHGGyouLg7dTwEAAHqtoGOksrJSd911l+/2+vXrJUkrV65UQUGB6urqVFtb67v/8uXLevTRR/X+++8rKipKaWlpev311/32AQAAbl7degHrl8XtdismJua6L4AB0PtYlqV9+/Zp6dKlvGYE6GMCffzms2kAAIBRxAgAADCKGAEAAEYRIwAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRxAgAADCKGAEAAEYRIwAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRxAgAADCKGAEAAEYRIwAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRxAgAADCKGAEAAEYRIwAAwChiBAAAGEW
MAAAAo4gRAMb8+te/Vnh4uDIzMxUeHq5f//rXppcEwICgY+Tw4cNavny5EhMTZbPZVFRUdN1tSktLlZ6eLofDodGjR6ugoKALSwXQl9hsNn3zm9/0G/vmN78pm81maEUATAk6RpqbmzVp0iT97Gc/C2j+u+++q2XLlumuu+5SVVWV1q1bp4ceekj79+8PerEA+ob/Hxy33nprp/cD6NuCjpElS5boySef1L333hvQ/Oeff17JycnKz89Xamqq1qxZoxUrVmjLli1BLxZA7/fFp2LefPNNXb58Wf/xH/+hy5cv680332x3HoC+LexGH+Do0aO6++67/cYWLVqkdevWdbiNx+ORx+Px3Xa73ZIky7JkWdYNWSeAL8cXn5q58847fee0ZVm68847/eatWLHiS18fgNAJ9DH7hsdIfX294uLi/Mbi4uLkdrv16aefKjIyss02LpdLeXl5bcaLi4sVFRV1w9YK4Mtz6623at++fb7bJSUlkqSkpCSdO3dOkvzuB9D7tLS0BDTvhsdIV+Tm5mr9+vW+2263W0lJSVq4cKGcTqfBlQEIlb///e9aunSpLMtSSUmJFixYILvdrszMTN+cpUuXmlsggG679szG9dzwGImPj1dDQ4PfWENDg5xOZ7tXRSTJ4XDI4XC0Gbfb7bLb7TdknQC+HIWFhb6nat566y3fUzN2u11vvfWW3zzOd6B3C/QcvuHvM5KRkaEDBw74jZWUlCgjI+NGHxpAD5Sdne377zlz5ig8PFzf/e53FR4erjlz5rQ7D0DfFnSMNDU1qaqqSlVVVZI+/6e7VVVVqq2tlfT5Uyw5OTm++f/6r/+qv//973rsscf0zjvv6Oc//7l++9vf6t///d9D8xMA6HW8Xq/f7WuvEenofgB9W9AxUllZqSlTpmjKlCmSpPXr12vKlCn6yU9+Ikmqq6vzhYkkJScn649//KNKSko0adIk5efn68UXX9SiRYtC9CMA6I28Xq8KCwv9xgoLCwkR4CZk8/aCM9/tdismJkaNjY28gBXoYyzL0r59+7R06VJeIwL0MYE+fvPZNAAAwChiBAAAGEWMAAAAo4gRAABgFDECAACMIkYAAIBRxAgAADCKGAEAAEYRIwAAwKgb/qm9oXDtTWID/ShiAL2HZVlqaWmR2+3mHViBPuba4/b13uy9V8TIpUuXJElJSUmGVwIAAIJ16dIlxcTEdHh/r/hsmtbWVn3wwQeKjo6WzWYzvRwAIeR2u5WUlKRz587x2VNAH+P1enXp0iUlJiaqX7+OXxnSK2IEQN/FB2EC4AWsAADAKGIEAAAYRYwAMMrhcOinP/2pHA6H6aUAMITXjAAAAKO4MgIAAIwiRgAAgFHECAAAMIoYAWDMV77yFT333HO+2zabTUVFRZ1u8+CDDyozM/OGrgvAl6tXvB08gJtDXV2dBg8eLEl67733lJycrOPHj2vy5Mm+OVu3br3u51wA6F2IEQA9Rnx8/HXndPb5FgB6J56mASDp88+AeuaZZzR69Gg5HA6NHDlSTz31lCSpurpaX/va1xQZGamhQ4fq29/+tpqamnzbXnvqZPPmzUpISNDQoUO1evVqWZblm3P+/HktX75ckZGRSk5O1q9+9as2a/ji0zTJycmSpClTpshms2nevHl+x7rG4/Ho3/7t3xQbG6uIiAjNnj1bFRUVvvtLS0tls9l04MABTZs2TVFRUZo5c6ZqampC9asD0E3ECABJUm5urjZt2qQf//jH+utf/6rCwkLFxcWpublZixYt0uDBg1VRUaHdu3fr9ddf15o1a/y2P3jwoM6cOaODBw/q5ZdfVkFBgQoKCnz3P/jggzp37pwOHjyoPXv26Oc//7nOnz/f4Xr+/Oc/S5Jef/111dXV6fe//3278x577DH97ne/08svv6y3335bo0eP1qJFi/Txxx/7zfvhD3+o/Px8VVZWKiwsTP/yL//Sxd8UgJDzArjpud1ur8Ph8L7wwgtt7tu5c6d38OD
B3qamJt/YH//4R2+/fv289fX1Xq/X6125cqV31KhR3itXrvjmZGVlee+77z6v1+v11tTUeCV5//znP/vuP3XqlFeSd8uWLb4xSd69e/d6vV6v99133/VK8h4/ftxvPStXrvTec889Xq/X621qavLa7Xbvr371K9/9ly9f9iYmJnqfeeYZr9fr9R48eNAryfv666/7rV+S99NPPw3itwTgRuHKCACdOnVKHo9H8+fPb/e+SZMmacCAAb6xWbNmqbW11e+pjvHjx6t///6+2wkJCb4rH6dOnVJYWJimTp3qu3/s2LEaNGhQt9Z95swZWZalWbNm+cbsdrumT5+uU6dO+c1NS0vzW5ukTq/MAPjyECMAFBkZ2e192O12v9s2m02tra3d3m+ofHF9NptNknrU+oCbGTECQGPGjFFkZKQOHDjQ5r7U1FT95S9/UXNzs2/sT3/6k/r166eUlJSA9j927FhduXJFx44d843V1NTo4sWLHW4THh4uSbp69WqHc2677TaFh4frT3/6k2/MsixVVFRo3LhxAa0NgHn8014AioiI0Pe//3099thjCg8P16xZs/Thhx/q5MmTuv/++/XTn/5UK1eu1OOPP64PP/xQ3/3ud/XAAw8oLi4uoP2npKRo8eLF+s53vqPt27crLCxM69at6/SKTGxsrCIjI/Xaa69pxIgRioiIaPPPegcMGKBHHnlEGzZs0JAhQzRy5Eg988wzamlp0apVq7r1OwHw5eHKCABJ0o9//GM9+uij+slPfqLU1FTdd999On/+vKKiorR//359/PHHuuOOO7RixQrNnz9f27ZtC2r/v/zlL5WYmKi5c+fq61//ur797W8rNja2w/lhYWH6z//8T+3YsUOJiYm655572p23adMmfeMb39ADDzyg9PR0nT59Wvv37/e9eRqAns/m9fJWhgAAwByujAAAAKOIEQAAYBQxAgAAjCJGAACAUcQIAAAwihgBAABGESMAAMAoYgQAABhFjAAAAKOIEQAAYBQxAgAAjCJGAACAUf8HxjIn7/9H4JcAAAAASUVORK5CYII=", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "house.boxplot(column=\"condition\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datepriceConditionClip
3620140528T000000550000.02
38020140916T000000270000.02
39720140623T000000365000.02
144220141107T000000352950.02
173420150102T000000252000.02
222320150316T000000535000.02
300420141231T000000441000.02
320220140509T000000255000.02
397520150511T000000210000.02
465120141002T000000125000.02
737620141107T000000295000.02
763620150120T000000190000.02
1230620150128T000000196000.02
1245320150402T000000305000.02
1266820140729T000000227000.02
1362820140716T000000105500.02
1362920150316T000000445000.02
1498720141202T000000432500.02
1529320140506T00000078000.02
1533720140630T000000235000.02
1537120150114T000000658000.02
1571220140724T000000150000.02
1619820150324T00000081000.02
1689320141210T000000125000.02
1694220140611T000000427000.02
1780520150501T000000380000.02
1833220140924T000000130000.02
1864520141216T000000575000.02
1887620150211T0000001500000.02
1945220140926T000000142000.02
\n", "
" ], "text/plain": [ " date price ConditionClip\n", "36 20140528T000000 550000.0 2\n", "380 20140916T000000 270000.0 2\n", "397 20140623T000000 365000.0 2\n", "1442 20141107T000000 352950.0 2\n", "1734 20150102T000000 252000.0 2\n", "2223 20150316T000000 535000.0 2\n", "3004 20141231T000000 441000.0 2\n", "3202 20140509T000000 255000.0 2\n", "3975 20150511T000000 210000.0 2\n", "4651 20141002T000000 125000.0 2\n", "7376 20141107T000000 295000.0 2\n", "7636 20150120T000000 190000.0 2\n", "12306 20150128T000000 196000.0 2\n", "12453 20150402T000000 305000.0 2\n", "12668 20140729T000000 227000.0 2\n", "13628 20140716T000000 105500.0 2\n", "13629 20150316T000000 445000.0 2\n", "14987 20141202T000000 432500.0 2\n", "15293 20140506T000000 78000.0 2\n", "15337 20140630T000000 235000.0 2\n", "15371 20150114T000000 658000.0 2\n", "15712 20140724T000000 150000.0 2\n", "16198 20150324T000000 81000.0 2\n", "16893 20141210T000000 125000.0 2\n", "16942 20140611T000000 427000.0 2\n", "17805 20150501T000000 380000.0 2\n", "18332 20140924T000000 130000.0 2\n", "18645 20141216T000000 575000.0 2\n", "18876 20150211T000000 1500000.0 2\n", "19452 20140926T000000 142000.0 2" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Work on a copy so the `house` frame itself is not modified\n", "house_norm = house.copy()\n", "\n", "# Clip \"condition\" into the [2, 5] range\n", "house_norm[\"ConditionClip\"] = house_norm[\"condition\"].clip(lower=2, upper=5)\n", "\n", "# Show the rows whose original condition lies below the lower bound\n", "below_lower_bound = house_norm[\"condition\"] < 2\n", "house_norm.loc[below_lower_bound, [\"date\", \"price\", \"ConditionClip\"]]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5.0\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateconditionConditionWinsorize
3620140528T00000013
38020140916T00000013
39720140623T00000013
144220141107T00000013
173420150102T00000013
222320150316T00000013
300420141231T00000013
320220140509T00000013
397520150511T00000013
465120141002T00000013
737620141107T00000013
763620150120T00000013
1230620150128T00000013
1245320150402T00000013
1266820140729T00000013
1362820140716T00000013
1362920150316T00000013
1498720141202T00000013
1529320140506T00000013
1533720140630T00000013
1537120150114T00000013
1571220140724T00000013
1619820150324T00000013
1689320141210T00000013
1694220140611T00000013
1780520150501T00000013
1833220140924T00000013
1864520141216T00000013
1887620150211T00000013
1945220140926T00000013
\n", "
" ], "text/plain": [ " date condition ConditionWinsorize\n", "36 20140528T000000 1 3\n", "380 20140916T000000 1 3\n", "397 20140623T000000 1 3\n", "1442 20141107T000000 1 3\n", "1734 20150102T000000 1 3\n", "2223 20150316T000000 1 3\n", "3004 20141231T000000 1 3\n", "3202 20140509T000000 1 3\n", "3975 20150511T000000 1 3\n", "4651 20141002T000000 1 3\n", "7376 20141107T000000 1 3\n", "7636 20150120T000000 1 3\n", "12306 20150128T000000 1 3\n", "12453 20150402T000000 1 3\n", "12668 20140729T000000 1 3\n", "13628 20140716T000000 1 3\n", "13629 20150316T000000 1 3\n", "14987 20141202T000000 1 3\n", "15293 20140506T000000 1 3\n", "15337 20140630T000000 1 3\n", "15371 20150114T000000 1 3\n", "15712 20140724T000000 1 3\n", "16198 20150324T000000 1 3\n", "16893 20141210T000000 1 3\n", "16942 20140611T000000 1 3\n", "17805 20150501T000000 1 3\n", "18332 20140924T000000 1 3\n", "18645 20141216T000000 1 3\n", "18876 20150211T000000 1 3\n", "19452 20140926T000000 1 3" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.stats.mstats import winsorize\n", "\n", "# 95th percentile of \"condition\", printed for reference\n", "print(house_norm[\"condition\"].quantile(q=0.95))\n", "\n", "# Fill missing values with the column mean before winsorizing\n", "condition_filled = house_norm[\"condition\"].fillna(house_norm[\"condition\"].mean())\n", "\n", "# limits=(0.01, 0.05): winsorize the lowest 1% and the highest 5% of the values\n", "house_norm[\"ConditionWinsorize\"] = winsorize(\n", "    condition_filled, limits=(0.01, 0.05), inplace=False\n", ")\n", "\n", "# Show the rows whose original condition is below 2\n", "lowest_condition = house_norm[\"condition\"] < 2\n", "house_norm.loc[lowest_condition, [\"date\", \"condition\", \"ConditionWinsorize\"]]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
priceconditionConditionNormConditionClipNormConditionWinsorizeNormConditionWinsorizeNorm2
0221900.030.500.3333330.0-1.0
1538000.030.500.3333330.0-1.0
2180000.030.500.3333330.0-1.0
3604000.051.001.0000001.01.0
4510000.030.500.3333330.0-1.0
51225000.030.500.3333330.0-1.0
6257500.030.500.3333330.0-1.0
7291850.030.500.3333330.0-1.0
8229500.030.500.3333330.0-1.0
9323000.030.500.3333330.0-1.0
10662500.030.500.3333330.0-1.0
11468000.040.750.6666670.50.0
12310000.040.750.6666670.50.0
13400000.040.750.6666670.50.0
14530000.030.500.3333330.0-1.0
15650000.030.500.3333330.0-1.0
16395000.030.500.3333330.0-1.0
17485000.040.750.6666670.50.0
18189000.040.750.6666670.50.0
19230000.040.750.6666670.50.0
\n", "
" ], "text/plain": [ " price condition ConditionNorm ConditionClipNorm \\\n", "0 221900.0 3 0.50 0.333333 \n", "1 538000.0 3 0.50 0.333333 \n", "2 180000.0 3 0.50 0.333333 \n", "3 604000.0 5 1.00 1.000000 \n", "4 510000.0 3 0.50 0.333333 \n", "5 1225000.0 3 0.50 0.333333 \n", "6 257500.0 3 0.50 0.333333 \n", "7 291850.0 3 0.50 0.333333 \n", "8 229500.0 3 0.50 0.333333 \n", "9 323000.0 3 0.50 0.333333 \n", "10 662500.0 3 0.50 0.333333 \n", "11 468000.0 4 0.75 0.666667 \n", "12 310000.0 4 0.75 0.666667 \n", "13 400000.0 4 0.75 0.666667 \n", "14 530000.0 3 0.50 0.333333 \n", "15 650000.0 3 0.50 0.333333 \n", "16 395000.0 3 0.50 0.333333 \n", "17 485000.0 4 0.75 0.666667 \n", "18 189000.0 4 0.75 0.666667 \n", "19 230000.0 4 0.75 0.666667 \n", "\n", " ConditionWinsorizeNorm ConditionWinsorizeNorm2 \n", "0 0.0 -1.0 \n", "1 0.0 -1.0 \n", "2 0.0 -1.0 \n", "3 1.0 1.0 \n", "4 0.0 -1.0 \n", "5 0.0 -1.0 \n", "6 0.0 -1.0 \n", "7 0.0 -1.0 \n", "8 0.0 -1.0 \n", "9 0.0 -1.0 \n", "10 0.0 -1.0 \n", "11 0.5 0.0 \n", "12 0.5 0.0 \n", "13 0.5 0.0 \n", "14 0.0 -1.0 \n", "15 0.0 -1.0 \n", "16 0.0 -1.0 \n", "17 0.5 0.0 \n", "18 0.5 0.0 \n", "19 0.5 0.0 " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "min_max_scaler = preprocessing.MinMaxScaler()\n", "min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n", "\n", "# (new column, source column, scaler); a scaler is re-fitted on every column it scales\n", "scaling_plan = [\n", "    (\"ConditionNorm\", \"condition\", min_max_scaler),\n", "    (\"ConditionClipNorm\", \"ConditionClip\", min_max_scaler),\n", "    (\"ConditionWinsorizeNorm\", \"ConditionWinsorize\", min_max_scaler),\n", "    (\"ConditionWinsorizeNorm2\", \"ConditionWinsorize\", min_max_scaler_2),\n", "]\n", "\n", 
"for scaled_name, source_name, scaler in scaling_plan:\n", "    # The scaler is fed a single-column 2-D array; the result is flattened back to 1-D\n", "    column_2d = house_norm[source_name].to_numpy().reshape(-1, 1)\n", "    house_norm[scaled_name] = scaler.fit_transform(column_2d).ravel()\n", "\n", "comparison_columns = [\n", "    \"price\",\n", "    \"condition\",\n", "    \"ConditionNorm\",\n", "    \"ConditionClipNorm\",\n", "    \"ConditionWinsorizeNorm\",\n", "    \"ConditionWinsorizeNorm2\",\n", "]\n", "house_norm[comparison_columns].head(20)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
priceconditionConditionStandConditionClipStandConditionWinsorizeStand
0221900.03-0.629187-0.635310-0.663482
1538000.03-0.629187-0.635310-0.663482
2180000.03-0.629187-0.635310-0.663482
3604000.052.4442942.4575972.494726
4510000.03-0.629187-0.635310-0.663482
51225000.03-0.629187-0.635310-0.663482
6257500.03-0.629187-0.635310-0.663482
7291850.03-0.629187-0.635310-0.663482
8229500.03-0.629187-0.635310-0.663482
9323000.03-0.629187-0.635310-0.663482
10662500.03-0.629187-0.635310-0.663482
11468000.040.9075540.9111430.915622
12310000.040.9075540.9111430.915622
13400000.040.9075540.9111430.915622
14530000.03-0.629187-0.635310-0.663482
15650000.03-0.629187-0.635310-0.663482
16395000.03-0.629187-0.635310-0.663482
17485000.040.9075540.9111430.915622
18189000.040.9075540.9111430.915622
19230000.040.9075540.9111430.915622
\n", "
" ], "text/plain": [ " price condition ConditionStand ConditionClipStand \\\n", "0 221900.0 3 -0.629187 -0.635310 \n", "1 538000.0 3 -0.629187 -0.635310 \n", "2 180000.0 3 -0.629187 -0.635310 \n", "3 604000.0 5 2.444294 2.457597 \n", "4 510000.0 3 -0.629187 -0.635310 \n", "5 1225000.0 3 -0.629187 -0.635310 \n", "6 257500.0 3 -0.629187 -0.635310 \n", "7 291850.0 3 -0.629187 -0.635310 \n", "8 229500.0 3 -0.629187 -0.635310 \n", "9 323000.0 3 -0.629187 -0.635310 \n", "10 662500.0 3 -0.629187 -0.635310 \n", "11 468000.0 4 0.907554 0.911143 \n", "12 310000.0 4 0.907554 0.911143 \n", "13 400000.0 4 0.907554 0.911143 \n", "14 530000.0 3 -0.629187 -0.635310 \n", "15 650000.0 3 -0.629187 -0.635310 \n", "16 395000.0 3 -0.629187 -0.635310 \n", "17 485000.0 4 0.907554 0.911143 \n", "18 189000.0 4 0.907554 0.911143 \n", "19 230000.0 4 0.907554 0.911143 \n", "\n", " ConditionWinsorizeStand \n", "0 -0.663482 \n", "1 -0.663482 \n", "2 -0.663482 \n", "3 2.494726 \n", "4 -0.663482 \n", "5 -0.663482 \n", "6 -0.663482 \n", "7 -0.663482 \n", "8 -0.663482 \n", "9 -0.663482 \n", "10 -0.663482 \n", "11 0.915622 \n", "12 0.915622 \n", "13 0.915622 \n", "14 -0.663482 \n", "15 -0.663482 \n", "16 -0.663482 \n", "17 0.915622 \n", "18 0.915622 \n", "19 0.915622 " ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "stndart_scaler = preprocessing.StandardScaler()\n", "\n", "house_norm[\"ConditionStand\"] = stndart_scaler.fit_transform(\n", " house_norm[\"condition\"].to_numpy().reshape(-1, 1)\n", ").reshape(house_norm[\"condition\"].shape)\n", "\n", "house_norm[\"ConditionClipStand\"] = stndart_scaler.fit_transform(\n", " house_norm[\"ConditionClip\"].to_numpy().reshape(-1, 1)\n", ").reshape(house_norm[\"condition\"].shape)\n", "\n", "house_norm[\"ConditionWinsorizeStand\"] = stndart_scaler.fit_transform(\n", " house_norm[\"ConditionWinsorize\"].to_numpy().reshape(-1, 1)\n", 
").reshape(house_norm[\"condition\"].shape)\n", "\n", "house_norm[\n", " [\n", " \"price\",\n", " \"condition\",\n", " \"ConditionStand\",\n", " \"ConditionClipStand\",\n", " \"ConditionWinsorizeStand\",\n", " ]\n", "].head(20)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }