Mii_Kislitsa_Egor_Pibd_33/lec3.ipynb

148 KiB
Raw Permalink Blame History

In [17]:
import pandas as pd

house = pd.read_csv("data/kc_house_data.csv", index_col="id")

house = house.reset_index(drop=True)

house
Out[17]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 3 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 3 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 3 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 5 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 3 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21608 20140521T000000 360000.0 3 2.50 1530 1131 3.0 0 0 3 8 1530 0 2009 0 98103 47.6993 -122.346 1530 1509
21609 20150223T000000 400000.0 4 2.50 2310 5813 2.0 0 0 3 8 2310 0 2014 0 98146 47.5107 -122.362 1830 7200
21610 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 3 7 1020 0 2009 0 98144 47.5944 -122.299 1020 2007
21611 20150116T000000 400000.0 3 2.50 1600 2388 2.0 0 0 3 8 1600 0 2004 0 98027 47.5345 -122.069 1410 1287
21612 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 3 7 1020 0 2008 0 98144 47.5941 -122.299 1020 1357

21613 rows × 20 columns

In [18]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np


encoder = OneHotEncoder(sparse_output=False, drop="first")

encoded_values = encoder.fit_transform(house[["yr_built", "price"]])

encoded_columns = encoder.get_feature_names_out(["yr_built", "price"])

encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)

encoded_values_df
Out[18]:
yr_built_1901 yr_built_1902 yr_built_1903 yr_built_1904 yr_built_1905 yr_built_1906 yr_built_1907 yr_built_1908 yr_built_1909 yr_built_1910 ... price_4489000.0 price_4500000.0 price_4668000.0 price_5110800.0 price_5300000.0 price_5350000.0 price_5570000.0 price_6885000.0 price_7062500.0 price_7700000.0
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21609 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21610 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21611 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21612 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

21613 rows × 4142 columns

In [5]:
house = pd.concat([house, encoded_values_df], axis=1)

house
Out[5]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition ... price_4489000.0 price_4500000.0 price_4668000.0 price_5110800.0 price_5300000.0 price_5350000.0 price_5570000.0 price_6885000.0 price_7062500.0 price_7700000.0
0 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 5 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21608 20140521T000000 360000.0 3 2.50 1530 1131 3.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21609 20150223T000000 400000.0 4 2.50 2310 5813 2.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21610 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21611 20150116T000000 400000.0 3 2.50 1600 2388 2.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21612 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 3 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

21613 rows × 4162 columns

In [6]:
labels = ["old", "middle", "new"]
num_bins = 3
In [19]:
hist1, bins1 = np.histogram(
    house["yr_built"].fillna(house["yr_built"].median()), bins=num_bins
)
bins1, hist1
Out[19]:
(array([1900.        , 1938.33333333, 1976.66666667, 2015.        ]),
 array([ 3067,  8120, 10426]))
In [20]:
pd.concat([house["yr_built"], pd.cut(house["yr_built"], list(bins1))], axis=1).head(20)
Out[20]:
yr_built yr_built
0 1955 (1938.333, 1976.667]
1 1951 (1938.333, 1976.667]
2 1933 (1900.0, 1938.333]
3 1965 (1938.333, 1976.667]
4 1987 (1976.667, 2015.0]
5 2001 (1976.667, 2015.0]
6 1995 (1976.667, 2015.0]
7 1963 (1938.333, 1976.667]
8 1960 (1938.333, 1976.667]
9 2003 (1976.667, 2015.0]
10 1965 (1938.333, 1976.667]
11 1942 (1938.333, 1976.667]
12 1927 (1900.0, 1938.333]
13 1977 (1976.667, 2015.0]
14 1900 NaN
15 1979 (1976.667, 2015.0]
16 1994 (1976.667, 2015.0]
17 1916 (1900.0, 1938.333]
18 1921 (1900.0, 1938.333]
19 1969 (1938.333, 1976.667]
In [21]:
pd.concat(
    [house["yr_built"], pd.cut(house["yr_built"], list(bins1), labels=labels)], axis=1
).head(20)
Out[21]:
yr_built yr_built
0 1955 middle
1 1951 middle
2 1933 old
3 1965 middle
4 1987 new
5 2001 new
6 1995 new
7 1963 middle
8 1960 middle
9 2003 new
10 1965 middle
11 1942 middle
12 1927 old
13 1977 new
14 1900 NaN
15 1979 new
16 1994 new
17 1916 old
18 1921 old
19 1969 middle
In [11]:
bins2 = np.linspace(1899, 2015, 5)
tmp_bins2 = np.digitize(house["yr_built"].fillna(house["yr_built"].median()), bins2)
hist2 = np.bincount(tmp_bins2 - 1)
bins2, hist2
Out[11]:
(array([1899., 1928., 1957., 1986., 2015.]),
 array([2403, 4230, 6914, 8028,   38]))
In [22]:
pd.concat([house["yr_built"], pd.cut(house["yr_built"], list(bins2))], axis=1).head(20)
Out[22]:
yr_built yr_built
0 1955 (1928.0, 1957.0]
1 1951 (1928.0, 1957.0]
2 1933 (1928.0, 1957.0]
3 1965 (1957.0, 1986.0]
4 1987 (1986.0, 2015.0]
5 2001 (1986.0, 2015.0]
6 1995 (1986.0, 2015.0]
7 1963 (1957.0, 1986.0]
8 1960 (1957.0, 1986.0]
9 2003 (1986.0, 2015.0]
10 1965 (1957.0, 1986.0]
11 1942 (1928.0, 1957.0]
12 1927 (1899.0, 1928.0]
13 1977 (1957.0, 1986.0]
14 1900 (1899.0, 1928.0]
15 1979 (1957.0, 1986.0]
16 1994 (1986.0, 2015.0]
17 1916 (1899.0, 1928.0]
18 1921 (1899.0, 1928.0]
19 1969 (1957.0, 1986.0]
In [23]:
hist3, bins3 = np.histogram(
    house["yr_built"].fillna(house["yr_built"].median()), bins=[1899, 1957, 2001, 2015]
)
bins3, hist3
Out[23]:
(array([1899, 1957, 2001, 2015]), array([ 6633, 10439,  4541]))
In [24]:
pd.concat([house["yr_built"], pd.cut(house["yr_built"], list(bins3))], axis=1).head(20)
Out[24]:
yr_built yr_built
0 1955 (1899, 1957]
1 1951 (1899, 1957]
2 1933 (1899, 1957]
3 1965 (1957, 2001]
4 1987 (1957, 2001]
5 2001 (1957, 2001]
6 1995 (1957, 2001]
7 1963 (1957, 2001]
8 1960 (1957, 2001]
9 2003 (2001, 2015]
10 1965 (1957, 2001]
11 1942 (1899, 1957]
12 1927 (1899, 1957]
13 1977 (1957, 2001]
14 1900 (1899, 1957]
15 1979 (1957, 2001]
16 1994 (1957, 2001]
17 1916 (1899, 1957]
18 1921 (1899, 1957]
19 1969 (1957, 2001]
In [25]:
pd.concat(
    [house["yr_built"], pd.cut(house["yr_built"], list(bins3), labels=labels)],
    axis=1,
).head(20)
Out[25]:
yr_built yr_built
0 1955 old
1 1951 old
2 1933 old
3 1965 middle
4 1987 middle
5 2001 middle
6 1995 middle
7 1963 middle
8 1960 middle
9 2003 new
10 1965 middle
11 1942 old
12 1927 old
13 1977 middle
14 1900 old
15 1979 middle
16 1994 middle
17 1916 old
18 1921 old
19 1969 middle
In [26]:
pd.concat(
    [house["yr_built"], pd.qcut(house["yr_built"], q=3, labels=False)], axis=1
).head(20)
Out[26]:
yr_built yr_built
0 1955 0
1 1951 0
2 1933 0
3 1965 1
4 1987 1
5 2001 2
6 1995 2
7 1963 1
8 1960 1
9 2003 2
10 1965 1
11 1942 0
12 1927 0
13 1977 1
14 1900 0
15 1979 1
16 1994 2
17 1916 0
18 1921 0
19 1969 1
In [27]:
pd.concat(
    [house["yr_built"], pd.qcut(house["yr_built"], q=3, labels=labels)], axis=1
).head(20)
Out[27]:
yr_built yr_built
0 1955 old
1 1951 old
2 1933 old
3 1965 middle
4 1987 middle
5 2001 new
6 1995 new
7 1963 middle
8 1960 middle
9 2003 new
10 1965 middle
11 1942 old
12 1927 old
13 1977 middle
14 1900 old
15 1979 middle
16 1994 new
17 1916 old
18 1921 old
19 1969 middle

Пример конструирования признаков на основе существующих

In [28]:
house_cleaned = house.drop(["waterfront", "view", "condition"], axis=1, errors="ignore")

house_cleaned = house_cleaned.dropna()

# Признак "Price_category": разделение домов на категории цен
house_cleaned["Price_category"] = pd.qcut(
    house_cleaned["price"], q=3, labels=["Low", "Medium", "High"]
)

# Признак "Renovated_flag": 1, если дом был отремонтирован, иначе 0
house_cleaned["Renovated_flag"] = house_cleaned["yr_renovated"].apply(
    lambda x: 1 if x > 0 else 0
)

# Признак "Zipcode_area": используем первые три цифры из почтового индекса
house_cleaned["Zipcode_area"] = house_cleaned["zipcode"].apply(lambda x: str(x)[:3])

house_cleaned
Out[28]:
date price bedrooms bathrooms sqft_living sqft_lot floors grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 Price_category Renovated_flag Zipcode_area
0 20141013T000000 221900.0 3 1.00 1180 5650 1.0 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650 Low 0 981
1 20141209T000000 538000.0 3 2.25 2570 7242 2.0 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639 Medium 1 981
2 20150225T000000 180000.0 2 1.00 770 10000 1.0 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062 Low 0 980
3 20141209T000000 604000.0 4 3.00 1960 5000 1.0 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000 High 0 981
4 20150218T000000 510000.0 3 2.00 1680 8080 1.0 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503 Medium 0 980
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21608 20140521T000000 360000.0 3 2.50 1530 1131 3.0 8 1530 0 2009 0 98103 47.6993 -122.346 1530 1509 Low 0 981
21609 20150223T000000 400000.0 4 2.50 2310 5813 2.0 8 2310 0 2014 0 98146 47.5107 -122.362 1830 7200 Medium 0 981
21610 20140623T000000 402101.0 2 0.75 1020 1350 2.0 7 1020 0 2009 0 98144 47.5944 -122.299 1020 2007 Medium 0 981
21611 20150116T000000 400000.0 3 2.50 1600 2388 2.0 8 1600 0 2004 0 98027 47.5345 -122.069 1410 1287 Medium 0 980
21612 20141015T000000 325000.0 2 0.75 1020 1076 2.0 7 1020 0 2008 0 98144 47.5941 -122.299 1020 1357 Low 0 981

21613 rows × 20 columns

In [30]:
import featuretools as ft
from woodwork.logical_types import Categorical, Datetime

import featuretools as ft
from woodwork.logical_types import Categorical, Datetime
import pandas as pd

# Загрузка данных
df = pd.read_csv("data/kc_house_data.csv")

# Убедимся, что есть уникальный идентификатор для каждой строки (если нет, создаем)
df["id"] = range(len(df))

# Создаем EntitySet для данных о домах
es = ft.EntitySet(id="house_sales")

# Добавляем основной DataFrame в EntitySet с указанием типов данных
es = es.add_dataframe(
    dataframe_name="houses",
    dataframe=df,
    index="id",  # Уникальный идентификатор для домов
    logical_types={
        "date": Datetime,
        "zipcode": Categorical,
        "condition": Categorical,
        "grade": Categorical,
        "view": Categorical,
        "waterfront": Categorical,
    },
)
ft.primitives.list_primitives()
# Автоматическое конструирование признаков с применением корректных примитивов
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="houses",  # Название основной таблицы
    agg_primitives=["mean", "count", "mode", "sum"],  # Агрегирующие примитивы
    trans_primitives=[
        "year",
        "month",
        "weekday",
        "hour",
    ],  # Корректные трансформационные примитивы
    max_depth=2,  # Максимальная глубина для генерации признаков
)

# Просмотр полученной feature_matrix
print(feature_matrix.head())
       price  bedrooms  bathrooms  sqft_living  sqft_lot  floors waterfront  \
id                                                                            
0   221900.0         3       1.00         1180      5650     1.0          0   
1   538000.0         3       2.25         2570      7242     2.0          0   
2   180000.0         2       1.00          770     10000     1.0          0   
3   604000.0         4       3.00         1960      5000     1.0          0   
4   510000.0         3       2.00         1680      8080     1.0          0   

   view condition grade  ...  yr_renovated  zipcode      lat     long  \
id                       ...                                            
0     0         3     7  ...             0    98178  47.5112 -122.257   
1     0         3     7  ...          1991    98125  47.7210 -122.319   
2     0         3     6  ...             0    98028  47.7379 -122.233   
3     0         5     7  ...             0    98136  47.5208 -122.393   
4     0         3     8  ...             0    98074  47.6168 -122.045   

   sqft_living15  sqft_lot15  HOUR(date)  MONTH(date)  WEEKDAY(date)  \
id                                                                     
0           1340        5650           0           10              0   
1           1690        7639           0           12              1   
2           2720        8062           0            2              2   
3           1360        5000           0           12              1   
4           1800        7503           0            2              2   

   YEAR(date)  
id             
0        2014  
1        2014  
2        2015  
3        2014  
4        2015  

[5 rows x 23 columns]
c:\Mai\mai\.venv\Lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created
  warnings.warn(
c:\Mai\mai\.venv\Lib\site-packages\featuretools\synthesis\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:
  agg_primitives: ['count', 'mean', 'mode', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.
  warnings.warn(warning_msg, UnusedPrimitiveWarning)
In [31]:
house.boxplot(column="condition")
Out[31]:
<Axes: >
No description has been provided for this image
In [33]:
house_norm = house.copy()

house_norm["ConditionClip"] = house["condition"].clip(2, 5)

house_norm[house_norm["condition"] < 2][["date", "price", "ConditionClip"]]
Out[33]:
date price ConditionClip
36 20140528T000000 550000.0 2
380 20140916T000000 270000.0 2
397 20140623T000000 365000.0 2
1442 20141107T000000 352950.0 2
1734 20150102T000000 252000.0 2
2223 20150316T000000 535000.0 2
3004 20141231T000000 441000.0 2
3202 20140509T000000 255000.0 2
3975 20150511T000000 210000.0 2
4651 20141002T000000 125000.0 2
7376 20141107T000000 295000.0 2
7636 20150120T000000 190000.0 2
12306 20150128T000000 196000.0 2
12453 20150402T000000 305000.0 2
12668 20140729T000000 227000.0 2
13628 20140716T000000 105500.0 2
13629 20150316T000000 445000.0 2
14987 20141202T000000 432500.0 2
15293 20140506T000000 78000.0 2
15337 20140630T000000 235000.0 2
15371 20150114T000000 658000.0 2
15712 20140724T000000 150000.0 2
16198 20150324T000000 81000.0 2
16893 20141210T000000 125000.0 2
16942 20140611T000000 427000.0 2
17805 20150501T000000 380000.0 2
18332 20140924T000000 130000.0 2
18645 20141216T000000 575000.0 2
18876 20150211T000000 1500000.0 2
19452 20140926T000000 142000.0 2
In [34]:
from scipy.stats.mstats import winsorize

print(house_norm["condition"].quantile(q=0.95))

house_norm["ConditionWinsorize"] = winsorize(
    house_norm["condition"].fillna(house_norm["condition"].mean()), (0.01, 0.05), inplace=False
)

house_norm[house_norm["condition"] < 2][["date", "condition", "ConditionWinsorize"]]
5.0
Out[34]:
date condition ConditionWinsorize
36 20140528T000000 1 3
380 20140916T000000 1 3
397 20140623T000000 1 3
1442 20141107T000000 1 3
1734 20150102T000000 1 3
2223 20150316T000000 1 3
3004 20141231T000000 1 3
3202 20140509T000000 1 3
3975 20150511T000000 1 3
4651 20141002T000000 1 3
7376 20141107T000000 1 3
7636 20150120T000000 1 3
12306 20150128T000000 1 3
12453 20150402T000000 1 3
12668 20140729T000000 1 3
13628 20140716T000000 1 3
13629 20150316T000000 1 3
14987 20141202T000000 1 3
15293 20140506T000000 1 3
15337 20140630T000000 1 3
15371 20150114T000000 1 3
15712 20140724T000000 1 3
16198 20150324T000000 1 3
16893 20141210T000000 1 3
16942 20140611T000000 1 3
17805 20150501T000000 1 3
18332 20140924T000000 1 3
18645 20141216T000000 1 3
18876 20150211T000000 1 3
19452 20140926T000000 1 3
In [35]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))

house_norm["ConditionNorm"] = min_max_scaler.fit_transform(
    house_norm["condition"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionClipNorm"] = min_max_scaler.fit_transform(
    house_norm["ConditionClip"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionWinsorizeNorm"] = min_max_scaler.fit_transform(
    house_norm["ConditionWinsorize"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionWinsorizeNorm2"] = min_max_scaler_2.fit_transform(
    house_norm["ConditionWinsorize"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm[
    [
        "price",
        "condition",
        "ConditionNorm",
        "ConditionClipNorm",
        "ConditionWinsorizeNorm",
        "ConditionWinsorizeNorm2",
    ]
].head(20)
Out[35]:
price condition ConditionNorm ConditionClipNorm ConditionWinsorizeNorm ConditionWinsorizeNorm2
0 221900.0 3 0.50 0.333333 0.0 -1.0
1 538000.0 3 0.50 0.333333 0.0 -1.0
2 180000.0 3 0.50 0.333333 0.0 -1.0
3 604000.0 5 1.00 1.000000 1.0 1.0
4 510000.0 3 0.50 0.333333 0.0 -1.0
5 1225000.0 3 0.50 0.333333 0.0 -1.0
6 257500.0 3 0.50 0.333333 0.0 -1.0
7 291850.0 3 0.50 0.333333 0.0 -1.0
8 229500.0 3 0.50 0.333333 0.0 -1.0
9 323000.0 3 0.50 0.333333 0.0 -1.0
10 662500.0 3 0.50 0.333333 0.0 -1.0
11 468000.0 4 0.75 0.666667 0.5 0.0
12 310000.0 4 0.75 0.666667 0.5 0.0
13 400000.0 4 0.75 0.666667 0.5 0.0
14 530000.0 3 0.50 0.333333 0.0 -1.0
15 650000.0 3 0.50 0.333333 0.0 -1.0
16 395000.0 3 0.50 0.333333 0.0 -1.0
17 485000.0 4 0.75 0.666667 0.5 0.0
18 189000.0 4 0.75 0.666667 0.5 0.0
19 230000.0 4 0.75 0.666667 0.5 0.0
In [36]:
from sklearn import preprocessing

stndart_scaler = preprocessing.StandardScaler()

house_norm["ConditionStand"] = stndart_scaler.fit_transform(
    house_norm["condition"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionClipStand"] = stndart_scaler.fit_transform(
    house_norm["ConditionClip"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionWinsorizeStand"] = stndart_scaler.fit_transform(
    house_norm["ConditionWinsorize"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm[
    [
        "price",
        "condition",
        "ConditionStand",
        "ConditionClipStand",
        "ConditionWinsorizeStand",
    ]
].head(20)
Out[36]:
price condition ConditionStand ConditionClipStand ConditionWinsorizeStand
0 221900.0 3 -0.629187 -0.635310 -0.663482
1 538000.0 3 -0.629187 -0.635310 -0.663482
2 180000.0 3 -0.629187 -0.635310 -0.663482
3 604000.0 5 2.444294 2.457597 2.494726
4 510000.0 3 -0.629187 -0.635310 -0.663482
5 1225000.0 3 -0.629187 -0.635310 -0.663482
6 257500.0 3 -0.629187 -0.635310 -0.663482
7 291850.0 3 -0.629187 -0.635310 -0.663482
8 229500.0 3 -0.629187 -0.635310 -0.663482
9 323000.0 3 -0.629187 -0.635310 -0.663482
10 662500.0 3 -0.629187 -0.635310 -0.663482
11 468000.0 4 0.907554 0.911143 0.915622
12 310000.0 4 0.907554 0.911143 0.915622
13 400000.0 4 0.907554 0.911143 0.915622
14 530000.0 3 -0.629187 -0.635310 -0.663482
15 650000.0 3 -0.629187 -0.635310 -0.663482
16 395000.0 3 -0.629187 -0.635310 -0.663482
17 485000.0 4 0.907554 0.911143 0.915622
18 189000.0 4 0.907554 0.911143 0.915622
19 230000.0 4 0.907554 0.911143 0.915622