Mii_Kislitsa_Egor_Pibd_33/lec3.ipynb at Lab4

In [17]:

import pandas as pd

house = pd.read_csv("data/kc_house_data.csv", index_col="id")

house = house.reset_index(drop=True)

house

Out[17]:

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	20141013T000000	221900.0	3	1.00	1180	5650	1.0	0	0	3	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	20141209T000000	538000.0	3	2.25	2570	7242	2.0	0	0	3	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	20150225T000000	180000.0	2	1.00	770	10000	1.0	0	0	3	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	20141209T000000	604000.0	4	3.00	1960	5000	1.0	0	0	5	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	20150218T000000	510000.0	3	2.00	1680	8080	1.0	0	0	3	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21608	20140521T000000	360000.0	3	2.50	1530	1131	3.0	0	0	3	8	1530	0	2009	0	98103	47.6993	-122.346	1530	1509
21609	20150223T000000	400000.0	4	2.50	2310	5813	2.0	0	0	3	8	2310	0	2014	0	98146	47.5107	-122.362	1830	7200
21610	20140623T000000	402101.0	2	0.75	1020	1350	2.0	0	0	3	7	1020	0	2009	0	98144	47.5944	-122.299	1020	2007
21611	20150116T000000	400000.0	3	2.50	1600	2388	2.0	0	0	3	8	1600	0	2004	0	98027	47.5345	-122.069	1410	1287
21612	20141015T000000	325000.0	2	0.75	1020	1076	2.0	0	0	3	7	1020	0	2008	0	98144	47.5941	-122.299	1020	1357

21613 rows × 20 columns

In [18]:

from sklearn.preprocessing import OneHotEncoder
import numpy as np


encoder = OneHotEncoder(sparse_output=False, drop="first")

encoded_values = encoder.fit_transform(house[["yr_built", "price"]])

encoded_columns = encoder.get_feature_names_out(["yr_built", "price"])

encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)

encoded_values_df

Out[18]:

	yr_built_1901	yr_built_1902	yr_built_1903	yr_built_1904	yr_built_1905	yr_built_1906	yr_built_1907	yr_built_1908	yr_built_1909	yr_built_1910	...	price_4489000.0	price_4500000.0	price_4668000.0	price_5110800.0	price_5300000.0	price_5350000.0	price_5570000.0	price_6885000.0	price_7062500.0	price_7700000.0
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21608	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21609	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21610	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21611	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21612	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

21613 rows × 4142 columns

In [5]:

house = pd.concat([house, encoded_values_df], axis=1)

house

Out[5]:

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	...	price_4489000.0	price_4500000.0	price_4668000.0	price_5110800.0	price_5300000.0	price_5350000.0	price_5570000.0	price_6885000.0	price_7062500.0	price_7700000.0
0	20141013T000000	221900.0	3	1.00	1180	5650	1.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	20141209T000000	538000.0	3	2.25	2570	7242	2.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	20150225T000000	180000.0	2	1.00	770	10000	1.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	20141209T000000	604000.0	4	3.00	1960	5000	1.0	0	0	5	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	20150218T000000	510000.0	3	2.00	1680	8080	1.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21608	20140521T000000	360000.0	3	2.50	1530	1131	3.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21609	20150223T000000	400000.0	4	2.50	2310	5813	2.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21610	20140623T000000	402101.0	2	0.75	1020	1350	2.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21611	20150116T000000	400000.0	3	2.50	1600	2388	2.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
21612	20141015T000000	325000.0	2	0.75	1020	1076	2.0	0	0	3	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

21613 rows × 4162 columns

In [6]:

labels = ["old", "middle", "new"]
num_bins = 3

In [19]:

hist1, bins1 = np.histogram(
    house["yr_built"].fillna(house["yr_built"].median()), bins=num_bins
)
bins1, hist1

Out[19]:

(array([1900.        , 1938.33333333, 1976.66666667, 2015.        ]),
 array([ 3067,  8120, 10426]))

In [20]:

pd.concat([house["yr_built"], pd.cut(house["yr_built"], list(bins1))], axis=1).head(20)

Out[20]:

	yr_built	yr_built
0	1955	(1938.333, 1976.667]
1	1951	(1938.333, 1976.667]
2	1933	(1900.0, 1938.333]
3	1965	(1938.333, 1976.667]
4	1987	(1976.667, 2015.0]
5	2001	(1976.667, 2015.0]
6	1995	(1976.667, 2015.0]
7	1963	(1938.333, 1976.667]
8	1960	(1938.333, 1976.667]
9	2003	(1976.667, 2015.0]
10	1965	(1938.333, 1976.667]
11	1942	(1938.333, 1976.667]
12	1927	(1900.0, 1938.333]
13	1977	(1976.667, 2015.0]
14	1900	NaN
15	1979	(1976.667, 2015.0]
16	1994	(1976.667, 2015.0]
17	1916	(1900.0, 1938.333]
18	1921	(1900.0, 1938.333]
19	1969	(1938.333, 1976.667]

In [21]:

pd.concat(
    [house["yr_built"], pd.cut(house["yr_built"], list(bins1), labels=labels)], axis=1
).head(20)

Out[21]:

	yr_built	yr_built
0	1955	middle
1	1951	middle
2	1933	old
3	1965	middle
4	1987	new
5	2001	new
6	1995	new
7	1963	middle
8	1960	middle
9	2003	new
10	1965	middle
11	1942	middle
12	1927	old
13	1977	new
14	1900	NaN
15	1979	new
16	1994	new
17	1916	old
18	1921	old
19	1969	middle

In [11]:

bins2 = np.linspace(1899, 2015, 5)
tmp_bins2 = np.digitize(house["yr_built"].fillna(house["yr_built"].median()), bins2)
hist2 = np.bincount(tmp_bins2 - 1)
bins2, hist2

Out[11]:

(array([1899., 1928., 1957., 1986., 2015.]),
 array([2403, 4230, 6914, 8028,   38]))

In [22]:

pd.concat([house["yr_built"], pd.cut(house["yr_built"], list(bins2))], axis=1).head(20)

Out[22]:

	yr_built	yr_built
0	1955	(1928.0, 1957.0]
1	1951	(1928.0, 1957.0]
2	1933	(1928.0, 1957.0]
3	1965	(1957.0, 1986.0]
4	1987	(1986.0, 2015.0]
5	2001	(1986.0, 2015.0]
6	1995	(1986.0, 2015.0]
7	1963	(1957.0, 1986.0]
8	1960	(1957.0, 1986.0]
9	2003	(1986.0, 2015.0]
10	1965	(1957.0, 1986.0]
11	1942	(1928.0, 1957.0]
12	1927	(1899.0, 1928.0]
13	1977	(1957.0, 1986.0]
14	1900	(1899.0, 1928.0]
15	1979	(1957.0, 1986.0]
16	1994	(1986.0, 2015.0]
17	1916	(1899.0, 1928.0]
18	1921	(1899.0, 1928.0]
19	1969	(1957.0, 1986.0]

In [23]:

hist3, bins3 = np.histogram(
    house["yr_built"].fillna(house["yr_built"].median()), bins=[1899, 1957, 2001, 2015]
)
bins3, hist3

Out[23]:

(array([1899, 1957, 2001, 2015]), array([ 6633, 10439,  4541]))

In [24]:

pd.concat([house["yr_built"], pd.cut(house["yr_built"], list(bins3))], axis=1).head(20)

Out[24]:

	yr_built	yr_built
0	1955	(1899, 1957]
1	1951	(1899, 1957]
2	1933	(1899, 1957]
3	1965	(1957, 2001]
4	1987	(1957, 2001]
5	2001	(1957, 2001]
6	1995	(1957, 2001]
7	1963	(1957, 2001]
8	1960	(1957, 2001]
9	2003	(2001, 2015]
10	1965	(1957, 2001]
11	1942	(1899, 1957]
12	1927	(1899, 1957]
13	1977	(1957, 2001]
14	1900	(1899, 1957]
15	1979	(1957, 2001]
16	1994	(1957, 2001]
17	1916	(1899, 1957]
18	1921	(1899, 1957]
19	1969	(1957, 2001]

In [25]:

pd.concat(
    [house["yr_built"], pd.cut(house["yr_built"], list(bins3), labels=labels)],
    axis=1,
).head(20)

Out[25]:

	yr_built	yr_built
0	1955	old
1	1951	old
2	1933	old
3	1965	middle
4	1987	middle
5	2001	middle
6	1995	middle
7	1963	middle
8	1960	middle
9	2003	new
10	1965	middle
11	1942	old
12	1927	old
13	1977	middle
14	1900	old
15	1979	middle
16	1994	middle
17	1916	old
18	1921	old
19	1969	middle

In [26]:

pd.concat(
    [house["yr_built"], pd.qcut(house["yr_built"], q=3, labels=False)], axis=1
).head(20)

Out[26]:

	yr_built	yr_built
0	1955	0
1	1951	0
2	1933	0
3	1965	1
4	1987	1
5	2001	2
6	1995	2
7	1963	1
8	1960	1
9	2003	2
10	1965	1
11	1942	0
12	1927	0
13	1977	1
14	1900	0
15	1979	1
16	1994	2
17	1916	0
18	1921	0
19	1969	1

In [27]:

pd.concat(
    [house["yr_built"], pd.qcut(house["yr_built"], q=3, labels=labels)], axis=1
).head(20)

Out[27]:

	yr_built	yr_built
0	1955	old
1	1951	old
2	1933	old
3	1965	middle
4	1987	middle
5	2001	new
6	1995	new
7	1963	middle
8	1960	middle
9	2003	new
10	1965	middle
11	1942	old
12	1927	old
13	1977	middle
14	1900	old
15	1979	middle
16	1994	new
17	1916	old
18	1921	old
19	1969	middle

Пример конструирования признаков на основе существующих

In [28]:

house_cleaned = house.drop(["waterfront", "view", "condition"], axis=1, errors="ignore")

house_cleaned = house_cleaned.dropna()

# Признак "Price_category": разделение домов на категории цен
house_cleaned["Price_category"] = pd.qcut(
    house_cleaned["price"], q=3, labels=["Low", "Medium", "High"]
)

# Признак "Renovated_flag": 1, если дом был отремонтирован, иначе 0
house_cleaned["Renovated_flag"] = house_cleaned["yr_renovated"].apply(
    lambda x: 1 if x > 0 else 0
)

# Признак "Zipcode_area": используем первые три цифры из почтового индекса
house_cleaned["Zipcode_area"] = house_cleaned["zipcode"].apply(lambda x: str(x)[:3])

house_cleaned

Out[28]:

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15	Price_category	Renovated_flag	Zipcode_area
0	20141013T000000	221900.0	3	1.00	1180	5650	1.0	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650	Low	0	981
1	20141209T000000	538000.0	3	2.25	2570	7242	2.0	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639	Medium	1	981
2	20150225T000000	180000.0	2	1.00	770	10000	1.0	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062	Low	0	980
3	20141209T000000	604000.0	4	3.00	1960	5000	1.0	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000	High	0	981
4	20150218T000000	510000.0	3	2.00	1680	8080	1.0	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503	Medium	0	980
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21608	20140521T000000	360000.0	3	2.50	1530	1131	3.0	8	1530	0	2009	0	98103	47.6993	-122.346	1530	1509	Low	0	981
21609	20150223T000000	400000.0	4	2.50	2310	5813	2.0	8	2310	0	2014	0	98146	47.5107	-122.362	1830	7200	Medium	0	981
21610	20140623T000000	402101.0	2	0.75	1020	1350	2.0	7	1020	0	2009	0	98144	47.5944	-122.299	1020	2007	Medium	0	981
21611	20150116T000000	400000.0	3	2.50	1600	2388	2.0	8	1600	0	2004	0	98027	47.5345	-122.069	1410	1287	Medium	0	980
21612	20141015T000000	325000.0	2	0.75	1020	1076	2.0	7	1020	0	2008	0	98144	47.5941	-122.299	1020	1357	Low	0	981

21613 rows × 20 columns

In [30]:

import featuretools as ft
from woodwork.logical_types import Categorical, Datetime

import featuretools as ft
from woodwork.logical_types import Categorical, Datetime
import pandas as pd

# Загрузка данных
df = pd.read_csv("data/kc_house_data.csv")

# Убедимся, что есть уникальный идентификатор для каждой строки (если нет, создаем)
df["id"] = range(len(df))

# Создаем EntitySet для данных о домах
es = ft.EntitySet(id="house_sales")

# Добавляем основной DataFrame в EntitySet с указанием типов данных
es = es.add_dataframe(
    dataframe_name="houses",
    dataframe=df,
    index="id",  # Уникальный идентификатор для домов
    logical_types={
        "date": Datetime,
        "zipcode": Categorical,
        "condition": Categorical,
        "grade": Categorical,
        "view": Categorical,
        "waterfront": Categorical,
    },
)
ft.primitives.list_primitives()
# Автоматическое конструирование признаков с применением корректных примитивов
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="houses",  # Название основной таблицы
    agg_primitives=["mean", "count", "mode", "sum"],  # Агрегирующие примитивы
    trans_primitives=[
        "year",
        "month",
        "weekday",
        "hour",
    ],  # Корректные трансформационные примитивы
    max_depth=2,  # Максимальная глубина для генерации признаков
)

# Просмотр полученной feature_matrix
print(feature_matrix.head())

       price  bedrooms  bathrooms  sqft_living  sqft_lot  floors waterfront  \
id                                                                            
0   221900.0         3       1.00         1180      5650     1.0          0   
1   538000.0         3       2.25         2570      7242     2.0          0   
2   180000.0         2       1.00          770     10000     1.0          0   
3   604000.0         4       3.00         1960      5000     1.0          0   
4   510000.0         3       2.00         1680      8080     1.0          0   

   view condition grade  ...  yr_renovated  zipcode      lat     long  \
id                       ...                                            
0     0         3     7  ...             0    98178  47.5112 -122.257   
1     0         3     7  ...          1991    98125  47.7210 -122.319   
2     0         3     6  ...             0    98028  47.7379 -122.233   
3     0         5     7  ...             0    98136  47.5208 -122.393   
4     0         3     8  ...             0    98074  47.6168 -122.045   

   sqft_living15  sqft_lot15  HOUR(date)  MONTH(date)  WEEKDAY(date)  \
id                                                                     
0           1340        5650           0           10              0   
1           1690        7639           0           12              1   
2           2720        8062           0            2              2   
3           1360        5000           0           12              1   
4           1800        7503           0            2              2   

   YEAR(date)  
id             
0        2014  
1        2014  
2        2015  
3        2014  
4        2015  

[5 rows x 23 columns]

c:\Mai\mai\.venv\Lib\site-packages\featuretools\synthesis\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created
  warnings.warn(
c:\Mai\mai\.venv\Lib\site-packages\featuretools\synthesis\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:
  agg_primitives: ['count', 'mean', 'mode', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.
  warnings.warn(warning_msg, UnusedPrimitiveWarning)

In [31]:

house.boxplot(column="condition")

Out[31]:

<Axes: >

No description has been provided for this image

In [33]:

house_norm = house.copy()

house_norm["ConditionClip"] = house["condition"].clip(2, 5)

house_norm[house_norm["condition"] < 2][["date", "price", "ConditionClip"]]

Out[33]:

	date	price	ConditionClip
36	20140528T000000	550000.0	2
380	20140916T000000	270000.0	2
397	20140623T000000	365000.0	2
1442	20141107T000000	352950.0	2
1734	20150102T000000	252000.0	2
2223	20150316T000000	535000.0	2
3004	20141231T000000	441000.0	2
3202	20140509T000000	255000.0	2
3975	20150511T000000	210000.0	2
4651	20141002T000000	125000.0	2
7376	20141107T000000	295000.0	2
7636	20150120T000000	190000.0	2
12306	20150128T000000	196000.0	2
12453	20150402T000000	305000.0	2
12668	20140729T000000	227000.0	2
13628	20140716T000000	105500.0	2
13629	20150316T000000	445000.0	2
14987	20141202T000000	432500.0	2
15293	20140506T000000	78000.0	2
15337	20140630T000000	235000.0	2
15371	20150114T000000	658000.0	2
15712	20140724T000000	150000.0	2
16198	20150324T000000	81000.0	2
16893	20141210T000000	125000.0	2
16942	20140611T000000	427000.0	2
17805	20150501T000000	380000.0	2
18332	20140924T000000	130000.0	2
18645	20141216T000000	575000.0	2
18876	20150211T000000	1500000.0	2
19452	20140926T000000	142000.0	2

In [34]:

from scipy.stats.mstats import winsorize

print(house_norm["condition"].quantile(q=0.95))

house_norm["ConditionWinsorize"] = winsorize(
    house_norm["condition"].fillna(house_norm["condition"].mean()), (0.01, 0.05), inplace=False
)

house_norm[house_norm["condition"] < 2][["date", "condition", "ConditionWinsorize"]]

5.0

Out[34]:

	date	condition	ConditionWinsorize
36	20140528T000000	1	3
380	20140916T000000	1	3
397	20140623T000000	1	3
1442	20141107T000000	1	3
1734	20150102T000000	1	3
2223	20150316T000000	1	3
3004	20141231T000000	1	3
3202	20140509T000000	1	3
3975	20150511T000000	1	3
4651	20141002T000000	1	3
7376	20141107T000000	1	3
7636	20150120T000000	1	3
12306	20150128T000000	1	3
12453	20150402T000000	1	3
12668	20140729T000000	1	3
13628	20140716T000000	1	3
13629	20150316T000000	1	3
14987	20141202T000000	1	3
15293	20140506T000000	1	3
15337	20140630T000000	1	3
15371	20150114T000000	1	3
15712	20140724T000000	1	3
16198	20150324T000000	1	3
16893	20141210T000000	1	3
16942	20140611T000000	1	3
17805	20150501T000000	1	3
18332	20140924T000000	1	3
18645	20141216T000000	1	3
18876	20150211T000000	1	3
19452	20140926T000000	1	3

In [35]:

from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))

house_norm["ConditionNorm"] = min_max_scaler.fit_transform(
    house_norm["condition"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionClipNorm"] = min_max_scaler.fit_transform(
    house_norm["ConditionClip"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionWinsorizeNorm"] = min_max_scaler.fit_transform(
    house_norm["ConditionWinsorize"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionWinsorizeNorm2"] = min_max_scaler_2.fit_transform(
    house_norm["ConditionWinsorize"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm[
    [
        "price",
        "condition",
        "ConditionNorm",
        "ConditionClipNorm",
        "ConditionWinsorizeNorm",
        "ConditionWinsorizeNorm2",
    ]
].head(20)

Out[35]:

	price	condition	ConditionNorm	ConditionClipNorm	ConditionWinsorizeNorm	ConditionWinsorizeNorm2
0	221900.0	3	0.50	0.333333	0.0	-1.0
1	538000.0	3	0.50	0.333333	0.0	-1.0
2	180000.0	3	0.50	0.333333	0.0	-1.0
3	604000.0	5	1.00	1.000000	1.0	1.0
4	510000.0	3	0.50	0.333333	0.0	-1.0
5	1225000.0	3	0.50	0.333333	0.0	-1.0
6	257500.0	3	0.50	0.333333	0.0	-1.0
7	291850.0	3	0.50	0.333333	0.0	-1.0
8	229500.0	3	0.50	0.333333	0.0	-1.0
9	323000.0	3	0.50	0.333333	0.0	-1.0
10	662500.0	3	0.50	0.333333	0.0	-1.0
11	468000.0	4	0.75	0.666667	0.5	0.0
12	310000.0	4	0.75	0.666667	0.5	0.0
13	400000.0	4	0.75	0.666667	0.5	0.0
14	530000.0	3	0.50	0.333333	0.0	-1.0
15	650000.0	3	0.50	0.333333	0.0	-1.0
16	395000.0	3	0.50	0.333333	0.0	-1.0
17	485000.0	4	0.75	0.666667	0.5	0.0
18	189000.0	4	0.75	0.666667	0.5	0.0
19	230000.0	4	0.75	0.666667	0.5	0.0

In [36]:

from sklearn import preprocessing

stndart_scaler = preprocessing.StandardScaler()

house_norm["ConditionStand"] = stndart_scaler.fit_transform(
    house_norm["condition"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionClipStand"] = stndart_scaler.fit_transform(
    house_norm["ConditionClip"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm["ConditionWinsorizeStand"] = stndart_scaler.fit_transform(
    house_norm["ConditionWinsorize"].to_numpy().reshape(-1, 1)
).reshape(house_norm["condition"].shape)

house_norm[
    [
        "price",
        "condition",
        "ConditionStand",
        "ConditionClipStand",
        "ConditionWinsorizeStand",
    ]
].head(20)

Out[36]:

	price	condition	ConditionStand	ConditionClipStand	ConditionWinsorizeStand
0	221900.0	3	-0.629187	-0.635310	-0.663482
1	538000.0	3	-0.629187	-0.635310	-0.663482
2	180000.0	3	-0.629187	-0.635310	-0.663482
3	604000.0	5	2.444294	2.457597	2.494726
4	510000.0	3	-0.629187	-0.635310	-0.663482
5	1225000.0	3	-0.629187	-0.635310	-0.663482
6	257500.0	3	-0.629187	-0.635310	-0.663482
7	291850.0	3	-0.629187	-0.635310	-0.663482
8	229500.0	3	-0.629187	-0.635310	-0.663482
9	323000.0	3	-0.629187	-0.635310	-0.663482
10	662500.0	3	-0.629187	-0.635310	-0.663482
11	468000.0	4	0.907554	0.911143	0.915622
12	310000.0	4	0.907554	0.911143	0.915622
13	400000.0	4	0.907554	0.911143	0.915622
14	530000.0	3	-0.629187	-0.635310	-0.663482
15	650000.0	3	-0.629187	-0.635310	-0.663482
16	395000.0	3	-0.629187	-0.635310	-0.663482
17	485000.0	4	0.907554	0.911143	0.915622
18	189000.0	4	0.907554	0.911143	0.915622
19	230000.0	4	0.907554	0.911143	0.915622

148 KiB Raw Permalink Blame History Unescape Escape

148 KiB

Raw Permalink Blame History