Mii_Kislitsa_Egor_Pibd_33/lec2.ipynb at Lab_2

2024-09-28 13:38:06 +04:00

45 KiB

Raw Permalink Blame History

Загрузка данных в DataFrame

In [10]:

import pandas as pd

df = pd.read_csv("data/kc_house_data.csv", index_col="id")
df["bathrooms"] = df["bathrooms"].replace({0: None})

df.info()

print(df.shape)

df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 21613 entries, 7129300520 to 1523300157
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           21613 non-null  object 
 1   price          21613 non-null  float64
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21603 non-null  object 
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  int64  
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  21613 non-null  int64  
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   21613 non-null  int64  
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 18  sqft_living15  21613 non-null  int64  
 19  sqft_lot15     21613 non-null  int64  
dtypes: float64(4), int64(14), object(2)
memory usage: 3.5+ MB
(21613, 20)

Out[10]:

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
id
7129300520	20141013T000000	221900.0	3	1.0	1180	5650	1.0	0	0	3	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	0	0	3	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
5631500400	20150225T000000	180000.0	2	1.0	770	10000	1.0	0	0	3	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
2487200875	20141209T000000	604000.0	4	3.0	1960	5000	1.0	0	0	5	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
1954400510	20150218T000000	510000.0	3	2.0	1680	8080	1.0	0	0	3	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

Получение сведений о пропущенных данных

Типы пропущенных данных:

None - представление пустых данных в Python
NaN - представление пустых данных в Pandas
'' - пустая строка

In [11]:

# Количество пустых значений признаков
print(df.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df.isnull().any())

print()

# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")

date              0
price             0
bedrooms          0
bathrooms        10
sqft_living       0
sqft_lot          0
floors            0
waterfront        0
view              0
condition         0
grade             0
sqft_above        0
sqft_basement     0
yr_built          0
yr_renovated      0
zipcode           0
lat               0
long              0
sqft_living15     0
sqft_lot15        0
dtype: int64

date             False
price            False
bedrooms         False
bathrooms         True
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

bathrooms процент пустых значений: %0.05

In [6]:

fillna_df = df.fillna(0)

print(fillna_df.shape)

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df["bathroomsFillNA"] = df["bathrooms"].fillna(0)

# Замена пустых данных на медиану
df["bathroomsFillMedian"] = df["bathrooms"].fillna(df["bathroomsFillNA"].median())

df.tail()

(21613, 20)
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\316964845.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  fillna_df = df.fillna(0)
C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\316964845.py:8: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df["bathroomsFillNA"] = df["bathrooms"].fillna(0)
C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\316964845.py:11: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df["bathroomsFillMedian"] = df["bathrooms"].fillna(df["bathroomsFillNA"].median())

Out[6]:

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	...	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15	bathroomsFillNA	bathroomsFillMedian
id
263000018	20140521T000000	360000.0	3	2.5	1530	1131	3.0	0	0	3	...	0	2009	0	98103	47.6993	-122.346	1530	1509	2.50	2.50
6600060120	20150223T000000	400000.0	4	2.5	2310	5813	2.0	0	0	3	...	0	2014	0	98146	47.5107	-122.362	1830	7200	2.50	2.50
1523300141	20140623T000000	402101.0	2	0.75	1020	1350	2.0	0	0	3	...	0	2009	0	98144	47.5944	-122.299	1020	2007	0.75	0.75
291310100	20150116T000000	400000.0	3	2.5	1600	2388	2.0	0	0	3	...	0	2004	0	98027	47.5345	-122.069	1410	1287	2.50	2.50
1523300157	20141015T000000	325000.0	2	0.75	1020	1076	2.0	0	0	3	...	0	2008	0	98144	47.5941	-122.299	1020	1357	0.75	0.75

5 rows × 22 columns

In [8]:

df["bathroomsCopy"] = df["bathrooms"]

# Замена данных сразу в DataFrame без копирования
df.fillna({"bathroomsCopy": 0}, inplace=True)

df.tail()

C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\2863363019.py:4: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df.fillna({"bathroomsCopy": 0}, inplace=True)

Out[8]:

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	...	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15	bathroomsFillNA	bathroomsFillMedian	bathroomsCopy
id
263000018	20140521T000000	360000.0	3	2.5	1530	1131	3.0	0	0	3	...	2009	0	98103	47.6993	-122.346	1530	1509	2.50	2.50	2.50
6600060120	20150223T000000	400000.0	4	2.5	2310	5813	2.0	0	0	3	...	2014	0	98146	47.5107	-122.362	1830	7200	2.50	2.50	2.50
1523300141	20140623T000000	402101.0	2	0.75	1020	1350	2.0	0	0	3	...	2009	0	98144	47.5944	-122.299	1020	2007	0.75	0.75	0.75
291310100	20150116T000000	400000.0	3	2.5	1600	2388	2.0	0	0	3	...	2004	0	98027	47.5345	-122.069	1410	1287	2.50	2.50	2.50
1523300157	20141015T000000	325000.0	2	0.75	1020	1076	2.0	0	0	3	...	2008	0	98144	47.5941	-122.299	1020	1357	0.75	0.75	0.75

5 rows × 23 columns

In [12]:

dropna_df = df.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())

(21603, 20)
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image

In [14]:

# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test

In [15]:

# Вывод распределения количества наблюдений по меткам (классам)
print(df.waterfront.value_counts())

data = df[["waterfront", "price", "yr_built"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data, stratify_colname="waterfront", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.waterfront.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.waterfront.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.waterfront.value_counts())

waterfront
0    21450
1      163
Name: count, dtype: int64
Обучающая выборка:  (12967, 3)
waterfront
0    12869
1       98
Name: count, dtype: int64
Контрольная выборка:  (4323, 3)
waterfront
0    4290
1      33
Name: count, dtype: int64
Тестовая выборка:  (4323, 3)
waterfront
0    4291
1      32
Name: count, dtype: int64

Выборка с избытком (oversampling)

https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/

https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/

Выборка с недостатком (undersampling)

https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

Библиотека imbalanced-learn

https://imbalanced-learn.org/stable/

In [16]:

from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.waterfront.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["waterfront"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.waterfront.value_counts())

df_train_adasyn

Обучающая выборка:  (12967, 3)
waterfront
0    12869
1       98
Name: count, dtype: int64
Обучающая выборка после oversampling:  (25712, 3)
waterfront
0    12869
1    12843
Name: count, dtype: int64

Out[16]:

	waterfront	price	yr_built
0	0	2.710000e+05	1977
1	0	6.390000e+05	1939
2	0	3.300000e+05	1997
3	0	3.849500e+05	2000
4	0	9.300000e+05	2009
...	...	...	...
25707	1	2.930485e+06	1930
25708	1	2.909679e+06	1924
25709	1	2.822863e+06	1934
25710	1	3.017895e+06	1947
25711	1	2.970980e+06	1938

25712 rows × 3 columns

45 KiB Raw Permalink Blame History Unescape Escape

45 KiB

Raw Permalink Blame History