MII/lec2e.ipynb
2024-10-24 21:50:12 +04:00

45 KiB
Raw Blame History

Загрузка данных в DataFrame

In [1]:
import pandas as pd

df = pd.read_csv("data/kc_house_data.csv", index_col="id")
df["bathrooms"] = df["bathrooms"].replace({0: None})

df.info()

print(df.shape)

df.head()
<class 'pandas.core.frame.DataFrame'>
Index: 21613 entries, 7129300520 to 1523300157
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           21613 non-null  object 
 1   price          21613 non-null  float64
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21603 non-null  object 
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  int64  
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  21613 non-null  int64  
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   21613 non-null  int64  
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 18  sqft_living15  21613 non-null  int64  
 19  sqft_lot15     21613 non-null  int64  
dtypes: float64(4), int64(14), object(2)
memory usage: 3.5+ MB
(21613, 20)
Out[1]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
id
7129300520 20141013T000000 221900.0 3 1.0 1180 5650 1.0 0 0 3 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 3 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
5631500400 20150225T000000 180000.0 2 1.0 770 10000 1.0 0 0 3 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
2487200875 20141209T000000 604000.0 4 3.0 1960 5000 1.0 0 0 5 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
1954400510 20150218T000000 510000.0 3 2.0 1680 8080 1.0 0 0 3 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [11]:
# Количество пустых значений признаков
print(df.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df.isnull().any())

print()

# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
date              0
price             0
bedrooms          0
bathrooms        10
sqft_living       0
sqft_lot          0
floors            0
waterfront        0
view              0
condition         0
grade             0
sqft_above        0
sqft_basement     0
yr_built          0
yr_renovated      0
zipcode           0
lat               0
long              0
sqft_living15     0
sqft_lot15        0
dtype: int64

date             False
price            False
bedrooms         False
bathrooms         True
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

bathrooms процент пустых значений: %0.05
In [6]:
fillna_df = df.fillna(0)

print(fillna_df.shape)

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df["bathroomsFillNA"] = df["bathrooms"].fillna(0)

# Замена пустых данных на медиану
df["bathroomsFillMedian"] = df["bathrooms"].fillna(df["bathroomsFillNA"].median())

df.tail()
(21613, 20)
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool
C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\316964845.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  fillna_df = df.fillna(0)
C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\316964845.py:8: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df["bathroomsFillNA"] = df["bathrooms"].fillna(0)
C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\316964845.py:11: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df["bathroomsFillMedian"] = df["bathrooms"].fillna(df["bathroomsFillNA"].median())
Out[6]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition ... sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 bathroomsFillNA bathroomsFillMedian
id
263000018 20140521T000000 360000.0 3 2.5 1530 1131 3.0 0 0 3 ... 0 2009 0 98103 47.6993 -122.346 1530 1509 2.50 2.50
6600060120 20150223T000000 400000.0 4 2.5 2310 5813 2.0 0 0 3 ... 0 2014 0 98146 47.5107 -122.362 1830 7200 2.50 2.50
1523300141 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 3 ... 0 2009 0 98144 47.5944 -122.299 1020 2007 0.75 0.75
291310100 20150116T000000 400000.0 3 2.5 1600 2388 2.0 0 0 3 ... 0 2004 0 98027 47.5345 -122.069 1410 1287 2.50 2.50
1523300157 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 3 ... 0 2008 0 98144 47.5941 -122.299 1020 1357 0.75 0.75

5 rows × 22 columns

In [8]:
df["bathroomsCopy"] = df["bathrooms"]

# Замена данных сразу в DataFrame без копирования
df.fillna({"bathroomsCopy": 0}, inplace=True)

df.tail()
C:\Users\ogoro\AppData\Local\Temp\ipykernel_17324\2863363019.py:4: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df.fillna({"bathroomsCopy": 0}, inplace=True)
Out[8]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition ... yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 bathroomsFillNA bathroomsFillMedian bathroomsCopy
id
263000018 20140521T000000 360000.0 3 2.5 1530 1131 3.0 0 0 3 ... 2009 0 98103 47.6993 -122.346 1530 1509 2.50 2.50 2.50
6600060120 20150223T000000 400000.0 4 2.5 2310 5813 2.0 0 0 3 ... 2014 0 98146 47.5107 -122.362 1830 7200 2.50 2.50 2.50
1523300141 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 3 ... 2009 0 98144 47.5944 -122.299 1020 2007 0.75 0.75 0.75
291310100 20150116T000000 400000.0 3 2.5 1600 2388 2.0 0 0 3 ... 2004 0 98027 47.5345 -122.069 1410 1287 2.50 2.50 2.50
1523300157 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 3 ... 2008 0 98144 47.5941 -122.299 1020 1357 0.75 0.75 0.75

5 rows × 23 columns

In [12]:
dropna_df = df.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())
(21603, 20)
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image
In [2]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [3]:
# Вывод распределения количества наблюдений по меткам (классам)
print(df.waterfront.value_counts())

data = df[["waterfront", "price", "yr_built"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data, stratify_colname="waterfront", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.waterfront.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.waterfront.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.waterfront.value_counts())
waterfront
0    21450
1      163
Name: count, dtype: int64
Обучающая выборка:  (12967, 3)
waterfront
0    12869
1       98
Name: count, dtype: int64
Контрольная выборка:  (4323, 3)
waterfront
0    4291
1      32
Name: count, dtype: int64
Тестовая выборка:  (4323, 3)
waterfront
0    4290
1      33
Name: count, dtype: int64
In [5]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.waterfront.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["waterfront"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.waterfront.value_counts())

df_train_adasyn
Обучающая выборка:  (12967, 3)
waterfront
0    12869
1       98
Name: count, dtype: int64
Обучающая выборка после oversampling:  (25720, 3)
waterfront
0    12869
1    12851
Name: count, dtype: int64
Out[5]:
waterfront price yr_built
0 0 5.225000e+05 1977
1 0 2.150000e+05 1967
2 0 3.100000e+05 2000
3 0 2.050000e+05 1954
4 0 5.499500e+05 1996
... ... ... ...
25715 1 1.869458e+06 1984
25716 1 1.856441e+06 1967
25717 1 1.899377e+06 1971
25718 1 1.878892e+06 1985
25719 1 1.904794e+06 1991

25720 rows × 3 columns