MII_Yunusov_Niyaz/notebooks/lec2 copy.ipynb
2024-09-30 23:02:17 +04:00

44 KiB
Raw Permalink Blame History

Загрузка данных в DataFrame

In [13]:
import pandas as pd

df1 = pd.read_csv("../data/kc_house_data.csv", index_col="id")

df1.info()

print(df1.shape)

df1.head()
<class 'pandas.core.frame.DataFrame'>
Index: 21613 entries, 7129300520 to 1523300157
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           21613 non-null  object 
 1   price          21607 non-null  float64
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21613 non-null  float64
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  int64  
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  21613 non-null  int64  
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   21613 non-null  int64  
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 18  sqft_living15  21613 non-null  int64  
 19  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(14), object(1)
memory usage: 3.5+ MB
(21613, 20)
Out[13]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
id
7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 3 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 3 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 3 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 5 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 3 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [14]:
# Количество пустых значений признаков
print(df1.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df1.isnull().any())

print()

# Процент пустых значений признаков
for i in df1.columns:
    null_rate = df1[i].isnull().sum() / len(df1) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
date             0
price            6
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

date             False
price             True
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

price процент пустых значений: %0.03
In [17]:
fillna_df = df1.fillna(0)

print(fillna_df.shape)

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df1["priceFillNa"] = df1["price"].fillna(0)

# Замена пустых данных на медиану
df1["priceFillMedian"] = df1["price"].fillna(df1["priceFillNa"].median())

df1.tail()
(21613, 21)
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
priceFillNa      False
dtype: bool
Out[17]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition ... sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 priceFillNa priceFillMedian
id
263000018 20140521T000000 360000.0 3 2.50 1530 1131 3.0 0 0 3 ... 0 2009 0 98103 47.6993 -122.346 1530 1509 360000.0 360000.0
6600060120 20150223T000000 400000.0 4 2.50 2310 5813 2.0 0 0 3 ... 0 2014 0 98146 47.5107 -122.362 1830 7200 400000.0 400000.0
1523300141 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 3 ... 0 2009 0 98144 47.5944 -122.299 1020 2007 402101.0 402101.0
291310100 20150116T000000 400000.0 3 2.50 1600 2388 2.0 0 0 3 ... 0 2004 0 98027 47.5345 -122.069 1410 1287 400000.0 400000.0
1523300157 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 3 ... 0 2008 0 98144 47.5941 -122.299 1020 1357 325000.0 325000.0

5 rows × 22 columns

In [19]:
df1["PriceCopy"] = df1["price"]

# Замена данных сразу в DataFrame без копирования
df1.fillna({"PriceCopy": 0}, inplace=True)

df1.tail()
Out[19]:
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition ... yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 priceFillNa priceFillMedian PriceCopy
id
263000018 20140521T000000 360000.0 3 2.50 1530 1131 3.0 0 0 3 ... 2009 0 98103 47.6993 -122.346 1530 1509 360000.0 360000.0 360000.0
6600060120 20150223T000000 400000.0 4 2.50 2310 5813 2.0 0 0 3 ... 2014 0 98146 47.5107 -122.362 1830 7200 400000.0 400000.0 400000.0
1523300141 20140623T000000 402101.0 2 0.75 1020 1350 2.0 0 0 3 ... 2009 0 98144 47.5944 -122.299 1020 2007 402101.0 402101.0 402101.0
291310100 20150116T000000 400000.0 3 2.50 1600 2388 2.0 0 0 3 ... 2004 0 98027 47.5345 -122.069 1410 1287 400000.0 400000.0 400000.0
1523300157 20141015T000000 325000.0 2 0.75 1020 1076 2.0 0 0 3 ... 2008 0 98144 47.5941 -122.299 1020 1357 325000.0 325000.0 325000.0

5 rows × 23 columns

Удаление наблюдений с пропусками

In [20]:
dropna_df = df1.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())
(21607, 23)
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
priceFillNa      False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image
In [24]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [27]:
# Вывод распределения количества наблюдений по меткам (классам)
print(df1.waterfront.value_counts())

data = df1[["waterfront", "priceFillMedian", "bedrooms"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data, stratify_colname="waterfront", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.waterfront.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.waterfront.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.waterfront.value_counts())
waterfront
0    21450
1      163
Name: count, dtype: int64
Обучающая выборка:  (12967, 3)
waterfront
0    12869
1       98
Name: count, dtype: int64
Контрольная выборка:  (4323, 3)
waterfront
0    4290
1      33
Name: count, dtype: int64
Тестовая выборка:  (4323, 3)
waterfront
0    4291
1      32
Name: count, dtype: int64
In [28]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.waterfront.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["waterfront"])
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.waterfront.value_counts())

df_train_adasyn
Обучающая выборка:  (12967, 3)
waterfront
0    12869
1       98
Name: count, dtype: int64
Обучающая выборка после oversampling:  (25754, 3)
waterfront
1    12885
0    12869
Name: count, dtype: int64
Out[28]:
waterfront priceFillMedian bedrooms
0 0 360000.000000 4
1 0 488250.000000 4
2 0 699000.000000 4
3 0 405000.000000 2
4 0 343000.000000 2
... ... ... ...
25749 1 528600.011954 2
25750 1 537978.366911 1
25751 1 522391.975863 3
25752 1 537575.496160 2
25753 1 538006.679050 2

25754 rows × 3 columns