Mii_Kislitsa_Egor_Pibd_33/lec2_car.ipynb

33 KiB
Raw Permalink Blame History

Загрузка данных в DataFrame

In [ ]:
from numpy import nan
import pandas as pd

df = pd.read_csv("data/car_price_prediction.csv", index_col="ID")
df["Leather_interior"] = df["Leather_interior"].replace({"Yes": 1, "No": 0})
df["Levy"] = df["Levy"].replace({"-": None})

df.info()
print(df.shape)
df.head()
Running cells with 'Python 3.9.13' requires the ipykernel package.

Run the following command to install 'ipykernel' into the Python environment. 

Command: 'c:/Users/ogoro/AppData/Local/Programs/Python/Python39/python.exe -m pip install ipykernel -U --user --force-reinstall'

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [46]:
# Количество пустых значений признаков
print(df.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df.isnull().any())

print()

# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
Price                  0
Levy                5819
Manufacturer           0
Model                  0
Prod_year              0
Category               0
Leather_interior       0
Fuel type              0
Engine volume          0
Mileage                0
Cylinders              0
Gear box type          0
Drive wheels           0
Doors                  0
Wheel                  0
Color                  0
Airbags                0
dtype: int64

Price               False
Levy                 True
Manufacturer        False
Model               False
Prod_year           False
Category            False
Leather_interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
dtype: bool

Levy процент пустых значений: %30.25
In [47]:
fillna_df = df.fillna(0)

print(fillna_df.shape)

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df["LevyFillNA"] = df["Levy"].fillna(0)

# Замена пустых данных на медиану
df["LevyFillMedian"] = df["Levy"].fillna(df["LevyFillNA"].median())

df.tail()
(19237, 17)
Price               False
Levy                False
Manufacturer        False
Model               False
Prod_year           False
Category            False
Leather_interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
dtype: bool
Out[47]:
Price Levy Manufacturer Model Prod_year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags LevyFillNA LevyFillMedian
ID
45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe 1 CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5 0 642.0
45778856 15681 831 HYUNDAI Sonata 2011 Sedan 1 Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8 831 831
45804997 26108 836 HYUNDAI Tucson 2010 Jeep 1 Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4 836 836
45793526 5331 1288 CHEVROLET Captiva 2007 Jeep 1 Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4 1288 1288
45813273 470 753 HYUNDAI Sonata 2012 Sedan 1 Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12 753 753
In [32]:
df["LevyCopy"] = df["Levy"]

# Замена данных сразу в DataFrame без копирования
df.fillna({"LevyCopy": 0}, inplace=True)

df.tail()
Out[32]:
Price Levy Manufacturer Model Prod. year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags LevyFillNA LevyFillMedian LevyCopy
ID
45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe 1 CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5 0 642.0 0
45778856 15681 831 HYUNDAI Sonata 2011 Sedan 1 Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8 831 831 831
45804997 26108 836 HYUNDAI Tucson 2010 Jeep 1 Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4 836 836 836
45793526 5331 1288 CHEVROLET Captiva 2007 Jeep 1 Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4 1288 1288 1288
45813273 470 753 HYUNDAI Sonata 2012 Sedan 1 Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12 753 753 753

Удаление наблюдений с пропусками

In [33]:
dropna_df = df.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())
(13418, 20)
Price               False
Levy                False
Manufacturer        False
Model               False
Prod. year          False
Category            False
Leather_interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image
In [41]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [48]:
# Вывод распределения количества наблюдений по меткам (классам)
print(df.Leather_interior.value_counts())

data = df[["Leather_interior", "Price", "Prod_year"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="Leather_interior",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.Leather_interior.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.Leather_interior.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.Leather_interior.value_counts())
Leather_interior
1    13954
0     5283
Name: count, dtype: int64
Обучающая выборка:  (11542, 3)
Leather_interior
1    8372
0    3170
Name: count, dtype: int64
Контрольная выборка:  (3847, 3)
Leather_interior
1    2791
0    1056
Name: count, dtype: int64
Тестовая выборка:  (3848, 3)
Leather_interior
1    2791
0    1057
Name: count, dtype: int64
In [49]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.Leather_interior.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Leather_interior"])
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.Leather_interior.value_counts())

df_train_adasyn
Обучающая выборка:  (11542, 3)
Leather_interior
1    8372
0    3170
Name: count, dtype: int64
Обучающая выборка после oversampling:  (16416, 3)
Leather_interior
1    8372
0    8044
Name: count, dtype: int64
Out[49]:
Leather_interior Price Prod_year
0 0 21400 2008
1 0 16621 2016
2 1 28852 2017
3 1 2430 2008
4 0 7840 2005
... ... ... ...
16411 0 26030 2013
16412 0 26030 2012
16413 0 26030 2014
16414 0 26030 2012
16415 0 26030 2012

16416 rows × 3 columns