MII/lec2_car.ipynb
2024-10-24 21:50:12 +04:00

42 KiB
Raw Permalink Blame History

Загрузка данных в DataFrame

In [6]:
from numpy import nan
import pandas as pd

df = pd.read_csv("data/car_price_prediction.csv", index_col="ID")
df["Leather_interior"] = df["Leather_interior"].replace({"Yes": 1, "No": 0})
df["Levy"] = df["Levy"].replace({"-": None})

df.info()
print(df.shape)
df.head()
<class 'pandas.core.frame.DataFrame'>
Index: 19237 entries, 45654403 to 45813273
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             19237 non-null  int64  
 1   Levy              13418 non-null  object 
 2   Manufacturer      19237 non-null  object 
 3   Model             19237 non-null  object 
 4   Prod_year         19237 non-null  int64  
 5   Category          19237 non-null  object 
 6   Leather_interior  19237 non-null  int64  
 7   Fuel type         19237 non-null  object 
 8   Engine volume     19237 non-null  object 
 9   Mileage           19237 non-null  object 
 10  Cylinders         19237 non-null  float64
 11  Gear box type     19237 non-null  object 
 12  Drive wheels      19237 non-null  object 
 13  Doors             19237 non-null  object 
 14  Wheel             19237 non-null  object 
 15  Color             19237 non-null  object 
 16  Airbags           19237 non-null  int64  
dtypes: float64(1), int64(4), object(12)
memory usage: 2.6+ MB
(19237, 17)
C:\Users\1\AppData\Local\Temp\ipykernel_25288\68381857.py:5: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df["Leather_interior"] = df["Leather_interior"].replace({"Yes": 1, "No": 0})
Out[6]:
Price Levy Manufacturer Model Prod_year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
ID
45654403 13328 1399 LEXUS RX 450 2010 Jeep 1 Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12
44731507 16621 1018 CHEVROLET Equinox 2011 Jeep 0 Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8
45774419 8467 None HONDA FIT 2006 Hatchback 0 Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2
45769185 3607 862 FORD Escape 2011 Jeep 1 Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0
45809263 11726 446 HONDA FIT 2014 Hatchback 1 Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4

Получение сведений о пропущенных данных

Типы пропущенных данных:

  • None - представление пустых данных в Python
  • NaN - представление пустых данных в Pandas
  • '' - пустая строка
In [46]:
# Количество пустых значений признаков
print(df.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df.isnull().any())

print()

# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
Price                  0
Levy                5819
Manufacturer           0
Model                  0
Prod_year              0
Category               0
Leather_interior       0
Fuel type              0
Engine volume          0
Mileage                0
Cylinders              0
Gear box type          0
Drive wheels           0
Doors                  0
Wheel                  0
Color                  0
Airbags                0
dtype: int64

Price               False
Levy                 True
Manufacturer        False
Model               False
Prod_year           False
Category            False
Leather_interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
dtype: bool

Levy процент пустых значений: %30.25
In [47]:
fillna_df = df.fillna(0)

print(fillna_df.shape)

print(fillna_df.isnull().any())

# Замена пустых данных на 0
df["LevyFillNA"] = df["Levy"].fillna(0)

# Замена пустых данных на медиану
df["LevyFillMedian"] = df["Levy"].fillna(df["LevyFillNA"].median())

df.tail()
(19237, 17)
Price               False
Levy                False
Manufacturer        False
Model               False
Prod_year           False
Category            False
Leather_interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
dtype: bool
Out[47]:
Price Levy Manufacturer Model Prod_year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags LevyFillNA LevyFillMedian
ID
45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe 1 CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5 0 642.0
45778856 15681 831 HYUNDAI Sonata 2011 Sedan 1 Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8 831 831
45804997 26108 836 HYUNDAI Tucson 2010 Jeep 1 Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4 836 836
45793526 5331 1288 CHEVROLET Captiva 2007 Jeep 1 Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4 1288 1288
45813273 470 753 HYUNDAI Sonata 2012 Sedan 1 Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12 753 753
In [32]:
df["LevyCopy"] = df["Levy"]

# Замена данных сразу в DataFrame без копирования
df.fillna({"LevyCopy": 0}, inplace=True)

df.tail()
Out[32]:
Price Levy Manufacturer Model Prod. year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags LevyFillNA LevyFillMedian LevyCopy
ID
45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe 1 CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5 0 642.0 0
45778856 15681 831 HYUNDAI Sonata 2011 Sedan 1 Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8 831 831 831
45804997 26108 836 HYUNDAI Tucson 2010 Jeep 1 Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4 836 836 836
45793526 5331 1288 CHEVROLET Captiva 2007 Jeep 1 Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4 1288 1288 1288
45813273 470 753 HYUNDAI Sonata 2012 Sedan 1 Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12 753 753 753

Удаление наблюдений с пропусками

In [33]:
dropna_df = df.dropna()

print(dropna_df.shape)

print(fillna_df.isnull().any())
(13418, 20)
Price               False
Levy                False
Manufacturer        False
Model               False
Prod. year          False
Category            False
Leather_interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
dtype: bool

Создание выборок данных

Библиотека scikit-learn

https://scikit-learn.org/stable/index.html

No description has been provided for this image
In [7]:
# Функция для создания выборок
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [8]:
# Вывод распределения количества наблюдений по меткам (классам)
print(df.Leather_interior.value_counts())

data = df[["Leather_interior", "Price", "Prod_year"]].copy()

df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="Leather_interior",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.Leather_interior.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.Leather_interior.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.Leather_interior.value_counts())
Leather_interior
1    13954
0     5283
Name: count, dtype: int64
Обучающая выборка:  (11542, 3)
Leather_interior
1    8372
0    3170
Name: count, dtype: int64
Контрольная выборка:  (3847, 3)
Leather_interior
1    2791
0    1056
Name: count, dtype: int64
Тестовая выборка:  (3848, 3)
Leather_interior
1    2791
0    1057
Name: count, dtype: int64
In [9]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.Leather_interior.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Leather_interior"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.Leather_interior.value_counts())

df_train_adasyn
Обучающая выборка:  (11542, 3)
Leather_interior
1    8372
0    3170
Name: count, dtype: int64
Обучающая выборка после oversampling:  (16453, 3)
Leather_interior
1    8372
0    8081
Name: count, dtype: int64
Out[9]:
Leather_interior Price Prod_year
0 1 22621 2011
1 1 35850 2006
2 1 20385 2012
3 1 4547 2009
4 1 19416 2013
... ... ... ...
16448 0 31361 2015
16449 0 31361 2015
16450 0 31361 2015
16451 0 11172 2009
16452 0 11133 2006

16453 rows × 3 columns