39 KiB
Raw Blame History

Загрузка данных в DataFrame

In [2]:
import pandas as pd

df = pd.read_csv("../data/car_price_prediction.csv")
In [3]:
df.head()
Out[3]:
ID Price Levy Manufacturer Model Prod_year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear_box_type Drive_wheels Doors Wheel Color Airbags
0 45654403 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12
1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8
2 45774419 8467 - HONDA FIT 2006 Hatchback No Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2
3 45769185 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0
4 45809263 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4

Получение сведений о пропущенных данных

In [4]:
print(df.isnull().sum())
ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod_year           0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear_box_type       0
Drive_wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64
In [5]:
print(df.isnull().any())
ID                  False
Price               False
Levy                False
Manufacturer        False
Model               False
Prod_year           False
Category            False
Leather interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear_box_type       False
Drive_wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
dtype: bool
In [6]:
print(df["Levy"].unique())
['1399' '1018' '-' '862' '446' '891' '761' '751' '394' '1053' '1055'
 '1079' '810' '2386' '1850' '531' '586' '1249' '2455' '583' '1537' '1288'
 '915' '1750' '707' '1077' '1486' '1091' '650' '382' '1436' '1194' '503'
 '1017' '1104' '639' '629' '919' '781' '530' '640' '765' '777' '779' '934'
 '769' '645' '1185' '1324' '830' '1187' '1111' '760' '642' '1604' '1095'
 '966' '473' '1138' '1811' '988' '917' '1156' '687' '11714' '836' '1347'
 '2866' '1646' '259' '609' '697' '585' '475' '690' '308' '1823' '1361'
 '1273' '924' '584' '2078' '831' '1172' '893' '1872' '1885' '1266' '447'
 '2148' '1730' '730' '289' '502' '333' '1325' '247' '879' '1342' '1327'
 '1598' '1514' '1058' '738' '1935' '481' '1522' '1282' '456' '880' '900'
 '798' '1277' '442' '1051' '790' '1292' '1047' '528' '1211' '1493' '1793'
 '574' '930' '1998' '271' '706' '1481' '1677' '1661' '1286' '1408' '1090'
 '595' '1451' '1267' '993' '1714' '878' '641' '749' '1511' '603' '353'
 '877' '1236' '1141' '397' '784' '1024' '1357' '1301' '770' '922' '1438'
 '753' '607' '1363' '638' '490' '431' '565' '517' '833' '489' '1760' '986'
 '1841' '1620' '1360' '474' '1099' '978' '1624' '1946' '1268' '1307' '696'
 '649' '666' '2151' '551' '800' '971' '1323' '2377' '1845' '1083' '694'
 '463' '419' '345' '1515' '1505' '2056' '1203' '729' '460' '1356' '876'
 '911' '1190' '780' '448' '2410' '1848' '1148' '834' '1275' '1028' '1197'
 '724' '890' '1705' '505' '789' '2959' '518' '461' '1719' '2858' '3156'
 '2225' '2177' '1968' '1888' '1308' '2736' '1103' '557' '2195' '843'
 '1664' '723' '4508' '562' '501' '2018' '1076' '1202' '3301' '691' '1440'
 '1869' '1178' '418' '1820' '1413' '488' '1304' '363' '2108' '521' '1659'
 '87' '1411' '1528' '3292' '7058' '1578' '627' '874' '1996' '1488' '5679'
 '1234' '5603' '400' '889' '3268' '875' '949' '2265' '441' '742' '425'
 '2476' '2971' '614' '1816' '1375' '1405' '2297' '1062' '1113' '420'
 '2469' '658' '1951' '2670' '2578' '1995' '1032' '994' '1011' '2421'
 '1296' '155' '494' '426' '1086' '961' '2236' '1829' '764' '1834' '1054'
 '617' '1529' '2266' '637' '626' '1832' '1016' '2002' '1756' '746' '1285'
 '2690' '1118' '5332' '980' '1807' '970' '1228' '1195' '1132' '1768'
 '1384' '1080' '7063' '1817' '1452' '1975' '1368' '702' '1974' '1781'
 '1036' '944' '663' '364' '1539' '1345' '1680' '2209' '741' '1575' '695'
 '1317' '294' '1525' '424' '997' '1473' '1552' '2819' '2188' '1668' '3057'
 '799' '1502' '2606' '552' '1694' '1759' '1110' '399' '1470' '1174' '5877'
 '1474' '1688' '526' '686' '5908' '1107' '2070' '1468' '1246' '1685' '556'
 '1533' '1917' '1346' '732' '692' '579' '421' '362' '3505' '1855' '2711'
 '1586' '3739' '681' '1708' '2278' '1701' '722' '1482' '928' '827' '832'
 '527' '604' '173' '1341' '3329' '1553' '859' '167' '916' '828' '2082'
 '1176' '1108' '975' '3008' '1516' '2269' '1699' '2073' '1031' '1503'
 '2364' '1030' '1442' '5666' '2715' '1437' '2067' '1426' '2908' '1279'
 '866' '4283' '279' '2658' '3015' '2004' '1391' '4736' '748' '1466' '644'
 '683' '2705' '1297' '731' '1252' '2216' '3141' '3273' '1518' '1723'
 '1588' '972' '682' '1094' '668' '175' '967' '402' '3894' '1960' '1599'
 '2000' '2084' '1621' '714' '1109' '3989' '873' '1572' '1163' '1991'
 '1716' '1673' '2562' '2874' '965' '462' '605' '1948' '1736' '3518' '2054'
 '2467' '1681' '1272' '1205' '750' '2156' '2566' '115' '524' '3184' '676'
 '1678' '612' '328' '955' '1441' '1675' '3965' '2909' '623' '822' '867'
 '3025' '1993' '792' '636' '4057' '3743' '2337' '2570' '2418' '2472'
 '3910' '1662' '2123' '2628' '3208' '2080' '3699' '2913' '864' '2505'
 '870' '7536' '1924' '1671' '1064' '1836' '1866' '4741' '841' '1369'
 '5681' '3112' '1366' '2223' '1198' '1039' '3811' '3571' '1387' '1171'
 '1365' '1531' '1590' '11706' '2308' '4860' '1641' '1045' '1901']
In [7]:
df["Levy"] = df["Levy"].replace({'-' : None})
In [8]:
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: {null_rate:.2f}%")
Levy процент пустых значений: 30.25%

Заполнение пропущенных данных

In [9]:
df.fillna({"Levy": 0}, inplace=True)
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: {null_rate:.2f}%")

Создание выборок данных

In [10]:
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
In [11]:
print(df.Gear_box_type.unique())

data = df[
    [
        "Price",
        "Gear_box_type",
    ]
].copy()
['Automatic' 'Tiptronic' 'Variator' 'Manual']
In [12]:
df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="Gear_box_type",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
)

print("Обучающая выборка: ", df_train.shape)
print(df_train.Gear_box_type.value_counts())

print("Контрольная выборка: ", df_val.shape)
print(df_val.Gear_box_type.value_counts())

print("Тестовая выборка: ", df_test.shape)
print(df_test.Gear_box_type.value_counts())
Обучающая выборка:  (11542, 2)
Gear_box_type
Automatic    8108
Tiptronic    1861
Manual       1125
Variator      448
Name: count, dtype: int64
Контрольная выборка:  (3847, 2)
Gear_box_type
Automatic    2703
Tiptronic     620
Manual        375
Variator      149
Name: count, dtype: int64
Тестовая выборка:  (3848, 2)
Gear_box_type
Automatic    2703
Tiptronic     621
Manual        375
Variator      149
Name: count, dtype: int64

Выборка с избытком (oversampling)

In [13]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

print("Обучающая выборка: ", df_train.shape)
print(df_train.Gear_box_type.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Gear_box_type"])
df_train_adasyn = pd.DataFrame(X_resampled)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.Gear_box_type.value_counts())
Обучающая выборка:  (11542, 2)
Gear_box_type
Automatic    8108
Tiptronic    1861
Manual       1125
Variator      448
Name: count, dtype: int64
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_9996\2277749880.py in ?()
      4 
      5 print("Обучающая выборка: ", df_train.shape)
      6 print(df_train.Gear_box_type.value_counts())
      7 
----> 8 X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Gear_box_type"])
      9 df_train_adasyn = pd.DataFrame(X_resampled)
     10 
     11 print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\imblearn\base.py in ?(self, X, y)
    204         y_resampled : array-like of shape (n_samples_new,)
    205             The corresponding label of `X_resampled`.
    206         """
    207         self._validate_params()
--> 208         return super().fit_resample(X, y)

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\imblearn\base.py in ?(self, X, y)
    102             The corresponding label of `X_resampled`.
    103         """
    104         check_classification_targets(y)
    105         arrays_transformer = ArraysTransformer(X, y)
--> 106         X, y, binarize_y = self._check_X_y(X, y)
    107 
    108         self.sampling_strategy_ = check_sampling_strategy(
    109             self.sampling_strategy, y, self._sampling_type

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\imblearn\base.py in ?(self, X, y, accept_sparse)
    157     def _check_X_y(self, X, y, accept_sparse=None):
    158         if accept_sparse is None:
    159             accept_sparse = ["csr", "csc"]
    160         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
--> 161         X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
    162         return X, y, binarize_y

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py in ?(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    646                 if "estimator" not in check_y_params:
    647                     check_y_params = {**default_check_params, **check_y_params}
    648                 y = check_array(y, input_name="y", **check_y_params)
    649             else:
--> 650                 X, y = check_X_y(X, y, **check_params)
    651             out = X, y
    652 
    653         if not no_val_X and check_params.get("ensure_2d", True):

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\utils\validation.py in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1297         raise ValueError(
   1298             f"{estimator_name} requires y to be passed, but the target y is None"
   1299         )
   1300 
-> 1301     X = check_array(
   1302         X,
   1303         accept_sparse=accept_sparse,
   1304         accept_large_sparse=accept_large_sparse,

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\utils\validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1009                         )
   1010                     array = xp.astype(array, dtype, copy=False)
   1011                 else:
   1012                     array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
-> 1013             except ComplexWarning as complex_warning:
   1014                 raise ValueError(
   1015                     "Complex data not supported\n{}\n".format(array)
   1016                 ) from complex_warning

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\utils\_array_api.py in ?(array, dtype, order, copy, xp, device)
    741         # Use NumPy API to support order
    742         if copy is True:
    743             array = numpy.array(array, order=order, dtype=dtype)
    744         else:
--> 745             array = numpy.asarray(array, order=order, dtype=dtype)
    746 
    747         # At this point array is a NumPy ndarray. We convert it to an array
    748         # container that is consistent with the input's namespace.

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\pandas\core\generic.py in ?(self, dtype, copy)
   2149     def __array__(
   2150         self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
   2151     ) -> np.ndarray:
   2152         values = self._values
-> 2153         arr = np.asarray(values, dtype=dtype)
   2154         if (
   2155             astype_is_view(values.dtype, arr.dtype)
   2156             and using_copy_on_write()

ValueError: could not convert string to float: 'Automatic'