{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных в DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mRunning cells with 'Python 3.9.13' requires the ipykernel package.\n",
"\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
"\u001b[1;31mCommand: 'c:/Users/ogoro/AppData/Local/Programs/Python/Python39/python.exe -m pip install ipykernel -U --user --force-reinstall'"
]
}
],
"source": [
"from numpy import nan\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n",
"df[\"Leather_interior\"] = df[\"Leather_interior\"].replace({\"Yes\": 1, \"No\": 0})\n",
"df[\"Levy\"] = df[\"Levy\"].replace({\"-\": None})\n",
"\n",
"df.info()\n",
"print(df.shape)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получение сведений о пропущенных данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Типы пропущенных данных:\n",
"- None - представление пустых данных в Python\n",
"- NaN - представление пустых данных в Pandas\n",
"- '' - пустая строка"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Price 0\n",
"Levy 5819\n",
"Manufacturer 0\n",
"Model 0\n",
"Prod_year 0\n",
"Category 0\n",
"Leather_interior 0\n",
"Fuel type 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear box type 0\n",
"Drive wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"dtype: int64\n",
"\n",
"Price False\n",
"Levy True\n",
"Manufacturer False\n",
"Model False\n",
"Prod_year False\n",
"Category False\n",
"Leather_interior False\n",
"Fuel type False\n",
"Engine volume False\n",
"Mileage False\n",
"Cylinders False\n",
"Gear box type False\n",
"Drive wheels False\n",
"Doors False\n",
"Wheel False\n",
"Color False\n",
"Airbags False\n",
"dtype: bool\n",
"\n",
"Levy процент пустых значений: %30.25\n"
]
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заполнение пропущенных данных\n",
"\n",
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
"\n",
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(19237, 17)\n",
"Price False\n",
"Levy False\n",
"Manufacturer False\n",
"Model False\n",
"Prod_year False\n",
"Category False\n",
"Leather_interior False\n",
"Fuel type False\n",
"Engine volume False\n",
"Mileage False\n",
"Cylinders False\n",
"Gear box type False\n",
"Drive wheels False\n",
"Doors False\n",
"Wheel False\n",
"Color False\n",
"Airbags False\n",
"dtype: bool\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Price | \n",
" Levy | \n",
" Manufacturer | \n",
" Model | \n",
" Prod_year | \n",
" Category | \n",
" Leather_interior | \n",
" Fuel type | \n",
" Engine volume | \n",
" Mileage | \n",
" Cylinders | \n",
" Gear box type | \n",
" Drive wheels | \n",
" Doors | \n",
" Wheel | \n",
" Color | \n",
" Airbags | \n",
" LevyFillNA | \n",
" LevyFillMedian | \n",
"
\n",
" \n",
" ID | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 45798355 | \n",
" 8467 | \n",
" None | \n",
" MERCEDES-BENZ | \n",
" CLK 200 | \n",
" 1999 | \n",
" Coupe | \n",
" 1 | \n",
" CNG | \n",
" 2.0 Turbo | \n",
" 300000 km | \n",
" 4.0 | \n",
" Manual | \n",
" Rear | \n",
" 02-Mar | \n",
" Left wheel | \n",
" Silver | \n",
" 5 | \n",
" 0 | \n",
" 642.0 | \n",
"
\n",
" \n",
" 45778856 | \n",
" 15681 | \n",
" 831 | \n",
" HYUNDAI | \n",
" Sonata | \n",
" 2011 | \n",
" Sedan | \n",
" 1 | \n",
" Petrol | \n",
" 2.4 | \n",
" 161600 km | \n",
" 4.0 | \n",
" Tiptronic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" Red | \n",
" 8 | \n",
" 831 | \n",
" 831 | \n",
"
\n",
" \n",
" 45804997 | \n",
" 26108 | \n",
" 836 | \n",
" HYUNDAI | \n",
" Tucson | \n",
" 2010 | \n",
" Jeep | \n",
" 1 | \n",
" Diesel | \n",
" 2 | \n",
" 116365 km | \n",
" 4.0 | \n",
" Automatic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" Grey | \n",
" 4 | \n",
" 836 | \n",
" 836 | \n",
"
\n",
" \n",
" 45793526 | \n",
" 5331 | \n",
" 1288 | \n",
" CHEVROLET | \n",
" Captiva | \n",
" 2007 | \n",
" Jeep | \n",
" 1 | \n",
" Diesel | \n",
" 2 | \n",
" 51258 km | \n",
" 4.0 | \n",
" Automatic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" Black | \n",
" 4 | \n",
" 1288 | \n",
" 1288 | \n",
"
\n",
" \n",
" 45813273 | \n",
" 470 | \n",
" 753 | \n",
" HYUNDAI | \n",
" Sonata | \n",
" 2012 | \n",
" Sedan | \n",
" 1 | \n",
" Hybrid | \n",
" 2.4 | \n",
" 186923 km | \n",
" 4.0 | \n",
" Automatic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" White | \n",
" 12 | \n",
" 753 | \n",
" 753 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Price Levy Manufacturer Model Prod_year Category \\\n",
"ID \n",
"45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe \n",
"45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
"45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
"45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
"45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
"\n",
" Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
"ID \n",
"45798355 1 CNG 2.0 Turbo 300000 km 4.0 \n",
"45778856 1 Petrol 2.4 161600 km 4.0 \n",
"45804997 1 Diesel 2 116365 km 4.0 \n",
"45793526 1 Diesel 2 51258 km 4.0 \n",
"45813273 1 Hybrid 2.4 186923 km 4.0 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
"ID \n",
"45798355 Manual Rear 02-Mar Left wheel Silver 5 \n",
"45778856 Tiptronic Front 04-May Left wheel Red 8 \n",
"45804997 Automatic Front 04-May Left wheel Grey 4 \n",
"45793526 Automatic Front 04-May Left wheel Black 4 \n",
"45813273 Automatic Front 04-May Left wheel White 12 \n",
"\n",
" LevyFillNA LevyFillMedian \n",
"ID \n",
"45798355 0 642.0 \n",
"45778856 831 831 \n",
"45804997 836 836 \n",
"45793526 1288 1288 \n",
"45813273 753 753 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fillna_df = df.fillna(0)\n",
"\n",
"print(fillna_df.shape)\n",
"\n",
"print(fillna_df.isnull().any())\n",
"\n",
"# Замена пустых данных на 0\n",
"df[\"LevyFillNA\"] = df[\"Levy\"].fillna(0)\n",
"\n",
"# Замена пустых данных на медиану\n",
"df[\"LevyFillMedian\"] = df[\"Levy\"].fillna(df[\"LevyFillNA\"].median())\n",
"\n",
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Price | \n",
" Levy | \n",
" Manufacturer | \n",
" Model | \n",
" Prod. year | \n",
" Category | \n",
" Leather_interior | \n",
" Fuel type | \n",
" Engine volume | \n",
" Mileage | \n",
" Cylinders | \n",
" Gear box type | \n",
" Drive wheels | \n",
" Doors | \n",
" Wheel | \n",
" Color | \n",
" Airbags | \n",
" LevyFillNA | \n",
" LevyFillMedian | \n",
" LevyCopy | \n",
"
\n",
" \n",
" ID | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 45798355 | \n",
" 8467 | \n",
" None | \n",
" MERCEDES-BENZ | \n",
" CLK 200 | \n",
" 1999 | \n",
" Coupe | \n",
" 1 | \n",
" CNG | \n",
" 2.0 Turbo | \n",
" 300000 km | \n",
" 4.0 | \n",
" Manual | \n",
" Rear | \n",
" 02-Mar | \n",
" Left wheel | \n",
" Silver | \n",
" 5 | \n",
" 0 | \n",
" 642.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 45778856 | \n",
" 15681 | \n",
" 831 | \n",
" HYUNDAI | \n",
" Sonata | \n",
" 2011 | \n",
" Sedan | \n",
" 1 | \n",
" Petrol | \n",
" 2.4 | \n",
" 161600 km | \n",
" 4.0 | \n",
" Tiptronic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" Red | \n",
" 8 | \n",
" 831 | \n",
" 831 | \n",
" 831 | \n",
"
\n",
" \n",
" 45804997 | \n",
" 26108 | \n",
" 836 | \n",
" HYUNDAI | \n",
" Tucson | \n",
" 2010 | \n",
" Jeep | \n",
" 1 | \n",
" Diesel | \n",
" 2 | \n",
" 116365 km | \n",
" 4.0 | \n",
" Automatic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" Grey | \n",
" 4 | \n",
" 836 | \n",
" 836 | \n",
" 836 | \n",
"
\n",
" \n",
" 45793526 | \n",
" 5331 | \n",
" 1288 | \n",
" CHEVROLET | \n",
" Captiva | \n",
" 2007 | \n",
" Jeep | \n",
" 1 | \n",
" Diesel | \n",
" 2 | \n",
" 51258 km | \n",
" 4.0 | \n",
" Automatic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" Black | \n",
" 4 | \n",
" 1288 | \n",
" 1288 | \n",
" 1288 | \n",
"
\n",
" \n",
" 45813273 | \n",
" 470 | \n",
" 753 | \n",
" HYUNDAI | \n",
" Sonata | \n",
" 2012 | \n",
" Sedan | \n",
" 1 | \n",
" Hybrid | \n",
" 2.4 | \n",
" 186923 km | \n",
" 4.0 | \n",
" Automatic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" White | \n",
" 12 | \n",
" 753 | \n",
" 753 | \n",
" 753 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Price Levy Manufacturer Model Prod. year Category \\\n",
"ID \n",
"45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe \n",
"45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
"45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
"45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
"45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
"\n",
" Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
"ID \n",
"45798355 1 CNG 2.0 Turbo 300000 km 4.0 \n",
"45778856 1 Petrol 2.4 161600 km 4.0 \n",
"45804997 1 Diesel 2 116365 km 4.0 \n",
"45793526 1 Diesel 2 51258 km 4.0 \n",
"45813273 1 Hybrid 2.4 186923 km 4.0 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
"ID \n",
"45798355 Manual Rear 02-Mar Left wheel Silver 5 \n",
"45778856 Tiptronic Front 04-May Left wheel Red 8 \n",
"45804997 Automatic Front 04-May Left wheel Grey 4 \n",
"45793526 Automatic Front 04-May Left wheel Black 4 \n",
"45813273 Automatic Front 04-May Left wheel White 12 \n",
"\n",
" LevyFillNA LevyFillMedian LevyCopy \n",
"ID \n",
"45798355 0 642.0 0 \n",
"45778856 831 831 831 \n",
"45804997 836 836 836 \n",
"45793526 1288 1288 1288 \n",
"45813273 753 753 753 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"LevyCopy\"] = df[\"Levy\"]\n",
"\n",
"# Замена данных сразу в DataFrame без копирования\n",
"df.fillna({\"LevyCopy\": 0}, inplace=True)\n",
"\n",
"df.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление наблюдений с пропусками"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(13418, 20)\n",
"Price False\n",
"Levy False\n",
"Manufacturer False\n",
"Model False\n",
"Prod. year False\n",
"Category False\n",
"Leather_interior False\n",
"Fuel type False\n",
"Engine volume False\n",
"Mileage False\n",
"Cylinders False\n",
"Gear box type False\n",
"Drive wheels False\n",
"Doors False\n",
"Wheel False\n",
"Color False\n",
"Airbags False\n",
"dtype: bool\n"
]
}
],
"source": [
"dropna_df = df.dropna()\n",
"\n",
"print(dropna_df.shape)\n",
"\n",
"print(fillna_df.isnull().any())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок данных\n",
"\n",
"Библиотека scikit-learn\n",
"\n",
"https://scikit-learn.org/stable/index.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Функция для создания выборок\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Leather_interior\n",
"1 13954\n",
"0 5283\n",
"Name: count, dtype: int64\n",
"Обучающая выборка: (11542, 3)\n",
"Leather_interior\n",
"1 8372\n",
"0 3170\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (3847, 3)\n",
"Leather_interior\n",
"1 2791\n",
"0 1056\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (3848, 3)\n",
"Leather_interior\n",
"1 2791\n",
"0 1057\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Вывод распределения количества наблюдений по меткам (классам)\n",
"print(df.Leather_interior.value_counts())\n",
"\n",
"data = df[[\"Leather_interior\", \"Price\", \"Prod_year\"]].copy()\n",
"\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
" data,\n",
" stratify_colname=\"Leather_interior\",\n",
" frac_train=0.60,\n",
" frac_val=0.20,\n",
" frac_test=0.20,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Leather_interior.value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_val.shape)\n",
"print(df_val.Leather_interior.value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_test.shape)\n",
"print(df_test.Leather_interior.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборка с избытком (oversampling)\n",
"\n",
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
"\n",
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
"\n",
"Выборка с недостатком (undersampling)\n",
"\n",
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
"\n",
"Библиотека imbalanced-learn\n",
"\n",
"https://imbalanced-learn.org/stable/"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (11542, 3)\n",
"Leather_interior\n",
"1 8372\n",
"0 3170\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (16416, 3)\n",
"Leather_interior\n",
"1 8372\n",
"0 8044\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Leather_interior | \n",
" Price | \n",
" Prod_year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 21400 | \n",
" 2008 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 16621 | \n",
" 2016 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 28852 | \n",
" 2017 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 2430 | \n",
" 2008 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 7840 | \n",
" 2005 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 16411 | \n",
" 0 | \n",
" 26030 | \n",
" 2013 | \n",
"
\n",
" \n",
" 16412 | \n",
" 0 | \n",
" 26030 | \n",
" 2012 | \n",
"
\n",
" \n",
" 16413 | \n",
" 0 | \n",
" 26030 | \n",
" 2014 | \n",
"
\n",
" \n",
" 16414 | \n",
" 0 | \n",
" 26030 | \n",
" 2012 | \n",
"
\n",
" \n",
" 16415 | \n",
" 0 | \n",
" 26030 | \n",
" 2012 | \n",
"
\n",
" \n",
"
\n",
"
16416 rows × 3 columns
\n",
"
"
],
"text/plain": [
" Leather_interior Price Prod_year\n",
"0 0 21400 2008\n",
"1 0 16621 2016\n",
"2 1 28852 2017\n",
"3 1 2430 2008\n",
"4 0 7840 2005\n",
"... ... ... ...\n",
"16411 0 26030 2013\n",
"16412 0 26030 2012\n",
"16413 0 26030 2014\n",
"16414 0 26030 2012\n",
"16415 0 26030 2012\n",
"\n",
"[16416 rows x 3 columns]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Leather_interior.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Leather_interior\"])\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.Leather_interior.value_counts())\n",
"\n",
"df_train_adasyn"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}