1263 lines
42 KiB
Plaintext
1263 lines
42 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Загрузка данных в DataFrame"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 19237 entries, 45654403 to 45813273\n",
|
||
"Data columns (total 17 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Price 19237 non-null int64 \n",
|
||
" 1 Levy 13418 non-null object \n",
|
||
" 2 Manufacturer 19237 non-null object \n",
|
||
" 3 Model 19237 non-null object \n",
|
||
" 4 Prod_year 19237 non-null int64 \n",
|
||
" 5 Category 19237 non-null object \n",
|
||
" 6 Leather_interior 19237 non-null int64 \n",
|
||
" 7 Fuel type 19237 non-null object \n",
|
||
" 8 Engine volume 19237 non-null object \n",
|
||
" 9 Mileage 19237 non-null object \n",
|
||
" 10 Cylinders 19237 non-null float64\n",
|
||
" 11 Gear box type 19237 non-null object \n",
|
||
" 12 Drive wheels 19237 non-null object \n",
|
||
" 13 Doors 19237 non-null object \n",
|
||
" 14 Wheel 19237 non-null object \n",
|
||
" 15 Color 19237 non-null object \n",
|
||
" 16 Airbags 19237 non-null int64 \n",
|
||
"dtypes: float64(1), int64(4), object(12)\n",
|
||
"memory usage: 2.6+ MB\n",
|
||
"(19237, 17)\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\1\\AppData\\Local\\Temp\\ipykernel_25288\\68381857.py:5: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
||
" df[\"Leather_interior\"] = df[\"Leather_interior\"].replace({\"Yes\": 1, \"No\": 0})\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>Levy</th>\n",
|
||
" <th>Manufacturer</th>\n",
|
||
" <th>Model</th>\n",
|
||
" <th>Prod_year</th>\n",
|
||
" <th>Category</th>\n",
|
||
" <th>Leather_interior</th>\n",
|
||
" <th>Fuel type</th>\n",
|
||
" <th>Engine volume</th>\n",
|
||
" <th>Mileage</th>\n",
|
||
" <th>Cylinders</th>\n",
|
||
" <th>Gear box type</th>\n",
|
||
" <th>Drive wheels</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Wheel</th>\n",
|
||
" <th>Color</th>\n",
|
||
" <th>Airbags</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ID</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>45654403</th>\n",
|
||
" <td>13328</td>\n",
|
||
" <td>1399</td>\n",
|
||
" <td>LEXUS</td>\n",
|
||
" <td>RX 450</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Hybrid</td>\n",
|
||
" <td>3.5</td>\n",
|
||
" <td>186005 km</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>4x4</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Silver</td>\n",
|
||
" <td>12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>44731507</th>\n",
|
||
" <td>16621</td>\n",
|
||
" <td>1018</td>\n",
|
||
" <td>CHEVROLET</td>\n",
|
||
" <td>Equinox</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>192000 km</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>Tiptronic</td>\n",
|
||
" <td>4x4</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>8</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45774419</th>\n",
|
||
" <td>8467</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>HONDA</td>\n",
|
||
" <td>FIT</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>Hatchback</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>1.3</td>\n",
|
||
" <td>200000 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Variator</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Right-hand drive</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45769185</th>\n",
|
||
" <td>3607</td>\n",
|
||
" <td>862</td>\n",
|
||
" <td>FORD</td>\n",
|
||
" <td>Escape</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Hybrid</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>168966 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>4x4</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45809263</th>\n",
|
||
" <td>11726</td>\n",
|
||
" <td>446</td>\n",
|
||
" <td>HONDA</td>\n",
|
||
" <td>FIT</td>\n",
|
||
" <td>2014</td>\n",
|
||
" <td>Hatchback</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>1.3</td>\n",
|
||
" <td>91901 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Silver</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Price Levy Manufacturer Model Prod_year Category \\\n",
|
||
"ID \n",
|
||
"45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
|
||
"44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
|
||
"45774419 8467 None HONDA FIT 2006 Hatchback \n",
|
||
"45769185 3607 862 FORD Escape 2011 Jeep \n",
|
||
"45809263 11726 446 HONDA FIT 2014 Hatchback \n",
|
||
"\n",
|
||
" Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
|
||
"ID \n",
|
||
"45654403 1 Hybrid 3.5 186005 km 6.0 \n",
|
||
"44731507 0 Petrol 3 192000 km 6.0 \n",
|
||
"45774419 0 Petrol 1.3 200000 km 4.0 \n",
|
||
"45769185 1 Hybrid 2.5 168966 km 4.0 \n",
|
||
"45809263 1 Petrol 1.3 91901 km 4.0 \n",
|
||
"\n",
|
||
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
|
||
"ID \n",
|
||
"45654403 Automatic 4x4 04-May Left wheel Silver 12 \n",
|
||
"44731507 Tiptronic 4x4 04-May Left wheel Black 8 \n",
|
||
"45774419 Variator Front 04-May Right-hand drive Black 2 \n",
|
||
"45769185 Automatic 4x4 04-May Left wheel White 0 \n",
|
||
"45809263 Automatic Front 04-May Left wheel Silver 4 "
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from numpy import nan\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n",
|
||
"df[\"Leather_interior\"] = df[\"Leather_interior\"].map({\"Yes\": 1, \"No\": 0})\n",
|
||
"df[\"Levy\"] = df[\"Levy\"].replace({\"-\": None})\n",
|
||
"\n",
|
||
"df.info()\n",
|
||
"print(df.shape)\n",
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Получение сведений о пропущенных данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Типы пропущенных данных:\n",
|
||
"- None - представление пустых данных в Python\n",
|
||
"- NaN - представление пустых данных в Pandas\n",
|
||
"- '' - пустая строка"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 46,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Price 0\n",
|
||
"Levy 5819\n",
|
||
"Manufacturer 0\n",
|
||
"Model 0\n",
|
||
"Prod_year 0\n",
|
||
"Category 0\n",
|
||
"Leather_interior 0\n",
|
||
"Fuel type 0\n",
|
||
"Engine volume 0\n",
|
||
"Mileage 0\n",
|
||
"Cylinders 0\n",
|
||
"Gear box type 0\n",
|
||
"Drive wheels 0\n",
|
||
"Doors 0\n",
|
||
"Wheel 0\n",
|
||
"Color 0\n",
|
||
"Airbags 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"Price False\n",
|
||
"Levy True\n",
|
||
"Manufacturer False\n",
|
||
"Model False\n",
|
||
"Prod_year False\n",
|
||
"Category False\n",
|
||
"Leather_interior False\n",
|
||
"Fuel type False\n",
|
||
"Engine volume False\n",
|
||
"Mileage False\n",
|
||
"Cylinders False\n",
|
||
"Gear box type False\n",
|
||
"Drive wheels False\n",
|
||
"Doors False\n",
|
||
"Wheel False\n",
|
||
"Color False\n",
|
||
"Airbags False\n",
|
||
"dtype: bool\n",
|
||
"\n",
|
||
"Levy процент пустых значений: %30.25\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Количество пустых значений признаков\n",
|
||
"print(df.isnull().sum())\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Есть ли пустые значения признаков\n",
|
||
"print(df.isnull().any())\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Процент пустых значений признаков\n",
|
||
"for i in df.columns:\n",
|
||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||
" if null_rate > 0:\n",
|
||
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Заполнение пропущенных данных\n",
|
||
"\n",
|
||
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
||
"\n",
|
||
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 47,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(19237, 17)\n",
|
||
"Price False\n",
|
||
"Levy False\n",
|
||
"Manufacturer False\n",
|
||
"Model False\n",
|
||
"Prod_year False\n",
|
||
"Category False\n",
|
||
"Leather_interior False\n",
|
||
"Fuel type False\n",
|
||
"Engine volume False\n",
|
||
"Mileage False\n",
|
||
"Cylinders False\n",
|
||
"Gear box type False\n",
|
||
"Drive wheels False\n",
|
||
"Doors False\n",
|
||
"Wheel False\n",
|
||
"Color False\n",
|
||
"Airbags False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>Levy</th>\n",
|
||
" <th>Manufacturer</th>\n",
|
||
" <th>Model</th>\n",
|
||
" <th>Prod_year</th>\n",
|
||
" <th>Category</th>\n",
|
||
" <th>Leather_interior</th>\n",
|
||
" <th>Fuel type</th>\n",
|
||
" <th>Engine volume</th>\n",
|
||
" <th>Mileage</th>\n",
|
||
" <th>Cylinders</th>\n",
|
||
" <th>Gear box type</th>\n",
|
||
" <th>Drive wheels</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Wheel</th>\n",
|
||
" <th>Color</th>\n",
|
||
" <th>Airbags</th>\n",
|
||
" <th>LevyFillNA</th>\n",
|
||
" <th>LevyFillMedian</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ID</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>45798355</th>\n",
|
||
" <td>8467</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>MERCEDES-BENZ</td>\n",
|
||
" <td>CLK 200</td>\n",
|
||
" <td>1999</td>\n",
|
||
" <td>Coupe</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>CNG</td>\n",
|
||
" <td>2.0 Turbo</td>\n",
|
||
" <td>300000 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Manual</td>\n",
|
||
" <td>Rear</td>\n",
|
||
" <td>02-Mar</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Silver</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>642.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45778856</th>\n",
|
||
" <td>15681</td>\n",
|
||
" <td>831</td>\n",
|
||
" <td>HYUNDAI</td>\n",
|
||
" <td>Sonata</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>Sedan</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>2.4</td>\n",
|
||
" <td>161600 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Tiptronic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>831</td>\n",
|
||
" <td>831</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45804997</th>\n",
|
||
" <td>26108</td>\n",
|
||
" <td>836</td>\n",
|
||
" <td>HYUNDAI</td>\n",
|
||
" <td>Tucson</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Diesel</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>116365 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Grey</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>836</td>\n",
|
||
" <td>836</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45793526</th>\n",
|
||
" <td>5331</td>\n",
|
||
" <td>1288</td>\n",
|
||
" <td>CHEVROLET</td>\n",
|
||
" <td>Captiva</td>\n",
|
||
" <td>2007</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Diesel</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>51258 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1288</td>\n",
|
||
" <td>1288</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45813273</th>\n",
|
||
" <td>470</td>\n",
|
||
" <td>753</td>\n",
|
||
" <td>HYUNDAI</td>\n",
|
||
" <td>Sonata</td>\n",
|
||
" <td>2012</td>\n",
|
||
" <td>Sedan</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Hybrid</td>\n",
|
||
" <td>2.4</td>\n",
|
||
" <td>186923 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>753</td>\n",
|
||
" <td>753</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Price Levy Manufacturer Model Prod_year Category \\\n",
|
||
"ID \n",
|
||
"45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe \n",
|
||
"45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
|
||
"45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
|
||
"45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
|
||
"45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
|
||
"\n",
|
||
" Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
|
||
"ID \n",
|
||
"45798355 1 CNG 2.0 Turbo 300000 km 4.0 \n",
|
||
"45778856 1 Petrol 2.4 161600 km 4.0 \n",
|
||
"45804997 1 Diesel 2 116365 km 4.0 \n",
|
||
"45793526 1 Diesel 2 51258 km 4.0 \n",
|
||
"45813273 1 Hybrid 2.4 186923 km 4.0 \n",
|
||
"\n",
|
||
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
|
||
"ID \n",
|
||
"45798355 Manual Rear 02-Mar Left wheel Silver 5 \n",
|
||
"45778856 Tiptronic Front 04-May Left wheel Red 8 \n",
|
||
"45804997 Automatic Front 04-May Left wheel Grey 4 \n",
|
||
"45793526 Automatic Front 04-May Left wheel Black 4 \n",
|
||
"45813273 Automatic Front 04-May Left wheel White 12 \n",
|
||
"\n",
|
||
" LevyFillNA LevyFillMedian \n",
|
||
"ID \n",
|
||
"45798355 0 642.0 \n",
|
||
"45778856 831 831 \n",
|
||
"45804997 836 836 \n",
|
||
"45793526 1288 1288 \n",
|
||
"45813273 753 753 "
|
||
]
|
||
},
|
||
"execution_count": 47,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"fillna_df = df.fillna(0)\n",
|
||
"\n",
|
||
"print(fillna_df.shape)\n",
|
||
"\n",
|
||
"print(fillna_df.isnull().any())\n",
|
||
"\n",
|
||
"# Замена пустых данных на 0\n",
|
||
"df[\"LevyFillNA\"] = df[\"Levy\"].fillna(0)\n",
|
||
"\n",
|
||
"# Замена пустых данных на медиану\n",
|
||
"df[\"LevyFillMedian\"] = df[\"Levy\"].fillna(pd.to_numeric(df[\"Levy\"]).median())\n",
|
||
"\n",
|
||
"df.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>Levy</th>\n",
|
||
" <th>Manufacturer</th>\n",
|
||
" <th>Model</th>\n",
|
||
" <th>Prod. year</th>\n",
|
||
" <th>Category</th>\n",
|
||
" <th>Leather_interior</th>\n",
|
||
" <th>Fuel type</th>\n",
|
||
" <th>Engine volume</th>\n",
|
||
" <th>Mileage</th>\n",
|
||
" <th>Cylinders</th>\n",
|
||
" <th>Gear box type</th>\n",
|
||
" <th>Drive wheels</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Wheel</th>\n",
|
||
" <th>Color</th>\n",
|
||
" <th>Airbags</th>\n",
|
||
" <th>LevyFillNA</th>\n",
|
||
" <th>LevyFillMedian</th>\n",
|
||
" <th>LevyCopy</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ID</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>45798355</th>\n",
|
||
" <td>8467</td>\n",
|
||
" <td>None</td>\n",
|
||
" <td>MERCEDES-BENZ</td>\n",
|
||
" <td>CLK 200</td>\n",
|
||
" <td>1999</td>\n",
|
||
" <td>Coupe</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>CNG</td>\n",
|
||
" <td>2.0 Turbo</td>\n",
|
||
" <td>300000 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Manual</td>\n",
|
||
" <td>Rear</td>\n",
|
||
" <td>02-Mar</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Silver</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>642.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45778856</th>\n",
|
||
" <td>15681</td>\n",
|
||
" <td>831</td>\n",
|
||
" <td>HYUNDAI</td>\n",
|
||
" <td>Sonata</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>Sedan</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>2.4</td>\n",
|
||
" <td>161600 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Tiptronic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>831</td>\n",
|
||
" <td>831</td>\n",
|
||
" <td>831</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45804997</th>\n",
|
||
" <td>26108</td>\n",
|
||
" <td>836</td>\n",
|
||
" <td>HYUNDAI</td>\n",
|
||
" <td>Tucson</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Diesel</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>116365 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Grey</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>836</td>\n",
|
||
" <td>836</td>\n",
|
||
" <td>836</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45793526</th>\n",
|
||
" <td>5331</td>\n",
|
||
" <td>1288</td>\n",
|
||
" <td>CHEVROLET</td>\n",
|
||
" <td>Captiva</td>\n",
|
||
" <td>2007</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Diesel</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>51258 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1288</td>\n",
|
||
" <td>1288</td>\n",
|
||
" <td>1288</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45813273</th>\n",
|
||
" <td>470</td>\n",
|
||
" <td>753</td>\n",
|
||
" <td>HYUNDAI</td>\n",
|
||
" <td>Sonata</td>\n",
|
||
" <td>2012</td>\n",
|
||
" <td>Sedan</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Hybrid</td>\n",
|
||
" <td>2.4</td>\n",
|
||
" <td>186923 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>753</td>\n",
|
||
" <td>753</td>\n",
|
||
" <td>753</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Price Levy Manufacturer Model Prod. year Category \\\n",
|
||
"ID \n",
|
||
"45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe \n",
|
||
"45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
|
||
"45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
|
||
"45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
|
||
"45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
|
||
"\n",
|
||
" Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
|
||
"ID \n",
|
||
"45798355 1 CNG 2.0 Turbo 300000 km 4.0 \n",
|
||
"45778856 1 Petrol 2.4 161600 km 4.0 \n",
|
||
"45804997 1 Diesel 2 116365 km 4.0 \n",
|
||
"45793526 1 Diesel 2 51258 km 4.0 \n",
|
||
"45813273 1 Hybrid 2.4 186923 km 4.0 \n",
|
||
"\n",
|
||
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
|
||
"ID \n",
|
||
"45798355 Manual Rear 02-Mar Left wheel Silver 5 \n",
|
||
"45778856 Tiptronic Front 04-May Left wheel Red 8 \n",
|
||
"45804997 Automatic Front 04-May Left wheel Grey 4 \n",
|
||
"45793526 Automatic Front 04-May Left wheel Black 4 \n",
|
||
"45813273 Automatic Front 04-May Left wheel White 12 \n",
|
||
"\n",
|
||
" LevyFillNA LevyFillMedian LevyCopy \n",
|
||
"ID \n",
|
||
"45798355 0 642.0 0 \n",
|
||
"45778856 831 831 831 \n",
|
||
"45804997 836 836 836 \n",
|
||
"45793526 1288 1288 1288 \n",
|
||
"45813273 753 753 753 "
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df[\"LevyCopy\"] = df[\"Levy\"]\n",
|
||
"\n",
|
||
"# Замена данных сразу в DataFrame без копирования\n",
|
||
"df.fillna({\"LevyCopy\": 0}, inplace=True)\n",
|
||
"\n",
|
||
"df.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Удаление наблюдений с пропусками"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(13418, 20)\n",
|
||
"Price False\n",
|
||
"Levy False\n",
|
||
"Manufacturer False\n",
|
||
"Model False\n",
|
||
"Prod. year False\n",
|
||
"Category False\n",
|
||
"Leather_interior False\n",
|
||
"Fuel type False\n",
|
||
"Engine volume False\n",
|
||
"Mileage False\n",
|
||
"Cylinders False\n",
|
||
"Gear box type False\n",
|
||
"Drive wheels False\n",
|
||
"Doors False\n",
|
||
"Wheel False\n",
|
||
"Color False\n",
|
||
"Airbags False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"dropna_df = df.dropna()\n",
|
||
"\n",
|
||
"print(dropna_df.shape)\n",
|
||
"\n",
|
||
"print(dropna_df.isnull().any())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Создание выборок данных\n",
|
||
"\n",
|
||
"Библиотека scikit-learn\n",
|
||
"\n",
|
||
"https://scikit-learn.org/stable/index.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Функция для создания выборок\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"\n",
|
||
"def split_stratified_into_train_val_test(\n",
|
||
" df_input,\n",
|
||
" stratify_colname=\"y\",\n",
|
||
" frac_train=0.6,\n",
|
||
" frac_val=0.15,\n",
|
||
" frac_test=0.25,\n",
|
||
" random_state=None,\n",
|
||
"):\n",
|
||
" \"\"\"\n",
|
||
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
||
" following fractional ratios provided by the user, where each subset is\n",
|
||
" stratified by the values in a specific column (that is, each subset has\n",
|
||
" the same relative frequency of the values in the column). It performs this\n",
|
||
" splitting by running train_test_split() twice.\n",
|
||
"\n",
|
||
" Parameters\n",
|
||
" ----------\n",
|
||
" df_input : Pandas dataframe\n",
|
||
" Input dataframe to be split.\n",
|
||
" stratify_colname : str\n",
|
||
" The name of the column that will be used for stratification. Usually\n",
|
||
" this column would be for the label.\n",
|
||
" frac_train : float\n",
|
||
" frac_val : float\n",
|
||
" frac_test : float\n",
|
||
" The ratios with which the dataframe will be split into train, val, and\n",
|
||
" test data. The values should be expressed as float fractions and should\n",
|
||
" sum to 1.0.\n",
|
||
" random_state : int, None, or RandomStateInstance\n",
|
||
" Value to be passed to train_test_split().\n",
|
||
"\n",
|
||
" Returns\n",
|
||
" -------\n",
|
||
" df_train, df_val, df_test :\n",
|
||
" Dataframes containing the three splits.\n",
|
||
" \"\"\"\n",
|
||
"\n",
|
||
" if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:\n",
|
||
" raise ValueError(\n",
|
||
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||
" % (frac_train, frac_val, frac_test)\n",
|
||
" )\n",
|
||
"\n",
|
||
" if stratify_colname not in df_input.columns:\n",
|
||
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||
"\n",
|
||
" X = df_input # Contains all columns.\n",
|
||
" y = df_input[\n",
|
||
" [stratify_colname]\n",
|
||
" ] # Dataframe of just the column on which to stratify.\n",
|
||
"\n",
|
||
" # Split original dataframe into train and temp dataframes.\n",
|
||
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Split the temp dataframe into val and test dataframes.\n",
|
||
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||
" df_temp,\n",
|
||
" y_temp,\n",
|
||
" stratify=y_temp,\n",
|
||
" test_size=relative_frac_test,\n",
|
||
" random_state=random_state,\n",
|
||
" )\n",
|
||
"\n",
|
||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||
"\n",
|
||
" return df_train, df_val, df_test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Leather_interior\n",
|
||
"1 13954\n",
|
||
"0 5283\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Обучающая выборка: (11542, 3)\n",
|
||
"Leather_interior\n",
|
||
"1 8372\n",
|
||
"0 3170\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Контрольная выборка: (3847, 3)\n",
|
||
"Leather_interior\n",
|
||
"1 2791\n",
|
||
"0 1056\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Тестовая выборка: (3848, 3)\n",
|
||
"Leather_interior\n",
|
||
"1 2791\n",
|
||
"0 1057\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
||
"print(df.Leather_interior.value_counts())\n",
|
||
"\n",
|
||
"data = df[[\"Leather_interior\", \"Price\", \"Prod_year\"]].copy()\n",
|
||
"\n",
|
||
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
||
" data,\n",
|
||
" stratify_colname=\"Leather_interior\",\n",
|
||
" frac_train=0.60,\n",
|
||
" frac_val=0.20,\n",
|
||
" frac_test=0.20,\n",
|
||
")\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"print(df_train.Leather_interior.value_counts())\n",
|
||
"\n",
|
||
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
||
"print(df_val.Leather_interior.value_counts())\n",
|
||
"\n",
|
||
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
||
"print(df_test.Leather_interior.value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выборка с избытком (oversampling)\n",
|
||
"\n",
|
||
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
||
"\n",
|
||
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
||
"\n",
|
||
"Выборка с недостатком (undersampling)\n",
|
||
"\n",
|
||
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
||
"\n",
|
||
"Библиотека imbalanced-learn\n",
|
||
"\n",
|
||
"https://imbalanced-learn.org/stable/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Обучающая выборка: (11542, 3)\n",
|
||
"Leather_interior\n",
|
||
"1 8372\n",
|
||
"0 3170\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Обучающая выборка после oversampling: (16453, 3)\n",
|
||
"Leather_interior\n",
|
||
"1 8372\n",
|
||
"0 8081\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Leather_interior</th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>Prod_year</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>22621</td>\n",
|
||
" <td>2011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>35850</td>\n",
|
||
" <td>2006</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>20385</td>\n",
|
||
" <td>2012</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4547</td>\n",
|
||
" <td>2009</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>19416</td>\n",
|
||
" <td>2013</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16448</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>31361</td>\n",
|
||
" <td>2015</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16449</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>31361</td>\n",
|
||
" <td>2015</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16450</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>31361</td>\n",
|
||
" <td>2015</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16451</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>11172</td>\n",
|
||
" <td>2009</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16452</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>11133</td>\n",
|
||
" <td>2006</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>16453 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Leather_interior Price Prod_year\n",
|
||
"0 1 22621 2011\n",
|
||
"1 1 35850 2006\n",
|
||
"2 1 20385 2012\n",
|
||
"3 1 4547 2009\n",
|
||
"4 1 19416 2013\n",
|
||
"... ... ... ...\n",
|
||
"16448 0 31361 2015\n",
|
||
"16449 0 31361 2015\n",
|
||
"16450 0 31361 2015\n",
|
||
"16451 0 11172 2009\n",
|
||
"16452 0 11133 2006\n",
|
||
"\n",
|
||
"[16453 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.over_sampling import ADASYN\n",
|
||
"\n",
|
||
"ada = ADASYN()\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"print(df_train.Leather_interior.value_counts())\n",
|
||
"\n",
|
||
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Leather_interior\"]) # type: ignore\n",
|
||
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||
"print(df_train_adasyn.Leather_interior.value_counts())\n",
|
||
"\n",
|
||
"df_train_adasyn"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|