2024-09-28 13:38:06 +04:00
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Загрузка данных в DataFrame"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2024-11-16 00:11:46 +04:00
|
|
|
|
"execution_count": null,
|
2024-09-28 13:38:06 +04:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
2024-11-16 00:11:46 +04:00
|
|
|
|
"ename": "",
|
|
|
|
|
"evalue": "",
|
|
|
|
|
"output_type": "error",
|
|
|
|
|
"traceback": [
|
|
|
|
|
"\u001b[1;31mRunning cells with 'Python 3.9.13' requires the ipykernel package.\n",
|
|
|
|
|
"\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
|
|
|
|
|
"\u001b[1;31mCommand: 'c:/Users/ogoro/AppData/Local/Programs/Python/Python39/python.exe -m pip install ipykernel -U --user --force-reinstall'"
|
2024-09-28 13:38:06 +04:00
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"from numpy import nan\n",
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n",
|
|
|
|
|
"df[\"Leather_interior\"] = df[\"Leather_interior\"].replace({\"Yes\": 1, \"No\": 0})\n",
|
|
|
|
|
"df[\"Levy\"] = df[\"Levy\"].replace({\"-\": None})\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"df.info()\n",
|
|
|
|
|
"print(df.shape)\n",
|
|
|
|
|
"df.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Получение сведений о пропущенных данных"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Типы пропущенных данных:\n",
|
|
|
|
|
"- None - представление пустых данных в Python\n",
|
|
|
|
|
"- NaN - представление пустых данных в Pandas\n",
|
|
|
|
|
"- '' - пустая строка"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 46,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Price 0\n",
|
|
|
|
|
"Levy 5819\n",
|
|
|
|
|
"Manufacturer 0\n",
|
|
|
|
|
"Model 0\n",
|
|
|
|
|
"Prod_year 0\n",
|
|
|
|
|
"Category 0\n",
|
|
|
|
|
"Leather_interior 0\n",
|
|
|
|
|
"Fuel type 0\n",
|
|
|
|
|
"Engine volume 0\n",
|
|
|
|
|
"Mileage 0\n",
|
|
|
|
|
"Cylinders 0\n",
|
|
|
|
|
"Gear box type 0\n",
|
|
|
|
|
"Drive wheels 0\n",
|
|
|
|
|
"Doors 0\n",
|
|
|
|
|
"Wheel 0\n",
|
|
|
|
|
"Color 0\n",
|
|
|
|
|
"Airbags 0\n",
|
|
|
|
|
"dtype: int64\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Price False\n",
|
|
|
|
|
"Levy True\n",
|
|
|
|
|
"Manufacturer False\n",
|
|
|
|
|
"Model False\n",
|
|
|
|
|
"Prod_year False\n",
|
|
|
|
|
"Category False\n",
|
|
|
|
|
"Leather_interior False\n",
|
|
|
|
|
"Fuel type False\n",
|
|
|
|
|
"Engine volume False\n",
|
|
|
|
|
"Mileage False\n",
|
|
|
|
|
"Cylinders False\n",
|
|
|
|
|
"Gear box type False\n",
|
|
|
|
|
"Drive wheels False\n",
|
|
|
|
|
"Doors False\n",
|
|
|
|
|
"Wheel False\n",
|
|
|
|
|
"Color False\n",
|
|
|
|
|
"Airbags False\n",
|
|
|
|
|
"dtype: bool\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Levy процент пустых значений: %30.25\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Количество пустых значений признаков\n",
|
|
|
|
|
"print(df.isnull().sum())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Есть ли пустые значения признаков\n",
|
|
|
|
|
"print(df.isnull().any())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Процент пустых значений признаков\n",
|
|
|
|
|
"for i in df.columns:\n",
|
|
|
|
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|
|
|
|
" if null_rate > 0:\n",
|
|
|
|
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Заполнение пропущенных данных\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 47,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"(19237, 17)\n",
|
|
|
|
|
"Price False\n",
|
|
|
|
|
"Levy False\n",
|
|
|
|
|
"Manufacturer False\n",
|
|
|
|
|
"Model False\n",
|
|
|
|
|
"Prod_year False\n",
|
|
|
|
|
"Category False\n",
|
|
|
|
|
"Leather_interior False\n",
|
|
|
|
|
"Fuel type False\n",
|
|
|
|
|
"Engine volume False\n",
|
|
|
|
|
"Mileage False\n",
|
|
|
|
|
"Cylinders False\n",
|
|
|
|
|
"Gear box type False\n",
|
|
|
|
|
"Drive wheels False\n",
|
|
|
|
|
"Doors False\n",
|
|
|
|
|
"Wheel False\n",
|
|
|
|
|
"Color False\n",
|
|
|
|
|
"Airbags False\n",
|
|
|
|
|
"dtype: bool\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>Price</th>\n",
|
|
|
|
|
" <th>Levy</th>\n",
|
|
|
|
|
" <th>Manufacturer</th>\n",
|
|
|
|
|
" <th>Model</th>\n",
|
|
|
|
|
" <th>Prod_year</th>\n",
|
|
|
|
|
" <th>Category</th>\n",
|
|
|
|
|
" <th>Leather_interior</th>\n",
|
|
|
|
|
" <th>Fuel type</th>\n",
|
|
|
|
|
" <th>Engine volume</th>\n",
|
|
|
|
|
" <th>Mileage</th>\n",
|
|
|
|
|
" <th>Cylinders</th>\n",
|
|
|
|
|
" <th>Gear box type</th>\n",
|
|
|
|
|
" <th>Drive wheels</th>\n",
|
|
|
|
|
" <th>Doors</th>\n",
|
|
|
|
|
" <th>Wheel</th>\n",
|
|
|
|
|
" <th>Color</th>\n",
|
|
|
|
|
" <th>Airbags</th>\n",
|
|
|
|
|
" <th>LevyFillNA</th>\n",
|
|
|
|
|
" <th>LevyFillMedian</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>ID</th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45798355</th>\n",
|
|
|
|
|
" <td>8467</td>\n",
|
|
|
|
|
" <td>None</td>\n",
|
|
|
|
|
" <td>MERCEDES-BENZ</td>\n",
|
|
|
|
|
" <td>CLK 200</td>\n",
|
|
|
|
|
" <td>1999</td>\n",
|
|
|
|
|
" <td>Coupe</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>CNG</td>\n",
|
|
|
|
|
" <td>2.0 Turbo</td>\n",
|
|
|
|
|
" <td>300000 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Manual</td>\n",
|
|
|
|
|
" <td>Rear</td>\n",
|
|
|
|
|
" <td>02-Mar</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Silver</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>642.0</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45778856</th>\n",
|
|
|
|
|
" <td>15681</td>\n",
|
|
|
|
|
" <td>831</td>\n",
|
|
|
|
|
" <td>HYUNDAI</td>\n",
|
|
|
|
|
" <td>Sonata</td>\n",
|
|
|
|
|
" <td>2011</td>\n",
|
|
|
|
|
" <td>Sedan</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Petrol</td>\n",
|
|
|
|
|
" <td>2.4</td>\n",
|
|
|
|
|
" <td>161600 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Tiptronic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Red</td>\n",
|
|
|
|
|
" <td>8</td>\n",
|
|
|
|
|
" <td>831</td>\n",
|
|
|
|
|
" <td>831</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45804997</th>\n",
|
|
|
|
|
" <td>26108</td>\n",
|
|
|
|
|
" <td>836</td>\n",
|
|
|
|
|
" <td>HYUNDAI</td>\n",
|
|
|
|
|
" <td>Tucson</td>\n",
|
|
|
|
|
" <td>2010</td>\n",
|
|
|
|
|
" <td>Jeep</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Diesel</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>116365 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Grey</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>836</td>\n",
|
|
|
|
|
" <td>836</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45793526</th>\n",
|
|
|
|
|
" <td>5331</td>\n",
|
|
|
|
|
" <td>1288</td>\n",
|
|
|
|
|
" <td>CHEVROLET</td>\n",
|
|
|
|
|
" <td>Captiva</td>\n",
|
|
|
|
|
" <td>2007</td>\n",
|
|
|
|
|
" <td>Jeep</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Diesel</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>51258 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Black</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>1288</td>\n",
|
|
|
|
|
" <td>1288</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45813273</th>\n",
|
|
|
|
|
" <td>470</td>\n",
|
|
|
|
|
" <td>753</td>\n",
|
|
|
|
|
" <td>HYUNDAI</td>\n",
|
|
|
|
|
" <td>Sonata</td>\n",
|
|
|
|
|
" <td>2012</td>\n",
|
|
|
|
|
" <td>Sedan</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Hybrid</td>\n",
|
|
|
|
|
" <td>2.4</td>\n",
|
|
|
|
|
" <td>186923 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>White</td>\n",
|
|
|
|
|
" <td>12</td>\n",
|
|
|
|
|
" <td>753</td>\n",
|
|
|
|
|
" <td>753</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" Price Levy Manufacturer Model Prod_year Category \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe \n",
|
|
|
|
|
"45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
|
|
|
|
|
"45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
|
|
|
|
|
"45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
|
|
|
|
|
"45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 1 CNG 2.0 Turbo 300000 km 4.0 \n",
|
|
|
|
|
"45778856 1 Petrol 2.4 161600 km 4.0 \n",
|
|
|
|
|
"45804997 1 Diesel 2 116365 km 4.0 \n",
|
|
|
|
|
"45793526 1 Diesel 2 51258 km 4.0 \n",
|
|
|
|
|
"45813273 1 Hybrid 2.4 186923 km 4.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 Manual Rear 02-Mar Left wheel Silver 5 \n",
|
|
|
|
|
"45778856 Tiptronic Front 04-May Left wheel Red 8 \n",
|
|
|
|
|
"45804997 Automatic Front 04-May Left wheel Grey 4 \n",
|
|
|
|
|
"45793526 Automatic Front 04-May Left wheel Black 4 \n",
|
|
|
|
|
"45813273 Automatic Front 04-May Left wheel White 12 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" LevyFillNA LevyFillMedian \n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 0 642.0 \n",
|
|
|
|
|
"45778856 831 831 \n",
|
|
|
|
|
"45804997 836 836 \n",
|
|
|
|
|
"45793526 1288 1288 \n",
|
|
|
|
|
"45813273 753 753 "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 47,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"fillna_df = df.fillna(0)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(fillna_df.shape)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(fillna_df.isnull().any())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Замена пустых данных на 0\n",
|
|
|
|
|
"df[\"LevyFillNA\"] = df[\"Levy\"].fillna(0)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Замена пустых данных на медиану\n",
|
|
|
|
|
"df[\"LevyFillMedian\"] = df[\"Levy\"].fillna(df[\"LevyFillNA\"].median())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"df.tail()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 32,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>Price</th>\n",
|
|
|
|
|
" <th>Levy</th>\n",
|
|
|
|
|
" <th>Manufacturer</th>\n",
|
|
|
|
|
" <th>Model</th>\n",
|
|
|
|
|
" <th>Prod. year</th>\n",
|
|
|
|
|
" <th>Category</th>\n",
|
|
|
|
|
" <th>Leather_interior</th>\n",
|
|
|
|
|
" <th>Fuel type</th>\n",
|
|
|
|
|
" <th>Engine volume</th>\n",
|
|
|
|
|
" <th>Mileage</th>\n",
|
|
|
|
|
" <th>Cylinders</th>\n",
|
|
|
|
|
" <th>Gear box type</th>\n",
|
|
|
|
|
" <th>Drive wheels</th>\n",
|
|
|
|
|
" <th>Doors</th>\n",
|
|
|
|
|
" <th>Wheel</th>\n",
|
|
|
|
|
" <th>Color</th>\n",
|
|
|
|
|
" <th>Airbags</th>\n",
|
|
|
|
|
" <th>LevyFillNA</th>\n",
|
|
|
|
|
" <th>LevyFillMedian</th>\n",
|
|
|
|
|
" <th>LevyCopy</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>ID</th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45798355</th>\n",
|
|
|
|
|
" <td>8467</td>\n",
|
|
|
|
|
" <td>None</td>\n",
|
|
|
|
|
" <td>MERCEDES-BENZ</td>\n",
|
|
|
|
|
" <td>CLK 200</td>\n",
|
|
|
|
|
" <td>1999</td>\n",
|
|
|
|
|
" <td>Coupe</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>CNG</td>\n",
|
|
|
|
|
" <td>2.0 Turbo</td>\n",
|
|
|
|
|
" <td>300000 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Manual</td>\n",
|
|
|
|
|
" <td>Rear</td>\n",
|
|
|
|
|
" <td>02-Mar</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Silver</td>\n",
|
|
|
|
|
" <td>5</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>642.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45778856</th>\n",
|
|
|
|
|
" <td>15681</td>\n",
|
|
|
|
|
" <td>831</td>\n",
|
|
|
|
|
" <td>HYUNDAI</td>\n",
|
|
|
|
|
" <td>Sonata</td>\n",
|
|
|
|
|
" <td>2011</td>\n",
|
|
|
|
|
" <td>Sedan</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Petrol</td>\n",
|
|
|
|
|
" <td>2.4</td>\n",
|
|
|
|
|
" <td>161600 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Tiptronic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Red</td>\n",
|
|
|
|
|
" <td>8</td>\n",
|
|
|
|
|
" <td>831</td>\n",
|
|
|
|
|
" <td>831</td>\n",
|
|
|
|
|
" <td>831</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45804997</th>\n",
|
|
|
|
|
" <td>26108</td>\n",
|
|
|
|
|
" <td>836</td>\n",
|
|
|
|
|
" <td>HYUNDAI</td>\n",
|
|
|
|
|
" <td>Tucson</td>\n",
|
|
|
|
|
" <td>2010</td>\n",
|
|
|
|
|
" <td>Jeep</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Diesel</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>116365 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Grey</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>836</td>\n",
|
|
|
|
|
" <td>836</td>\n",
|
|
|
|
|
" <td>836</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45793526</th>\n",
|
|
|
|
|
" <td>5331</td>\n",
|
|
|
|
|
" <td>1288</td>\n",
|
|
|
|
|
" <td>CHEVROLET</td>\n",
|
|
|
|
|
" <td>Captiva</td>\n",
|
|
|
|
|
" <td>2007</td>\n",
|
|
|
|
|
" <td>Jeep</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Diesel</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>51258 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Black</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>1288</td>\n",
|
|
|
|
|
" <td>1288</td>\n",
|
|
|
|
|
" <td>1288</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>45813273</th>\n",
|
|
|
|
|
" <td>470</td>\n",
|
|
|
|
|
" <td>753</td>\n",
|
|
|
|
|
" <td>HYUNDAI</td>\n",
|
|
|
|
|
" <td>Sonata</td>\n",
|
|
|
|
|
" <td>2012</td>\n",
|
|
|
|
|
" <td>Sedan</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Hybrid</td>\n",
|
|
|
|
|
" <td>2.4</td>\n",
|
|
|
|
|
" <td>186923 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>White</td>\n",
|
|
|
|
|
" <td>12</td>\n",
|
|
|
|
|
" <td>753</td>\n",
|
|
|
|
|
" <td>753</td>\n",
|
|
|
|
|
" <td>753</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" Price Levy Manufacturer Model Prod. year Category \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 8467 None MERCEDES-BENZ CLK 200 1999 Coupe \n",
|
|
|
|
|
"45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
|
|
|
|
|
"45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
|
|
|
|
|
"45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
|
|
|
|
|
"45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 1 CNG 2.0 Turbo 300000 km 4.0 \n",
|
|
|
|
|
"45778856 1 Petrol 2.4 161600 km 4.0 \n",
|
|
|
|
|
"45804997 1 Diesel 2 116365 km 4.0 \n",
|
|
|
|
|
"45793526 1 Diesel 2 51258 km 4.0 \n",
|
|
|
|
|
"45813273 1 Hybrid 2.4 186923 km 4.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 Manual Rear 02-Mar Left wheel Silver 5 \n",
|
|
|
|
|
"45778856 Tiptronic Front 04-May Left wheel Red 8 \n",
|
|
|
|
|
"45804997 Automatic Front 04-May Left wheel Grey 4 \n",
|
|
|
|
|
"45793526 Automatic Front 04-May Left wheel Black 4 \n",
|
|
|
|
|
"45813273 Automatic Front 04-May Left wheel White 12 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" LevyFillNA LevyFillMedian LevyCopy \n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45798355 0 642.0 0 \n",
|
|
|
|
|
"45778856 831 831 831 \n",
|
|
|
|
|
"45804997 836 836 836 \n",
|
|
|
|
|
"45793526 1288 1288 1288 \n",
|
|
|
|
|
"45813273 753 753 753 "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 32,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"df[\"LevyCopy\"] = df[\"Levy\"]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Замена данных сразу в DataFrame без копирования\n",
|
|
|
|
|
"df.fillna({\"LevyCopy\": 0}, inplace=True)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"df.tail()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Удаление наблюдений с пропусками"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 33,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"(13418, 20)\n",
|
|
|
|
|
"Price False\n",
|
|
|
|
|
"Levy False\n",
|
|
|
|
|
"Manufacturer False\n",
|
|
|
|
|
"Model False\n",
|
|
|
|
|
"Prod. year False\n",
|
|
|
|
|
"Category False\n",
|
|
|
|
|
"Leather_interior False\n",
|
|
|
|
|
"Fuel type False\n",
|
|
|
|
|
"Engine volume False\n",
|
|
|
|
|
"Mileage False\n",
|
|
|
|
|
"Cylinders False\n",
|
|
|
|
|
"Gear box type False\n",
|
|
|
|
|
"Drive wheels False\n",
|
|
|
|
|
"Doors False\n",
|
|
|
|
|
"Wheel False\n",
|
|
|
|
|
"Color False\n",
|
|
|
|
|
"Airbags False\n",
|
|
|
|
|
"dtype: bool\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"dropna_df = df.dropna()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(dropna_df.shape)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(fillna_df.isnull().any())"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Создание выборок данных\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Библиотека scikit-learn\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://scikit-learn.org/stable/index.html"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 41,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Функция для создания выборок\n",
|
|
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def split_stratified_into_train_val_test(\n",
|
|
|
|
|
" df_input,\n",
|
|
|
|
|
" stratify_colname=\"y\",\n",
|
|
|
|
|
" frac_train=0.6,\n",
|
|
|
|
|
" frac_val=0.15,\n",
|
|
|
|
|
" frac_test=0.25,\n",
|
|
|
|
|
" random_state=None,\n",
|
|
|
|
|
"):\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
|
|
|
|
" following fractional ratios provided by the user, where each subset is\n",
|
|
|
|
|
" stratified by the values in a specific column (that is, each subset has\n",
|
|
|
|
|
" the same relative frequency of the values in the column). It performs this\n",
|
|
|
|
|
" splitting by running train_test_split() twice.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Parameters\n",
|
|
|
|
|
" ----------\n",
|
|
|
|
|
" df_input : Pandas dataframe\n",
|
|
|
|
|
" Input dataframe to be split.\n",
|
|
|
|
|
" stratify_colname : str\n",
|
|
|
|
|
" The name of the column that will be used for stratification. Usually\n",
|
|
|
|
|
" this column would be for the label.\n",
|
|
|
|
|
" frac_train : float\n",
|
|
|
|
|
" frac_val : float\n",
|
|
|
|
|
" frac_test : float\n",
|
|
|
|
|
" The ratios with which the dataframe will be split into train, val, and\n",
|
|
|
|
|
" test data. The values should be expressed as float fractions and should\n",
|
|
|
|
|
" sum to 1.0.\n",
|
|
|
|
|
" random_state : int, None, or RandomStateInstance\n",
|
|
|
|
|
" Value to be passed to train_test_split().\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Returns\n",
|
|
|
|
|
" -------\n",
|
|
|
|
|
" df_train, df_val, df_test :\n",
|
|
|
|
|
" Dataframes containing the three splits.\n",
|
|
|
|
|
" \"\"\"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|
|
|
|
" raise ValueError(\n",
|
|
|
|
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|
|
|
|
" % (frac_train, frac_val, frac_test)\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" if stratify_colname not in df_input.columns:\n",
|
|
|
|
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" X = df_input # Contains all columns.\n",
|
|
|
|
|
" y = df_input[\n",
|
|
|
|
|
" [stratify_colname]\n",
|
|
|
|
|
" ] # Dataframe of just the column on which to stratify.\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Split original dataframe into train and temp dataframes.\n",
|
|
|
|
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|
|
|
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" # Split the temp dataframe into val and test dataframes.\n",
|
|
|
|
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|
|
|
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|
|
|
|
" df_temp,\n",
|
|
|
|
|
" y_temp,\n",
|
|
|
|
|
" stratify=y_temp,\n",
|
|
|
|
|
" test_size=relative_frac_test,\n",
|
|
|
|
|
" random_state=random_state,\n",
|
|
|
|
|
" )\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" return df_train, df_val, df_test"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 48,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Leather_interior\n",
|
|
|
|
|
"1 13954\n",
|
|
|
|
|
"0 5283\n",
|
|
|
|
|
"Name: count, dtype: int64\n",
|
|
|
|
|
"Обучающая выборка: (11542, 3)\n",
|
|
|
|
|
"Leather_interior\n",
|
|
|
|
|
"1 8372\n",
|
|
|
|
|
"0 3170\n",
|
|
|
|
|
"Name: count, dtype: int64\n",
|
|
|
|
|
"Контрольная выборка: (3847, 3)\n",
|
|
|
|
|
"Leather_interior\n",
|
|
|
|
|
"1 2791\n",
|
|
|
|
|
"0 1056\n",
|
|
|
|
|
"Name: count, dtype: int64\n",
|
|
|
|
|
"Тестовая выборка: (3848, 3)\n",
|
|
|
|
|
"Leather_interior\n",
|
|
|
|
|
"1 2791\n",
|
|
|
|
|
"0 1057\n",
|
|
|
|
|
"Name: count, dtype: int64\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
|
|
|
|
"print(df.Leather_interior.value_counts())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"data = df[[\"Leather_interior\", \"Price\", \"Prod_year\"]].copy()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
|
|
|
|
" data,\n",
|
|
|
|
|
" stratify_colname=\"Leather_interior\",\n",
|
|
|
|
|
" frac_train=0.60,\n",
|
|
|
|
|
" frac_val=0.20,\n",
|
|
|
|
|
" frac_test=0.20,\n",
|
|
|
|
|
")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|
|
|
|
"print(df_train.Leather_interior.value_counts())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
|
|
|
|
"print(df_val.Leather_interior.value_counts())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
|
|
|
|
"print(df_test.Leather_interior.value_counts())"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Выборка с избытком (oversampling)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Выборка с недостатком (undersampling)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Библиотека imbalanced-learn\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://imbalanced-learn.org/stable/"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 49,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Обучающая выборка: (11542, 3)\n",
|
|
|
|
|
"Leather_interior\n",
|
|
|
|
|
"1 8372\n",
|
|
|
|
|
"0 3170\n",
|
|
|
|
|
"Name: count, dtype: int64\n",
|
|
|
|
|
"Обучающая выборка после oversampling: (16416, 3)\n",
|
|
|
|
|
"Leather_interior\n",
|
|
|
|
|
"1 8372\n",
|
|
|
|
|
"0 8044\n",
|
|
|
|
|
"Name: count, dtype: int64\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>Leather_interior</th>\n",
|
|
|
|
|
" <th>Price</th>\n",
|
|
|
|
|
" <th>Prod_year</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>21400</td>\n",
|
|
|
|
|
" <td>2008</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>16621</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>28852</td>\n",
|
|
|
|
|
" <td>2017</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>2430</td>\n",
|
|
|
|
|
" <td>2008</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>7840</td>\n",
|
|
|
|
|
" <td>2005</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>16411</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>26030</td>\n",
|
|
|
|
|
" <td>2013</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>16412</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>26030</td>\n",
|
|
|
|
|
" <td>2012</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>16413</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>26030</td>\n",
|
|
|
|
|
" <td>2014</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>16414</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>26030</td>\n",
|
|
|
|
|
" <td>2012</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>16415</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>26030</td>\n",
|
|
|
|
|
" <td>2012</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"<p>16416 rows × 3 columns</p>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" Leather_interior Price Prod_year\n",
|
|
|
|
|
"0 0 21400 2008\n",
|
|
|
|
|
"1 0 16621 2016\n",
|
|
|
|
|
"2 1 28852 2017\n",
|
|
|
|
|
"3 1 2430 2008\n",
|
|
|
|
|
"4 0 7840 2005\n",
|
|
|
|
|
"... ... ... ...\n",
|
|
|
|
|
"16411 0 26030 2013\n",
|
|
|
|
|
"16412 0 26030 2012\n",
|
|
|
|
|
"16413 0 26030 2014\n",
|
|
|
|
|
"16414 0 26030 2012\n",
|
|
|
|
|
"16415 0 26030 2012\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[16416 rows x 3 columns]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 49,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"from imblearn.over_sampling import ADASYN\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"ada = ADASYN()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|
|
|
|
"print(df_train.Leather_interior.value_counts())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Leather_interior\"])\n",
|
|
|
|
|
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
|
|
|
|
"print(df_train_adasyn.Leather_interior.value_counts())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"df_train_adasyn"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
2024-11-16 00:11:46 +04:00
|
|
|
|
"display_name": "Python 3",
|
2024-09-28 13:38:06 +04:00
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
2024-11-16 00:11:46 +04:00
|
|
|
|
"version": "3.10.6"
|
2024-09-28 13:38:06 +04:00
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
}
|