2024-09-21 09:46:46 +04:00
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Загрузка данных в DataFrame"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Index: 19237 entries, 45654403 to 45813273\n",
|
|
|
|
|
"Data columns (total 17 columns):\n",
|
|
|
|
|
" # Column Non-Null Count Dtype \n",
|
|
|
|
|
"--- ------ -------------- ----- \n",
|
|
|
|
|
" 0 Price 19237 non-null int64 \n",
|
|
|
|
|
" 1 Levy 19237 non-null object \n",
|
|
|
|
|
" 2 Manufacturer 19237 non-null object \n",
|
|
|
|
|
" 3 Model 19237 non-null object \n",
|
|
|
|
|
" 4 Prodyear 19237 non-null int64 \n",
|
|
|
|
|
" 5 Category 19237 non-null object \n",
|
|
|
|
|
" 6 Leatherinterior 19237 non-null object \n",
|
|
|
|
|
" 7 Fueltype 19237 non-null object \n",
|
|
|
|
|
" 8 Engine volume 19237 non-null object \n",
|
|
|
|
|
" 9 Mileage 19237 non-null object \n",
|
|
|
|
|
" 10 Cylinders 19237 non-null float64\n",
|
|
|
|
|
" 11 Gear box type 19237 non-null object \n",
|
|
|
|
|
" 12 Drive wheels 19237 non-null object \n",
|
|
|
|
|
" 13 Doors 19237 non-null object \n",
|
|
|
|
|
" 14 Wheel 19237 non-null object \n",
|
|
|
|
|
" 15 Color 19237 non-null object \n",
|
|
|
|
|
" 16 Airbags 19237 non-null int64 \n",
|
|
|
|
|
"dtypes: float64(1), int64(3), object(13)\n",
|
|
|
|
|
"memory usage: 2.6+ MB\n",
|
|
|
|
|
"(19237, 17)\n"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>Price</th>\n",
|
|
|
|
|
" <th>Levy</th>\n",
|
|
|
|
|
" <th>Manufacturer</th>\n",
|
|
|
|
|
" <th>Model</th>\n",
|
|
|
|
|
" <th>Prodyear</th>\n",
|
|
|
|
|
" <th>Category</th>\n",
|
|
|
|
|
" <th>Leatherinterior</th>\n",
|
|
|
|
|
" <th>Fueltype</th>\n",
|
|
|
|
|
" <th>Engine volume</th>\n",
|
|
|
|
|
" <th>Mileage</th>\n",
|
|
|
|
|
" <th>Cylinders</th>\n",
|
|
|
|
|
" <th>Gear box type</th>\n",
|
|
|
|
|
" <th>Drive wheels</th>\n",
|
|
|
|
|
" <th>Doors</th>\n",
|
|
|
|
|
" <th>Wheel</th>\n",
|
|
|
|
|
" <th>Color</th>\n",
|
|
|
|
|
" <th>Airbags</th>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>ID</th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>45654403</th>\n",
|
|
|
|
|
" <td>13328</td>\n",
|
|
|
|
|
" <td>1399</td>\n",
|
|
|
|
|
" <td>LEXUS</td>\n",
|
|
|
|
|
" <td>RX 450</td>\n",
|
|
|
|
|
" <td>2010</td>\n",
|
|
|
|
|
" <td>Jeep</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>1</td>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>Hybrid</td>\n",
|
|
|
|
|
" <td>3.5</td>\n",
|
|
|
|
|
" <td>186005 km</td>\n",
|
|
|
|
|
" <td>6.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>4x4</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Silver</td>\n",
|
|
|
|
|
" <td>12</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>44731507</th>\n",
|
|
|
|
|
" <td>16621</td>\n",
|
|
|
|
|
" <td>1018</td>\n",
|
|
|
|
|
" <td>CHEVROLET</td>\n",
|
|
|
|
|
" <td>Equinox</td>\n",
|
|
|
|
|
" <td>2011</td>\n",
|
|
|
|
|
" <td>Jeep</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>Petrol</td>\n",
|
|
|
|
|
" <td>3</td>\n",
|
|
|
|
|
" <td>192000 km</td>\n",
|
|
|
|
|
" <td>6.0</td>\n",
|
|
|
|
|
" <td>Tiptronic</td>\n",
|
|
|
|
|
" <td>4x4</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Black</td>\n",
|
|
|
|
|
" <td>8</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>45774419</th>\n",
|
|
|
|
|
" <td>8467</td>\n",
|
|
|
|
|
" <td>-</td>\n",
|
|
|
|
|
" <td>HONDA</td>\n",
|
|
|
|
|
" <td>FIT</td>\n",
|
|
|
|
|
" <td>2006</td>\n",
|
|
|
|
|
" <td>Hatchback</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>Petrol</td>\n",
|
|
|
|
|
" <td>1.3</td>\n",
|
|
|
|
|
" <td>200000 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Variator</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Right-hand drive</td>\n",
|
|
|
|
|
" <td>Black</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>45769185</th>\n",
|
|
|
|
|
" <td>3607</td>\n",
|
|
|
|
|
" <td>862</td>\n",
|
|
|
|
|
" <td>FORD</td>\n",
|
|
|
|
|
" <td>Escape</td>\n",
|
|
|
|
|
" <td>2011</td>\n",
|
|
|
|
|
" <td>Jeep</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>1</td>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>Hybrid</td>\n",
|
|
|
|
|
" <td>2.5</td>\n",
|
|
|
|
|
" <td>168966 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>4x4</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>White</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>45809263</th>\n",
|
|
|
|
|
" <td>11726</td>\n",
|
|
|
|
|
" <td>446</td>\n",
|
|
|
|
|
" <td>HONDA</td>\n",
|
|
|
|
|
" <td>FIT</td>\n",
|
|
|
|
|
" <td>2014</td>\n",
|
|
|
|
|
" <td>Hatchback</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>Petrol</td>\n",
|
|
|
|
|
" <td>1.3</td>\n",
|
|
|
|
|
" <td>91901 km</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>Automatic</td>\n",
|
|
|
|
|
" <td>Front</td>\n",
|
|
|
|
|
" <td>04-May</td>\n",
|
|
|
|
|
" <td>Left wheel</td>\n",
|
|
|
|
|
" <td>Silver</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" Price Levy Manufacturer Model Prodyear Category \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
|
|
|
|
|
"44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
|
|
|
|
|
"45774419 8467 - HONDA FIT 2006 Hatchback \n",
|
|
|
|
|
"45769185 3607 862 FORD Escape 2011 Jeep \n",
|
|
|
|
|
"45809263 11726 446 HONDA FIT 2014 Hatchback \n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" Leatherinterior Fueltype Engine volume Mileage Cylinders \\\n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45654403 1 Hybrid 3.5 186005 km 6.0 \n",
|
|
|
|
|
"44731507 0 Petrol 3 192000 km 6.0 \n",
|
|
|
|
|
"45774419 0 Petrol 1.3 200000 km 4.0 \n",
|
|
|
|
|
"45769185 1 Hybrid 2.5 168966 km 4.0 \n",
|
|
|
|
|
"45809263 1 Petrol 1.3 91901 km 4.0 \n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
|
|
|
|
|
"ID \n",
|
|
|
|
|
"45654403 Automatic 4x4 04-May Left wheel Silver 12 \n",
|
|
|
|
|
"44731507 Tiptronic 4x4 04-May Left wheel Black 8 \n",
|
|
|
|
|
"45774419 Variator Front 04-May Right-hand drive Black 2 \n",
|
|
|
|
|
"45769185 Automatic 4x4 04-May Left wheel White 0 \n",
|
|
|
|
|
"45809263 Automatic Front 04-May Left wheel Silver 4 "
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"import pandas as pd\n",
"\n",
"# Read the car listings and use the ID column as the row index.\n",
"df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n",
"\n",
"df.info()\n",
"\n",
"# Encode the leather-interior flag: 1 where the value is 'Yes', 0 otherwise.\n",
"has_leather = df[\"Leatherinterior\"] == 'Yes'\n",
"df[\"Leatherinterior\"] = has_leather.astype(\"int64\")\n",
"\n",
"print(df.shape)\n",
"\n",
"df.head()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Получение сведений о пропущенных данных"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Типы пропущенных данных:\n",
|
|
|
|
|
"- None - представление пустых данных в Python\n",
|
|
|
|
|
"- NaN - представление пустых данных в Pandas\n",
|
|
|
|
|
"- '' - пустая строка"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 2,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Price 0\n",
|
|
|
|
|
"Levy 0\n",
|
|
|
|
|
"Manufacturer 0\n",
|
|
|
|
|
"Model 0\n",
|
|
|
|
|
"Prodyear 0\n",
|
|
|
|
|
"Category 0\n",
|
|
|
|
|
"Leatherinterior 0\n",
|
|
|
|
|
"Fueltype 0\n",
|
|
|
|
|
"Engine volume 0\n",
|
|
|
|
|
"Mileage 0\n",
|
|
|
|
|
"Cylinders 0\n",
|
|
|
|
|
"Gear box type 0\n",
|
|
|
|
|
"Drive wheels 0\n",
|
|
|
|
|
"Doors 0\n",
|
|
|
|
|
"Wheel 0\n",
|
|
|
|
|
"Color 0\n",
|
|
|
|
|
"Airbags 0\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"dtype: int64\n",
|
|
|
|
|
"\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Price False\n",
|
|
|
|
|
"Levy False\n",
|
|
|
|
|
"Manufacturer False\n",
|
|
|
|
|
"Model False\n",
|
|
|
|
|
"Prodyear False\n",
|
|
|
|
|
"Category False\n",
|
|
|
|
|
"Leatherinterior False\n",
|
|
|
|
|
"Fueltype False\n",
|
|
|
|
|
"Engine volume False\n",
|
|
|
|
|
"Mileage False\n",
|
|
|
|
|
"Cylinders False\n",
|
|
|
|
|
"Gear box type False\n",
|
|
|
|
|
"Drive wheels False\n",
|
|
|
|
|
"Doors False\n",
|
|
|
|
|
"Wheel False\n",
|
|
|
|
|
"Color False\n",
|
|
|
|
|
"Airbags False\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"dtype: bool\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"\n"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Number of missing values per feature\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Whether each feature contains any missing value\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Percentage of missing values, printed only for features that have some.\n",
"# The percent sign follows the number (the original printed it in front).\n",
"for column in df.columns:\n",
"    null_rate = df[column].isnull().sum() / len(df) * 100\n",
"    if null_rate > 0:\n",
"        print(f\"{column} процент пустых значений: {null_rate:.2f}%\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Заполнение пропущенных данных\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
"metadata": {},
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"outputs": [],
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"source": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"# NOTE(review): disabled examples. They reference an \"Age\" column, which is not\n",
"# among the columns listed by df.info() for this dataset, so pick another\n",
"# column before re-enabling them.\n",
"\n",
"# fillna_df = df.fillna(0)\n",
"\n",
"# print(fillna_df.shape)\n",
"\n",
"# print(fillna_df.isnull().any())\n",
"\n",
"# # Replace missing values with 0\n",
"# df[\"AgeFillNA\"] = df[\"Age\"].fillna(0)\n",
"\n",
"# # Replace missing values with the median\n",
"# df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n",
"\n",
"# df.tail()"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
"metadata": {},
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"outputs": [],
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"source": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"# NOTE(review): disabled example. It copies an \"Age\" column, which is not among\n",
"# the columns listed by df.info() for this dataset.\n",
"\n",
"# df[\"AgeCopy\"] = df[\"Age\"]\n",
"\n",
"# # Replace values directly in the DataFrame, without making a copy\n",
"# df.fillna({\"AgeCopy\": 0}, inplace=True)\n",
"\n",
"# df.tail()"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Удаление наблюдений с пропусками"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"metadata": {},
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"outputs": [],
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"source": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"# Disabled example: build a frame without the rows that contain missing values,\n",
"# then check its shape and confirm that no missing values remain.\n",
"\n",
"# dropna_df = df.dropna()\n",
"\n",
"# print(dropna_df.shape)\n",
"\n",
"# print(dropna_df.isnull().any())"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Создание выборок данных\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Библиотека scikit-learn\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://scikit-learn.org/stable/index.html"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Функция для создания выборок\n",
|
|
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
"):\n",
"    \"\"\"\n",
"    Split a pandas DataFrame into stratified train, validation and test subsets.\n",
"\n",
"    The split is performed by calling train_test_split() twice: the first call\n",
"    separates the train subset from a temporary remainder, the second call\n",
"    divides that remainder into validation and test. Both calls stratify on\n",
"    the values of ``stratify_colname``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_input : pandas.DataFrame\n",
"        Input dataframe to be split.\n",
"    stratify_colname : str\n",
"        Name of the column used for stratification. Usually this is the label\n",
"        column.\n",
"    frac_train : float\n",
"    frac_val : float\n",
"    frac_test : float\n",
"        Fractions of the rows assigned to train, val and test. They must sum\n",
"        to 1.0; the sum is compared with a small tolerance so that inputs such\n",
"        as 0.7 / 0.2 / 0.1 are accepted despite floating-point rounding.\n",
"    random_state : int, None, or RandomState instance\n",
"        Value passed unchanged to both train_test_split() calls.\n",
"\n",
"    Returns\n",
"    -------\n",
"    df_train, df_val, df_test :\n",
"        Dataframes containing the three splits, each with all input columns.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the fractions do not sum to 1.0, or if ``stratify_colname`` is not\n",
"        a column of ``df_input``.\n",
"    \"\"\"\n",
"\n",
"    # Exact equality would reject valid inputs: 0.7 + 0.2 + 0.1 evaluates to\n",
"    # 0.9999999999999999 in binary floating point. Compare with a tolerance.\n",
"    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        # One-element tuple so a tuple-valued column name formats correctly.\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname,))\n",
"\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Single-column frame used for stratification.\n",
"\n",
"    # First split: train versus a temporary remainder (val + test).\n",
"    df_train, df_temp, _, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    # Second split: the test fraction is rescaled so that it is relative to\n",
"    # the remainder rather than to the whole input.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, _, _ = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    # Every input row must end up in exactly one of the three subsets.\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
"    return df_train, df_val, df_test"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Leatherinterior\n",
|
|
|
|
|
"1 13954\n",
|
|
|
|
|
"0 5283\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"Name: count, dtype: int64\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Обучающая выборка: (11542, 3)\n",
|
|
|
|
|
"Leatherinterior\n",
|
|
|
|
|
"1 8372\n",
|
|
|
|
|
"0 3170\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"Name: count, dtype: int64\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Контрольная выборка: (3847, 3)\n",
|
|
|
|
|
"Leatherinterior\n",
|
|
|
|
|
"1 2791\n",
|
|
|
|
|
"0 1056\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"Name: count, dtype: int64\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Тестовая выборка: (3848, 3)\n",
|
|
|
|
|
"Leatherinterior\n",
|
|
|
|
|
"1 2791\n",
|
|
|
|
|
"0 1057\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"Name: count, dtype: int64\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Show how many observations fall into each label (class)\n",
"print(df[\"Leatherinterior\"].value_counts())\n",
"\n",
"# Work on a copy restricted to the label and two numeric features.\n",
"data = df[[\"Leatherinterior\", \"Price\", \"Airbags\"]].copy()\n",
"\n",
"# A fixed random_state makes the same rows land in each subset on every run.\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
"    data,\n",
"    stratify_colname=\"Leatherinterior\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train[\"Leatherinterior\"].value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_val.shape)\n",
"print(df_val[\"Leatherinterior\"].value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_test.shape)\n",
"print(df_test[\"Leatherinterior\"].value_counts())"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"Выборка с избытком (oversampling)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Выборка с недостатком (undersampling)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"Библиотека imbalanced-learn\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"https://imbalanced-learn.org/stable/"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 8,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Обучающая выборка: (11542, 3)\n",
|
|
|
|
|
"Leatherinterior\n",
|
|
|
|
|
"1 8372\n",
|
|
|
|
|
"0 3170\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"Name: count, dtype: int64\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"Обучающая выборка после oversampling: (16585, 3)\n",
|
|
|
|
|
"Leatherinterior\n",
|
|
|
|
|
"1 8372\n",
|
|
|
|
|
"0 8213\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"Name: count, dtype: int64\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>Leatherinterior</th>\n",
|
|
|
|
|
" <th>Price</th>\n",
|
|
|
|
|
" <th>Airbags</th>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>12231</td>\n",
|
|
|
|
|
" <td>8</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>18817</td>\n",
|
|
|
|
|
" <td>10</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>15053</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>470</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>19914</td>\n",
|
|
|
|
|
" <td>6</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>16580</th>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>0</td>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>13015</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>16581</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>8799</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>16582</th>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>0</td>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>2057</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>16583</th>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>0</td>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <td>2000</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" <th>16584</th>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1910</td>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"<p>16585 rows × 3 columns</p>\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2024-10-22 18:54:39 +04:00
|
|
|
|
" Leatherinterior Price Airbags\n",
|
|
|
|
|
"0 1 12231 8\n",
|
|
|
|
|
"1 1 18817 10\n",
|
|
|
|
|
"2 1 15053 4\n",
|
|
|
|
|
"3 1 470 0\n",
|
|
|
|
|
"4 1 19914 6\n",
|
|
|
|
|
"... ... ... ...\n",
|
|
|
|
|
"16580 0 13015 4\n",
|
|
|
|
|
"16581 0 8799 2\n",
|
|
|
|
|
"16582 0 2057 2\n",
|
|
|
|
|
"16583 0 2000 2\n",
|
|
|
|
|
"16584 0 1910 1\n",
|
2024-09-21 09:46:46 +04:00
|
|
|
|
"\n",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"[16585 rows x 3 columns]"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 8,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# Seeded so the synthetic samples are the same on every run.\n",
"ada = ADASYN(random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train[\"Leatherinterior\"].value_counts())\n",
"\n",
"# Keep the label out of the feature matrix: the oversampler receives the\n",
"# features as X and the label separately as y, so the label does not take part\n",
"# in the neighbour search.\n",
"X_train = df_train.drop(columns=[\"Leatherinterior\"])\n",
"y_train = df_train[\"Leatherinterior\"]\n",
"X_resampled, y_resampled = ada.fit_resample(X_train, y_train)  # type: ignore\n",
"\n",
"# Re-attach the resampled label by position and restore the column order of\n",
"# the original training frame.\n",
"df_train_adasyn = pd.DataFrame(X_resampled).assign(\n",
"    Leatherinterior=pd.Series(y_resampled).to_numpy()\n",
")[list(df_train.columns)]\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn[\"Leatherinterior\"].value_counts())\n",
"\n",
"df_train_adasyn"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": ".venv",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
2024-10-22 18:54:39 +04:00
|
|
|
|
"version": "3.12.4"
|
2024-09-21 09:46:46 +04:00
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
}
|