1317 lines
48 KiB
Plaintext
1317 lines
48 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Загрузка набора данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 69,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Price</th>\n",
|
|||
|
" <th>Levy</th>\n",
|
|||
|
" <th>Manufacturer</th>\n",
|
|||
|
" <th>Model</th>\n",
|
|||
|
" <th>Prod. year</th>\n",
|
|||
|
" <th>Category</th>\n",
|
|||
|
" <th>Leather interior</th>\n",
|
|||
|
" <th>Fuel type</th>\n",
|
|||
|
" <th>Engine volume</th>\n",
|
|||
|
" <th>Mileage</th>\n",
|
|||
|
" <th>Cylinders</th>\n",
|
|||
|
" <th>Gear box type</th>\n",
|
|||
|
" <th>Drive wheels</th>\n",
|
|||
|
" <th>Doors</th>\n",
|
|||
|
" <th>Wheel</th>\n",
|
|||
|
" <th>Color</th>\n",
|
|||
|
" <th>Airbags</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>13328</td>\n",
|
|||
|
" <td>1399</td>\n",
|
|||
|
" <td>LEXUS</td>\n",
|
|||
|
" <td>RX 450</td>\n",
|
|||
|
" <td>2010</td>\n",
|
|||
|
" <td>Jeep</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Hybrid</td>\n",
|
|||
|
" <td>3.5</td>\n",
|
|||
|
" <td>186005</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>Automatic</td>\n",
|
|||
|
" <td>4x4</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>Silver</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>16621</td>\n",
|
|||
|
" <td>1018</td>\n",
|
|||
|
" <td>CHEVROLET</td>\n",
|
|||
|
" <td>Equinox</td>\n",
|
|||
|
" <td>2011</td>\n",
|
|||
|
" <td>Jeep</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Petrol</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>192000</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>Tiptronic</td>\n",
|
|||
|
" <td>4x4</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>Black</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>8467</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>HONDA</td>\n",
|
|||
|
" <td>FIT</td>\n",
|
|||
|
" <td>2006</td>\n",
|
|||
|
" <td>Hatchback</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Petrol</td>\n",
|
|||
|
" <td>1.3</td>\n",
|
|||
|
" <td>200000</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Variator</td>\n",
|
|||
|
" <td>Front</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Right-hand drive</td>\n",
|
|||
|
" <td>Black</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3607</td>\n",
|
|||
|
" <td>862</td>\n",
|
|||
|
" <td>FORD</td>\n",
|
|||
|
" <td>Escape</td>\n",
|
|||
|
" <td>2011</td>\n",
|
|||
|
" <td>Jeep</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Hybrid</td>\n",
|
|||
|
" <td>2.5</td>\n",
|
|||
|
" <td>168966</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Automatic</td>\n",
|
|||
|
" <td>4x4</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>11726</td>\n",
|
|||
|
" <td>446</td>\n",
|
|||
|
" <td>HONDA</td>\n",
|
|||
|
" <td>FIT</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" <td>Hatchback</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Petrol</td>\n",
|
|||
|
" <td>1.3</td>\n",
|
|||
|
" <td>91901</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Automatic</td>\n",
|
|||
|
" <td>Front</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>Silver</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12592</th>\n",
|
|||
|
" <td>8781</td>\n",
|
|||
|
" <td>1107</td>\n",
|
|||
|
" <td>OPEL</td>\n",
|
|||
|
" <td>Combo</td>\n",
|
|||
|
" <td>2007</td>\n",
|
|||
|
" <td>Goods wagon</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Diesel</td>\n",
|
|||
|
" <td>1.7</td>\n",
|
|||
|
" <td>236000</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Manual</td>\n",
|
|||
|
" <td>Front</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>Beige</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12593</th>\n",
|
|||
|
" <td>7840</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>NISSAN</td>\n",
|
|||
|
" <td>Skyline</td>\n",
|
|||
|
" <td>2003</td>\n",
|
|||
|
" <td>Sedan</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Petrol</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>220000</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>Tiptronic</td>\n",
|
|||
|
" <td>Rear</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Right-hand drive</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12594</th>\n",
|
|||
|
" <td>8467</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>MERCEDES-BENZ</td>\n",
|
|||
|
" <td>CLK 200</td>\n",
|
|||
|
" <td>1999</td>\n",
|
|||
|
" <td>Coupe</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>CNG</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>300000</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Manual</td>\n",
|
|||
|
" <td>Rear</td>\n",
|
|||
|
" <td>Двухдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>Silver</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12595</th>\n",
|
|||
|
" <td>15681</td>\n",
|
|||
|
" <td>831</td>\n",
|
|||
|
" <td>HYUNDAI</td>\n",
|
|||
|
" <td>Sonata</td>\n",
|
|||
|
" <td>2011</td>\n",
|
|||
|
" <td>Sedan</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Petrol</td>\n",
|
|||
|
" <td>2.4</td>\n",
|
|||
|
" <td>161600</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Tiptronic</td>\n",
|
|||
|
" <td>Front</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>Red</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12596</th>\n",
|
|||
|
" <td>26108</td>\n",
|
|||
|
" <td>836</td>\n",
|
|||
|
" <td>HYUNDAI</td>\n",
|
|||
|
" <td>Tucson</td>\n",
|
|||
|
" <td>2010</td>\n",
|
|||
|
" <td>Jeep</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Diesel</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>116365</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Automatic</td>\n",
|
|||
|
" <td>Front</td>\n",
|
|||
|
" <td>Четырехдверный</td>\n",
|
|||
|
" <td>Left wheel</td>\n",
|
|||
|
" <td>Grey</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>12597 rows × 17 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Price Levy Manufacturer Model Prod. year Category \\\n",
|
|||
|
"0 13328 1399 LEXUS RX 450 2010 Jeep \n",
|
|||
|
"1 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
|
|||
|
"2 8467 0 HONDA FIT 2006 Hatchback \n",
|
|||
|
"3 3607 862 FORD Escape 2011 Jeep \n",
|
|||
|
"4 11726 446 HONDA FIT 2014 Hatchback \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"12592 8781 1107 OPEL Combo 2007 Goods wagon \n",
|
|||
|
"12593 7840 0 NISSAN Skyline 2003 Sedan \n",
|
|||
|
"12594 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe \n",
|
|||
|
"12595 15681 831 HYUNDAI Sonata 2011 Sedan \n",
|
|||
|
"12596 26108 836 HYUNDAI Tucson 2010 Jeep \n",
|
|||
|
"\n",
|
|||
|
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
|
|||
|
"0 Yes Hybrid 3.5 186005 6 \n",
|
|||
|
"1 No Petrol 3.0 192000 6 \n",
|
|||
|
"2 No Petrol 1.3 200000 4 \n",
|
|||
|
"3 Yes Hybrid 2.5 168966 4 \n",
|
|||
|
"4 Yes Petrol 1.3 91901 4 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"12592 No Diesel 1.7 236000 4 \n",
|
|||
|
"12593 Yes Petrol 3.0 220000 6 \n",
|
|||
|
"12594 Yes CNG 2.0 300000 4 \n",
|
|||
|
"12595 Yes Petrol 2.4 161600 4 \n",
|
|||
|
"12596 Yes Diesel 2.0 116365 4 \n",
|
|||
|
"\n",
|
|||
|
" Gear box type Drive wheels Doors Wheel Color \\\n",
|
|||
|
"0 Automatic 4x4 Четырехдверный Left wheel Silver \n",
|
|||
|
"1 Tiptronic 4x4 Четырехдверный Left wheel Black \n",
|
|||
|
"2 Variator Front Четырехдверный Right-hand drive Black \n",
|
|||
|
"3 Automatic 4x4 Четырехдверный Left wheel White \n",
|
|||
|
"4 Automatic Front Четырехдверный Left wheel Silver \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"12592 Manual Front Четырехдверный Left wheel Beige \n",
|
|||
|
"12593 Tiptronic Rear Четырехдверный Right-hand drive White \n",
|
|||
|
"12594 Manual Rear Двухдверный Left wheel Silver \n",
|
|||
|
"12595 Tiptronic Front Четырехдверный Left wheel Red \n",
|
|||
|
"12596 Automatic Front Четырехдверный Left wheel Grey \n",
|
|||
|
"\n",
|
|||
|
" Airbags \n",
|
|||
|
"0 12 \n",
|
|||
|
"1 8 \n",
|
|||
|
"2 2 \n",
|
|||
|
"3 0 \n",
|
|||
|
"4 4 \n",
|
|||
|
"... ... \n",
|
|||
|
"12592 4 \n",
|
|||
|
"12593 0 \n",
|
|||
|
"12594 5 \n",
|
|||
|
"12595 8 \n",
|
|||
|
"12596 4 \n",
|
|||
|
"\n",
|
|||
|
"[12597 rows x 17 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 69,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import math\n",
|
|||
|
"\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.impute import SimpleImputer\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|||
|
"from sklearn.preprocessing import PolynomialFeatures\n",
|
|||
|
"from sklearn import metrics\n",
|
|||
|
"from sklearn import linear_model, tree, neighbors, ensemble\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn import set_config\n",
|
|||
|
"from sklearn.pipeline import make_pipeline\n",
|
|||
|
"\n",
|
|||
|
"from transformers import CarsFeatures\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
"# Make scikit-learn transformers return pandas DataFrames.\n",
"set_config(transform_output=\"pandas\")\n",
"\n",
"# Seed passed to the stochastic estimators defined further down.\n",
"random_state = 9\n",
"\n",
"# Read the dataset and discard the \"Unnamed: 0\" column in one chained step.\n",
"df = pd.read_csv(\"../data/car-price-prediction.csv\").drop(columns=[\"Unnamed: 0\"])\n",
"\n",
"df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 70,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"Price int64\n",
|
|||
|
"Levy int64\n",
|
|||
|
"Manufacturer object\n",
|
|||
|
"Model object\n",
|
|||
|
"Prod. year int64\n",
|
|||
|
"Category object\n",
|
|||
|
"Leather interior object\n",
|
|||
|
"Fuel type object\n",
|
|||
|
"Engine volume float64\n",
|
|||
|
"Mileage int64\n",
|
|||
|
"Cylinders int64\n",
|
|||
|
"Gear box type object\n",
|
|||
|
"Drive wheels object\n",
|
|||
|
"Doors object\n",
|
|||
|
"Wheel object\n",
|
|||
|
"Color object\n",
|
|||
|
"Airbags int64\n",
|
|||
|
"dtype: object"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 70,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.dtypes"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Разбиение на выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 71,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размеры выборок:\n",
|
|||
|
"Обучающая выборка: 8817 записей\n",
|
|||
|
"Category\n",
|
|||
|
"Sedan 3954\n",
|
|||
|
"Jeep 2263\n",
|
|||
|
"Hatchback 1554\n",
|
|||
|
"Minivan 312\n",
|
|||
|
"Coupe 251\n",
|
|||
|
"Universal 180\n",
|
|||
|
"Microbus 143\n",
|
|||
|
"Goods wagon 120\n",
|
|||
|
"Pickup 22\n",
|
|||
|
"Cabriolet 16\n",
|
|||
|
"Limousine 2\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Тестовая выборка: 3780 записей\n",
|
|||
|
"Category\n",
|
|||
|
"Sedan 1692\n",
|
|||
|
"Jeep 990\n",
|
|||
|
"Hatchback 636\n",
|
|||
|
"Minivan 151\n",
|
|||
|
"Coupe 117\n",
|
|||
|
"Universal 82\n",
|
|||
|
"Goods wagon 52\n",
|
|||
|
"Microbus 46\n",
|
|||
|
"Pickup 8\n",
|
|||
|
"Cabriolet 5\n",
|
|||
|
"Limousine 1\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# The feature frame keeps every column (targets included); later cells slice\n",
"# \"Price\" and \"Category\" back out of the split parts.\n",
"X, y = df, df[\"Category\"]\n",
"\n",
"# 70/30 hold-out split with a fixed seed so the partition is reproducible.\n",
"train_df, test_df, y_train, y_test = train_test_split(\n",
"    X, y, random_state=42, test_size=0.3\n",
")\n",
"\n",
"# Report the size and class balance of each part.\n",
"print(\"Размеры выборок:\")\n",
"print(f\"Обучающая выборка: {train_df.shape[0]} записей\")\n",
"print(train_df[\"Category\"].value_counts())\n",
"print(f\"Тестовая выборка: {test_df.shape[0]} записей\")\n",
"print(test_df[\"Category\"].value_counts())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Oversampling"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 72,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размеры выборок:\n",
|
|||
|
"Обучающая выборка: 43494 записей\n",
|
|||
|
"Category\n",
|
|||
|
"Sedan 3954\n",
|
|||
|
"Jeep 3954\n",
|
|||
|
"Universal 3954\n",
|
|||
|
"Hatchback 3954\n",
|
|||
|
"Coupe 3954\n",
|
|||
|
"Goods wagon 3954\n",
|
|||
|
"Minivan 3954\n",
|
|||
|
"Microbus 3954\n",
|
|||
|
"Pickup 3954\n",
|
|||
|
"Limousine 3954\n",
|
|||
|
"Cabriolet 3954\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Тестовая выборка: 18612 записей\n",
|
|||
|
"Category\n",
|
|||
|
"Hatchback 1692\n",
|
|||
|
"Sedan 1692\n",
|
|||
|
"Universal 1692\n",
|
|||
|
"Jeep 1692\n",
|
|||
|
"Coupe 1692\n",
|
|||
|
"Minivan 1692\n",
|
|||
|
"Goods wagon 1692\n",
|
|||
|
"Microbus 1692\n",
|
|||
|
"Pickup 1692\n",
|
|||
|
"Cabriolet 1692\n",
|
|||
|
"Limousine 1692\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"def oversample(df, target=\"Category\", seed=42):\n",
"    \"\"\"Balance the classes of ``df`` by randomly duplicating minority rows.\n",
"\n",
"    Args:\n",
"        df: DataFrame that contains the ``target`` column.\n",
"        target: Name of the class-label column to balance on.\n",
"        seed: Seed for ``RandomOverSampler`` so the resampling is repeatable.\n",
"\n",
"    Returns:\n",
"        A new DataFrame in which every class of ``target`` has as many rows\n",
"        as the largest class; the ``target`` column is placed last.\n",
"    \"\"\"\n",
"    X = df.drop(columns=[target])\n",
"    y = df[target]\n",
"\n",
"    oversampler = RandomOverSampler(random_state=seed)\n",
"    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore\n",
"\n",
"    # Re-attach the labels so callers get a single frame back.\n",
"    return pd.concat([X_resampled, y_resampled], axis=1)\n",
"\n",
"\n",
"train_df_overs = oversample(train_df)\n",
"# NOTE(review): the test split is oversampled as well, so metrics computed on\n",
"# test_df_overs will not reflect the real class distribution (a class with a\n",
"# single test row is duplicated up to the majority count) - confirm intended.\n",
"test_df_overs = oversample(test_df)\n",
"\n",
"print(\"Размеры выборок:\")\n",
"print(f\"Обучающая выборка: {train_df_overs.shape[0]} записей\")\n",
"print(train_df_overs.Category.value_counts())\n",
"print(f\"Тестовая выборка: {test_df_overs.shape[0]} записей\")\n",
"print(test_df_overs.Category.value_counts())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 73,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"price_y_train = train_df[\"Price\"]\n",
|
|||
|
"price_y_test = test_df[\"Price\"]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Формирование конвейера для предобработки данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 74,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# Columns removed by the final `drop_columns` step of the pipeline.\n",
"columns_to_drop = [\"Price\", \"Color\", \"Model\", \"Manufacturer\"]\n",
"\n",
"# Route the remaining columns by dtype: object columns go to the categorical\n",
"# branch, everything else to the numeric branch.\n",
"num_columns = [\n",
"    column\n",
"    for column in df.columns\n",
"    if column not in columns_to_drop and df[column].dtype != \"object\"\n",
"]\n",
"cat_columns = [\n",
"    column\n",
"    for column in df.columns\n",
"    if column not in columns_to_drop and df[column].dtype == \"object\"\n",
"]\n",
"\n",
"# Numeric branch: median imputation followed by standardisation.\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(steps=[(\"imputer\", num_imputer), (\"scaler\", num_scaler)])\n",
"\n",
"# Categorical branch: fill gaps with \"unknown\", then one-hot encode.\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(drop=\"first\", handle_unknown=\"ignore\", sparse_output=False)\n",
"preprocessing_cat = Pipeline(steps=[(\"imputer\", cat_imputer), (\"encoder\", cat_encoder)])\n",
"\n",
"# Apply each branch to its columns; untouched columns are passed through.\n",
"features_preprocessing = ColumnTransformer(\n",
"    transformers=[\n",
"        (\"prepocessing_num\", preprocessing_num, num_columns),\n",
"        (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
"    verbose_feature_names_out=False,\n",
")\n",
"\n",
"# Applies the project-local CarsFeatures transformer to \"Prod. year\".\n",
"features_engineering = ColumnTransformer(\n",
"    transformers=[(\"add_features\", CarsFeatures(), [\"Prod. year\"])],\n",
"    remainder=\"passthrough\",\n",
"    verbose_feature_names_out=False,\n",
")\n",
"\n",
"# Drops the columns listed in `columns_to_drop`, keeps the rest as is.\n",
"drop_columns = ColumnTransformer(\n",
"    transformers=[(\"drop_columns\", \"drop\", columns_to_drop)],\n",
"    remainder=\"passthrough\",\n",
"    verbose_feature_names_out=False,\n",
")\n",
"\n",
"# Runs the numeric branch on an \"Age\" column - presumably produced by\n",
"# CarsFeatures; verify against the transformers module.\n",
"features_postprocessing = ColumnTransformer(\n",
"    transformers=[(\"prepocessing_num\", preprocessing_num, [\"Age\"])],\n",
"    remainder=\"passthrough\",\n",
"    verbose_feature_names_out=False,\n",
")\n",
"\n",
"\n",
"# NOTE(review): features_engineering and features_postprocessing are built\n",
"# above but are not part of pipeline_end in this cell.\n",
"pipeline_end = Pipeline(\n",
"    steps=[\n",
"        (\"features_preprocessing\", features_preprocessing),\n",
"        (\"drop_columns\", drop_columns),\n",
"    ]\n",
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Демонстрация работы конвейера для предобработки данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 75,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Levy</th>\n",
|
|||
|
" <th>Prod. year</th>\n",
|
|||
|
" <th>Engine volume</th>\n",
|
|||
|
" <th>Mileage</th>\n",
|
|||
|
" <th>Cylinders</th>\n",
|
|||
|
" <th>Airbags</th>\n",
|
|||
|
" <th>Category_Coupe</th>\n",
|
|||
|
" <th>Category_Goods wagon</th>\n",
|
|||
|
" <th>Category_Hatchback</th>\n",
|
|||
|
" <th>Category_Jeep</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>Fuel type_Petrol</th>\n",
|
|||
|
" <th>Fuel type_Plug-in Hybrid</th>\n",
|
|||
|
" <th>Gear box type_Manual</th>\n",
|
|||
|
" <th>Gear box type_Tiptronic</th>\n",
|
|||
|
" <th>Gear box type_Variator</th>\n",
|
|||
|
" <th>Drive wheels_Front</th>\n",
|
|||
|
" <th>Drive wheels_Rear</th>\n",
|
|||
|
" <th>Doors_Многодверный</th>\n",
|
|||
|
" <th>Doors_Четырехдверный</th>\n",
|
|||
|
" <th>Wheel_Right-hand drive</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10083</th>\n",
|
|||
|
" <td>-1.192982</td>\n",
|
|||
|
" <td>1.946936</td>\n",
|
|||
|
" <td>-0.479341</td>\n",
|
|||
|
" <td>-1.531744</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>-0.683755</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9482</th>\n",
|
|||
|
" <td>-1.192982</td>\n",
|
|||
|
" <td>-0.879266</td>\n",
|
|||
|
" <td>-0.887855</td>\n",
|
|||
|
" <td>-0.130245</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>-1.190217</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6177</th>\n",
|
|||
|
" <td>0.081576</td>\n",
|
|||
|
" <td>0.642535</td>\n",
|
|||
|
" <td>-0.479341</td>\n",
|
|||
|
" <td>-0.651122</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>0.835631</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11756</th>\n",
|
|||
|
" <td>-1.192982</td>\n",
|
|||
|
" <td>-1.531466</td>\n",
|
|||
|
" <td>0.473858</td>\n",
|
|||
|
" <td>1.263152</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>-0.430524</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6557</th>\n",
|
|||
|
" <td>1.703146</td>\n",
|
|||
|
" <td>1.512135</td>\n",
|
|||
|
" <td>0.473858</td>\n",
|
|||
|
" <td>-0.739330</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>-0.683755</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11964</th>\n",
|
|||
|
" <td>-1.192982</td>\n",
|
|||
|
" <td>-0.879266</td>\n",
|
|||
|
" <td>0.337687</td>\n",
|
|||
|
" <td>0.318018</td>\n",
|
|||
|
" <td>1.538421</td>\n",
|
|||
|
" <td>-0.683755</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5191</th>\n",
|
|||
|
" <td>0.476602</td>\n",
|
|||
|
" <td>0.859935</td>\n",
|
|||
|
" <td>-0.206998</td>\n",
|
|||
|
" <td>-0.011145</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>1.342092</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5390</th>\n",
|
|||
|
" <td>0.646834</td>\n",
|
|||
|
" <td>-0.661866</td>\n",
|
|||
|
" <td>-0.887855</td>\n",
|
|||
|
" <td>1.155137</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>0.329169</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>860</th>\n",
|
|||
|
" <td>-1.192982</td>\n",
|
|||
|
" <td>1.077335</td>\n",
|
|||
|
" <td>-0.751684</td>\n",
|
|||
|
" <td>-0.697325</td>\n",
|
|||
|
" <td>-0.403213</td>\n",
|
|||
|
" <td>0.329169</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7270</th>\n",
|
|||
|
" <td>0.446048</td>\n",
|
|||
|
" <td>0.425135</td>\n",
|
|||
|
" <td>0.337687</td>\n",
|
|||
|
" <td>-0.465093</td>\n",
|
|||
|
" <td>7.363324</td>\n",
|
|||
|
" <td>0.329169</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>8817 rows × 31 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Levy Prod. year Engine volume Mileage Cylinders Airbags \\\n",
|
|||
|
"10083 -1.192982 1.946936 -0.479341 -1.531744 -0.403213 -0.683755 \n",
|
|||
|
"9482 -1.192982 -0.879266 -0.887855 -0.130245 -0.403213 -1.190217 \n",
|
|||
|
"6177 0.081576 0.642535 -0.479341 -0.651122 -0.403213 0.835631 \n",
|
|||
|
"11756 -1.192982 -1.531466 0.473858 1.263152 -0.403213 -0.430524 \n",
|
|||
|
"6557 1.703146 1.512135 0.473858 -0.739330 -0.403213 -0.683755 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"11964 -1.192982 -0.879266 0.337687 0.318018 1.538421 -0.683755 \n",
|
|||
|
"5191 0.476602 0.859935 -0.206998 -0.011145 -0.403213 1.342092 \n",
|
|||
|
"5390 0.646834 -0.661866 -0.887855 1.155137 -0.403213 0.329169 \n",
|
|||
|
"860 -1.192982 1.077335 -0.751684 -0.697325 -0.403213 0.329169 \n",
|
|||
|
"7270 0.446048 0.425135 0.337687 -0.465093 7.363324 0.329169 \n",
|
|||
|
"\n",
|
|||
|
" Category_Coupe Category_Goods wagon Category_Hatchback \\\n",
|
|||
|
"10083 0.0 0.0 0.0 \n",
|
|||
|
"9482 0.0 0.0 0.0 \n",
|
|||
|
"6177 0.0 0.0 0.0 \n",
|
|||
|
"11756 0.0 0.0 0.0 \n",
|
|||
|
"6557 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"11964 0.0 0.0 0.0 \n",
|
|||
|
"5191 0.0 0.0 0.0 \n",
|
|||
|
"5390 0.0 0.0 1.0 \n",
|
|||
|
"860 0.0 0.0 0.0 \n",
|
|||
|
"7270 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Category_Jeep ... Fuel type_Petrol Fuel type_Plug-in Hybrid \\\n",
|
|||
|
"10083 0.0 ... 1.0 0.0 \n",
|
|||
|
"9482 0.0 ... 1.0 0.0 \n",
|
|||
|
"6177 0.0 ... 1.0 0.0 \n",
|
|||
|
"11756 1.0 ... 0.0 0.0 \n",
|
|||
|
"6557 0.0 ... 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"11964 0.0 ... 1.0 0.0 \n",
|
|||
|
"5191 0.0 ... 1.0 0.0 \n",
|
|||
|
"5390 0.0 ... 0.0 0.0 \n",
|
|||
|
"860 0.0 ... 1.0 0.0 \n",
|
|||
|
"7270 0.0 ... 1.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator \\\n",
|
|||
|
"10083 0.0 0.0 0.0 \n",
|
|||
|
"9482 0.0 1.0 0.0 \n",
|
|||
|
"6177 0.0 1.0 0.0 \n",
|
|||
|
"11756 0.0 0.0 0.0 \n",
|
|||
|
"6557 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"11964 1.0 0.0 0.0 \n",
|
|||
|
"5191 0.0 0.0 0.0 \n",
|
|||
|
"5390 0.0 0.0 1.0 \n",
|
|||
|
"860 0.0 0.0 0.0 \n",
|
|||
|
"7270 0.0 1.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Drive wheels_Front Drive wheels_Rear Doors_Многодверный \\\n",
|
|||
|
"10083 1.0 0.0 0.0 \n",
|
|||
|
"9482 1.0 0.0 0.0 \n",
|
|||
|
"6177 1.0 0.0 0.0 \n",
|
|||
|
"11756 0.0 0.0 0.0 \n",
|
|||
|
"6557 1.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"11964 0.0 0.0 0.0 \n",
|
|||
|
"5191 1.0 0.0 0.0 \n",
|
|||
|
"5390 1.0 0.0 0.0 \n",
|
|||
|
"860 1.0 0.0 0.0 \n",
|
|||
|
"7270 1.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" Doors_Четырехдверный Wheel_Right-hand drive \n",
|
|||
|
"10083 1.0 0.0 \n",
|
|||
|
"9482 1.0 1.0 \n",
|
|||
|
"6177 1.0 0.0 \n",
|
|||
|
"11756 1.0 0.0 \n",
|
|||
|
"6557 1.0 0.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"11964 1.0 0.0 \n",
|
|||
|
"5191 1.0 0.0 \n",
|
|||
|
"5390 1.0 0.0 \n",
|
|||
|
"860 1.0 0.0 \n",
|
|||
|
"7270 1.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[8817 rows x 31 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 75,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Fit the preprocessing pipeline on the training split and show the result.\n",
"preprocessing_result = pipeline_end.fit_transform(train_df)\n",
"preprocessed_df = pd.DataFrame(\n",
"    data=preprocessing_result, columns=pipeline_end.get_feature_names_out()\n",
")\n",
"\n",
"preprocessed_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Формирование набора моделей"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 76,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"models = {\n",
|
|||
|
" \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n",
|
|||
|
" \"linear_poly\": {\n",
|
|||
|
" \"model\": make_pipeline(\n",
|
|||
|
" PolynomialFeatures(degree=2),\n",
|
|||
|
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
|
|||
|
" )\n",
|
|||
|
" },\n",
|
|||
|
" \"linear_interact\": {\n",
|
|||
|
" \"model\": make_pipeline(\n",
|
|||
|
" PolynomialFeatures(interaction_only=True),\n",
|
|||
|
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
|
|||
|
" )\n",
|
|||
|
" },\n",
|
|||
|
" \"ridge\": {\"model\": linear_model.RidgeCV()},\n",
|
|||
|
" \"decision_tree\": {\n",
|
|||
|
" \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)\n",
|
|||
|
" },\n",
|
|||
|
" \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n",
|
|||
|
" \"random_forest\": {\n",
|
|||
|
" \"model\": ensemble.RandomForestRegressor(\n",
|
|||
|
" max_depth=7, random_state=random_state, n_jobs=-1\n",
|
|||
|
" )\n",
|
|||
|
" },\n",
|
|||
|
"}"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Обучение и оценка моделей с помощью различных алгоритмов"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 77,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: linear\n",
|
|||
|
"Model: linear_poly\n",
|
|||
|
"Model: linear_interact\n",
|
|||
|
"Model: ridge\n",
|
|||
|
"Model: decision_tree\n",
|
|||
|
"Model: knn\n",
|
|||
|
"Model: random_forest\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"for model_name in models.keys():\n",
|
|||
|
" print(f\"Model: {model_name}\")\n",
|
|||
|
"\n",
|
|||
|
" model = models[model_name][\"model\"]\n",
|
|||
|
"\n",
|
|||
|
" model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
|
|||
|
" model_pipeline = model_pipeline.fit(train_df, price_y_train.values.ravel())\n",
|
|||
|
"\n",
|
|||
|
" y_train_pred = model_pipeline.predict(train_df)\n",
|
|||
|
" y_test_pred = model_pipeline.predict(test_df)\n",
|
|||
|
"\n",
|
|||
|
" models[model_name][\"fitted\"] = model_pipeline\n",
|
|||
|
" models[model_name][\"train_preds\"] = y_train_pred\n",
|
|||
|
" models[model_name][\"preds\"] = y_test_pred\n",
|
|||
|
" models[model_name][\"RMSE_train\"] = math.sqrt(\n",
|
|||
|
" metrics.mean_squared_error(price_y_train, y_train_pred)\n",
|
|||
|
" )\n",
|
|||
|
" models[model_name][\"RMSE_test\"] = math.sqrt(\n",
|
|||
|
" metrics.mean_squared_error(price_y_test, y_test_pred)\n",
|
|||
|
" )\n",
|
|||
|
" models[model_name][\"RMAE_test\"] = math.sqrt(\n",
|
|||
|
" metrics.mean_absolute_error(price_y_test, y_test_pred)\n",
|
|||
|
" )\n",
|
|||
|
" models[model_name][\"R2_test\"] = metrics.r2_score(price_y_test, y_test_pred)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Вывод результатов оценки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 78,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_96325_row0_col0, #T_96325_row0_col1, #T_96325_row1_col1, #T_96325_row2_col1, #T_96325_row3_col1, #T_96325_row4_col1 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row0_col2, #T_96325_row1_col2, #T_96325_row2_col2, #T_96325_row3_col2, #T_96325_row4_col2, #T_96325_row6_col3 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row0_col3, #T_96325_row1_col3, #T_96325_row2_col3, #T_96325_row3_col3, #T_96325_row4_col3, #T_96325_row5_col3, #T_96325_row6_col2 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row1_col0 {\n",
|
|||
|
" background-color: #1fa088;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row2_col0, #T_96325_row6_col0 {\n",
|
|||
|
" background-color: #2db27d;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row3_col0, #T_96325_row4_col0, #T_96325_row6_col1 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row5_col0 {\n",
|
|||
|
" background-color: #26ad81;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row5_col1 {\n",
|
|||
|
" background-color: #25838e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_96325_row5_col2 {\n",
|
|||
|
" background-color: #6c00a8;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_96325\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_96325_level0_col0\" class=\"col_heading level0 col0\" >RMSE_train</th>\n",
|
|||
|
" <th id=\"T_96325_level0_col1\" class=\"col_heading level0 col1\" >RMSE_test</th>\n",
|
|||
|
" <th id=\"T_96325_level0_col2\" class=\"col_heading level0 col2\" >RMAE_test</th>\n",
|
|||
|
" <th id=\"T_96325_level0_col3\" class=\"col_heading level0 col3\" >R2_test</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_96325_level0_row0\" class=\"row_heading level0 row0\" >knn</th>\n",
|
|||
|
" <td id=\"T_96325_row0_col0\" class=\"data row0 col0\" >5777.663053</td>\n",
|
|||
|
" <td id=\"T_96325_row0_col1\" class=\"data row0 col1\" >6607.095563</td>\n",
|
|||
|
" <td id=\"T_96325_row0_col2\" class=\"data row0 col2\" >67.047388</td>\n",
|
|||
|
" <td id=\"T_96325_row0_col3\" class=\"data row0 col3\" >0.632511</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_96325_level0_row1\" class=\"row_heading level0 row1\" >random_forest</th>\n",
|
|||
|
" <td id=\"T_96325_row1_col0\" class=\"data row1 col0\" >6567.406346</td>\n",
|
|||
|
" <td id=\"T_96325_row1_col1\" class=\"data row1 col1\" >6852.474190</td>\n",
|
|||
|
" <td id=\"T_96325_row1_col2\" class=\"data row1 col2\" >70.119860</td>\n",
|
|||
|
" <td id=\"T_96325_row1_col3\" class=\"data row1 col3\" >0.604708</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_96325_level0_row2\" class=\"row_heading level0 row2\" >decision_tree</th>\n",
|
|||
|
" <td id=\"T_96325_row2_col0\" class=\"data row2 col0\" >7022.066577</td>\n",
|
|||
|
" <td id=\"T_96325_row2_col1\" class=\"data row2 col1\" >7393.444466</td>\n",
|
|||
|
" <td id=\"T_96325_row2_col2\" class=\"data row2 col2\" >72.910390</td>\n",
|
|||
|
" <td id=\"T_96325_row2_col3\" class=\"data row2 col3\" >0.539832</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_96325_level0_row3\" class=\"row_heading level0 row3\" >linear</th>\n",
|
|||
|
" <td id=\"T_96325_row3_col0\" class=\"data row3 col0\" >8399.752941</td>\n",
|
|||
|
" <td id=\"T_96325_row3_col1\" class=\"data row3 col1\" >8498.166215</td>\n",
|
|||
|
" <td id=\"T_96325_row3_col2\" class=\"data row3 col2\" >80.676781</td>\n",
|
|||
|
" <td id=\"T_96325_row3_col3\" class=\"data row3 col3\" >0.392042</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_96325_level0_row4\" class=\"row_heading level0 row4\" >ridge</th>\n",
|
|||
|
" <td id=\"T_96325_row4_col0\" class=\"data row4 col0\" >8400.004465</td>\n",
|
|||
|
" <td id=\"T_96325_row4_col1\" class=\"data row4 col1\" >8498.452033</td>\n",
|
|||
|
" <td id=\"T_96325_row4_col2\" class=\"data row4 col2\" >80.683952</td>\n",
|
|||
|
" <td id=\"T_96325_row4_col3\" class=\"data row4 col3\" >0.392001</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_96325_level0_row5\" class=\"row_heading level0 row5\" >linear_poly</th>\n",
|
|||
|
" <td id=\"T_96325_row5_col0\" class=\"data row5 col0\" >6880.451269</td>\n",
|
|||
|
" <td id=\"T_96325_row5_col1\" class=\"data row5 col1\" >3235067552304.437988</td>\n",
|
|||
|
" <td id=\"T_96325_row5_col2\" class=\"data row5 col2\" >267308.588962</td>\n",
|
|||
|
" <td id=\"T_96325_row5_col3\" class=\"data row5 col3\" >-88102783177967152.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_96325_level0_row6\" class=\"row_heading level0 row6\" >linear_interact</th>\n",
|
|||
|
" <td id=\"T_96325_row6_col0\" class=\"data row6 col0\" >7037.525048</td>\n",
|
|||
|
" <td id=\"T_96325_row6_col1\" class=\"data row6 col1\" >113842510019087.921875</td>\n",
|
|||
|
" <td id=\"T_96325_row6_col2\" class=\"data row6 col2\" >1576050.007127</td>\n",
|
|||
|
" <td id=\"T_96325_row6_col3\" class=\"data row6 col3\" >-109101782409976135680.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x18c02098860>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 78,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"reg_metrics = pd.DataFrame.from_dict(models, \"index\")[\n",
|
|||
|
" [\"RMSE_train\", \"RMSE_test\", \"RMAE_test\", \"R2_test\"]\n",
|
|||
|
"]\n",
|
|||
|
"reg_metrics.sort_values(by=\"RMSE_test\").style.background_gradient(\n",
|
|||
|
" cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE_train\", \"RMSE_test\"]\n",
|
|||
|
").background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"RMAE_test\", \"R2_test\"])"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.8"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|