mai_pi-33_zakharov/notebooks/lab4_pipeline.ipynb

1317 lines
48 KiB
Plaintext
Raw Normal View History

2024-12-12 23:48:52 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Загрузка набора данных"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Prod. year</th>\n",
" <th>Category</th>\n",
" <th>Leather interior</th>\n",
" <th>Fuel type</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Gear box type</th>\n",
" <th>Drive wheels</th>\n",
" <th>Doors</th>\n",
" <th>Wheel</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13328</td>\n",
" <td>1399</td>\n",
" <td>LEXUS</td>\n",
" <td>RX 450</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>3.5</td>\n",
" <td>186005</td>\n",
" <td>6</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16621</td>\n",
" <td>1018</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Equinox</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>3.0</td>\n",
" <td>192000</td>\n",
" <td>6</td>\n",
" <td>Tiptronic</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8467</td>\n",
" <td>0</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2006</td>\n",
" <td>Hatchback</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>200000</td>\n",
" <td>4</td>\n",
" <td>Variator</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Right-hand drive</td>\n",
" <td>Black</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3607</td>\n",
" <td>862</td>\n",
" <td>FORD</td>\n",
" <td>Escape</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>2.5</td>\n",
" <td>168966</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11726</td>\n",
" <td>446</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2014</td>\n",
" <td>Hatchback</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>91901</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12592</th>\n",
" <td>8781</td>\n",
" <td>1107</td>\n",
" <td>OPEL</td>\n",
" <td>Combo</td>\n",
" <td>2007</td>\n",
" <td>Goods wagon</td>\n",
" <td>No</td>\n",
" <td>Diesel</td>\n",
" <td>1.7</td>\n",
" <td>236000</td>\n",
" <td>4</td>\n",
" <td>Manual</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Beige</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12593</th>\n",
" <td>7840</td>\n",
" <td>0</td>\n",
" <td>NISSAN</td>\n",
" <td>Skyline</td>\n",
" <td>2003</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>3.0</td>\n",
" <td>220000</td>\n",
" <td>6</td>\n",
" <td>Tiptronic</td>\n",
" <td>Rear</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Right-hand drive</td>\n",
" <td>White</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12594</th>\n",
" <td>8467</td>\n",
" <td>0</td>\n",
" <td>MERCEDES-BENZ</td>\n",
" <td>CLK 200</td>\n",
" <td>1999</td>\n",
" <td>Coupe</td>\n",
" <td>Yes</td>\n",
" <td>CNG</td>\n",
" <td>2.0</td>\n",
" <td>300000</td>\n",
" <td>4</td>\n",
" <td>Manual</td>\n",
" <td>Rear</td>\n",
" <td>Двухдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12595</th>\n",
" <td>15681</td>\n",
" <td>831</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Sonata</td>\n",
" <td>2011</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>2.4</td>\n",
" <td>161600</td>\n",
" <td>4</td>\n",
" <td>Tiptronic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Red</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12596</th>\n",
" <td>26108</td>\n",
" <td>836</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Tucson</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Diesel</td>\n",
" <td>2.0</td>\n",
" <td>116365</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Grey</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>12597 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" Price Levy Manufacturer Model Prod. year Category \\\n",
"0 13328 1399 LEXUS RX 450 2010 Jeep \n",
"1 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
"2 8467 0 HONDA FIT 2006 Hatchback \n",
"3 3607 862 FORD Escape 2011 Jeep \n",
"4 11726 446 HONDA FIT 2014 Hatchback \n",
"... ... ... ... ... ... ... \n",
"12592 8781 1107 OPEL Combo 2007 Goods wagon \n",
"12593 7840 0 NISSAN Skyline 2003 Sedan \n",
"12594 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe \n",
"12595 15681 831 HYUNDAI Sonata 2011 Sedan \n",
"12596 26108 836 HYUNDAI Tucson 2010 Jeep \n",
"\n",
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"0 Yes Hybrid 3.5 186005 6 \n",
"1 No Petrol 3.0 192000 6 \n",
"2 No Petrol 1.3 200000 4 \n",
"3 Yes Hybrid 2.5 168966 4 \n",
"4 Yes Petrol 1.3 91901 4 \n",
"... ... ... ... ... ... \n",
"12592 No Diesel 1.7 236000 4 \n",
"12593 Yes Petrol 3.0 220000 6 \n",
"12594 Yes CNG 2.0 300000 4 \n",
"12595 Yes Petrol 2.4 161600 4 \n",
"12596 Yes Diesel 2.0 116365 4 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color \\\n",
"0 Automatic 4x4 Четырехдверный Left wheel Silver \n",
"1 Tiptronic 4x4 Четырехдверный Left wheel Black \n",
"2 Variator Front Четырехдверный Right-hand drive Black \n",
"3 Automatic 4x4 Четырехдверный Left wheel White \n",
"4 Automatic Front Четырехдверный Left wheel Silver \n",
"... ... ... ... ... ... \n",
"12592 Manual Front Четырехдверный Left wheel Beige \n",
"12593 Tiptronic Rear Четырехдверный Right-hand drive White \n",
"12594 Manual Rear Двухдверный Left wheel Silver \n",
"12595 Tiptronic Front Четырехдверный Left wheel Red \n",
"12596 Automatic Front Четырехдверный Left wheel Grey \n",
"\n",
" Airbags \n",
"0 12 \n",
"1 8 \n",
"2 2 \n",
"3 0 \n",
"4 4 \n",
"... ... \n",
"12592 4 \n",
"12593 0 \n",
"12594 5 \n",
"12595 8 \n",
"12596 4 \n",
"\n",
"[12597 rows x 17 columns]"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import math\n",
"\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn import metrics\n",
"from sklearn import linear_model, tree, neighbors, ensemble\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import set_config\n",
"from sklearn.pipeline import make_pipeline\n",
"\n",
"from transformers import CarsFeatures\n",
"\n",
"\n",
"# Ask scikit-learn transformers to return pandas DataFrames instead of arrays.\n",
"set_config(transform_output=\"pandas\")\n",
"\n",
"# Seed passed to the tree-based estimators defined later in the notebook.\n",
"random_state = 9\n",
"\n",
"# Read the raw table and discard the \"Unnamed: 0\" column\n",
"# (presumably a row index saved into the CSV - TODO confirm).\n",
"df = pd.read_csv(\"../data/car-price-prediction.csv\").drop(columns=[\"Unnamed: 0\"])\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Price int64\n",
"Levy int64\n",
"Manufacturer object\n",
"Model object\n",
"Prod. year int64\n",
"Category object\n",
"Leather interior object\n",
"Fuel type object\n",
"Engine volume float64\n",
"Mileage int64\n",
"Cylinders int64\n",
"Gear box type object\n",
"Drive wheels object\n",
"Doors object\n",
"Wheel object\n",
"Color object\n",
"Airbags int64\n",
"dtype: object"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column dtypes: \"object\" columns are later routed to the categorical branch.\n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Разбиение на выборки"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размеры выборок:\n",
"Обучающая выборка: 8817 записей\n",
"Category\n",
"Sedan 3954\n",
"Jeep 2263\n",
"Hatchback 1554\n",
"Minivan 312\n",
"Coupe 251\n",
"Universal 180\n",
"Microbus 143\n",
"Goods wagon 120\n",
"Pickup 22\n",
"Cabriolet 16\n",
"Limousine 2\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: 3780 записей\n",
"Category\n",
"Sedan 1692\n",
"Jeep 990\n",
"Hatchback 636\n",
"Minivan 151\n",
"Coupe 117\n",
"Universal 82\n",
"Goods wagon 52\n",
"Microbus 46\n",
"Pickup 8\n",
"Cabriolet 5\n",
"Limousine 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# X is the full frame (no column removed); y is the Category column.\n",
"X, y = df, df[\"Category\"]\n",
"\n",
"# 70/30 split with a fixed seed so the reported sizes are reproducible.\n",
"train_df, test_df, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.3,\n",
"    random_state=42,\n",
")\n",
"\n",
"print(\"Размеры выборок:\")\n",
"print(f\"Обучающая выборка: {len(train_df)} записей\")\n",
"print(train_df[\"Category\"].value_counts())\n",
"print(f\"Тестовая выборка: {len(test_df)} записей\")\n",
"print(test_df[\"Category\"].value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Oversampling"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размеры выборок:\n",
"Обучающая выборка: 43494 записей\n",
"Category\n",
"Sedan 3954\n",
"Jeep 3954\n",
"Universal 3954\n",
"Hatchback 3954\n",
"Coupe 3954\n",
"Goods wagon 3954\n",
"Minivan 3954\n",
"Microbus 3954\n",
"Pickup 3954\n",
"Limousine 3954\n",
"Cabriolet 3954\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: 18612 записей\n",
"Category\n",
"Hatchback 1692\n",
"Sedan 1692\n",
"Universal 1692\n",
"Jeep 1692\n",
"Coupe 1692\n",
"Minivan 1692\n",
"Goods wagon 1692\n",
"Microbus 1692\n",
"Pickup 1692\n",
"Cabriolet 1692\n",
"Limousine 1692\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"def oversample(df):\n",
"    \"\"\"Randomly oversample ``df`` using ``Category`` as the class label.\n",
"\n",
"    The frame is split into the ``Category`` labels and the remaining columns,\n",
"    both are passed to ``RandomOverSampler(random_state=42).fit_resample``, and\n",
"    the two resampled parts are concatenated column-wise, which places\n",
"    ``Category`` last.\n",
"\n",
"    Returns:\n",
"        The resampled DataFrame.\n",
"    \"\"\"\n",
"    features = df.drop(columns=\"Category\")\n",
"    labels = df[\"Category\"]\n",
"\n",
"    balanced_features, balanced_labels = RandomOverSampler(\n",
"        random_state=42\n",
"    ).fit_resample(features, labels)  # type: ignore\n",
"\n",
"    return pd.concat([balanced_features, balanced_labels], axis=1)\n",
"\n",
"\n",
"# NOTE(review): the test split is oversampled here too. A resampled test set no\n",
"# longer reflects the original class distribution, so it should not be used for\n",
"# evaluation. No later cell in this notebook reads either resampled frame.\n",
"train_df_overs = oversample(train_df)\n",
"test_df_overs = oversample(test_df)\n",
"\n",
"print(\"Размеры выборок:\")\n",
"print(f\"Обучающая выборка: {len(train_df_overs)} записей\")\n",
"print(train_df_overs[\"Category\"].value_counts())\n",
"print(f\"Тестовая выборка: {len(test_df_overs)} записей\")\n",
"print(test_df_overs[\"Category\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# Regression targets: the Price column of the (non-resampled) train and test splits.\n",
"price_y_train, price_y_test = train_df[\"Price\"], test_df[\"Price\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Формирование конвейера для решения задачи регрессии"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"columns_to_drop = [\"Price\", \"Color\", \"Model\", \"Manufacturer\"]\n",
"\n",
"# Columns that remain model inputs, in their original order.\n",
"kept_columns = [column for column in df.columns if column not in columns_to_drop]\n",
"\n",
"# Non-object dtypes go to the numeric branch, object dtypes to the categorical one.\n",
"num_columns = [column for column in kept_columns if df[column].dtype != \"object\"]\n",
"cat_columns = [column for column in kept_columns if df[column].dtype == \"object\"]\n",
"\n",
"# Numeric branch: median imputation followed by standardisation.\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
"    steps=[(\"imputer\", num_imputer), (\"scaler\", num_scaler)]\n",
")\n",
"\n",
"# Categorical branch: fill gaps with the literal \"unknown\", then one-hot encode\n",
"# (first level of each feature dropped, unseen levels ignored, dense output).\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(drop=\"first\", handle_unknown=\"ignore\", sparse_output=False)\n",
"preprocessing_cat = Pipeline(\n",
"    steps=[(\"imputer\", cat_imputer), (\"encoder\", cat_encoder)]\n",
")\n",
"\n",
"def _column_stage(*transformers):\n",
"    \"\"\"Build a ColumnTransformer that passes unlisted columns through unchanged\n",
"    and keeps feature names without the transformer-name prefix.\"\"\"\n",
"    return ColumnTransformer(\n",
"        transformers=list(transformers),\n",
"        remainder=\"passthrough\",\n",
"        verbose_feature_names_out=False,\n",
"    )\n",
"\n",
"\n",
"# Scale numeric columns and one-hot encode categorical ones.\n",
"features_preprocessing = _column_stage(\n",
"    (\"prepocessing_num\", preprocessing_num, num_columns),\n",
"    (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
")\n",
"\n",
"# Defined for feature engineering on \"Prod. year\"; not part of pipeline_end below.\n",
"features_engineering = _column_stage(\n",
"    (\"add_features\", CarsFeatures(), [\"Prod. year\"]),\n",
")\n",
"\n",
"# Remove the columns that must not reach the model (including the Price target).\n",
"drop_columns = _column_stage(\n",
"    (\"drop_columns\", \"drop\", columns_to_drop),\n",
")\n",
"\n",
"# Defined for scaling an \"Age\" column; not part of pipeline_end below.\n",
"features_postprocessing = _column_stage(\n",
"    (\"prepocessing_num\", preprocessing_num, [\"Age\"]),\n",
")\n",
"\n",
"\n",
"# Final preprocessing: transform the features, then drop the excluded columns.\n",
"pipeline_end = Pipeline(\n",
"    steps=[\n",
"        (\"features_preprocessing\", features_preprocessing),\n",
"        (\"drop_columns\", drop_columns),\n",
"    ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Демонстрация работы конвейера для предобработки данных"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Levy</th>\n",
" <th>Prod. year</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Airbags</th>\n",
" <th>Category_Coupe</th>\n",
" <th>Category_Goods wagon</th>\n",
" <th>Category_Hatchback</th>\n",
" <th>Category_Jeep</th>\n",
" <th>...</th>\n",
" <th>Fuel type_Petrol</th>\n",
" <th>Fuel type_Plug-in Hybrid</th>\n",
" <th>Gear box type_Manual</th>\n",
" <th>Gear box type_Tiptronic</th>\n",
" <th>Gear box type_Variator</th>\n",
" <th>Drive wheels_Front</th>\n",
" <th>Drive wheels_Rear</th>\n",
" <th>Doors_Многодверный</th>\n",
" <th>Doors_Четырехдверный</th>\n",
" <th>Wheel_Right-hand drive</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10083</th>\n",
" <td>-1.192982</td>\n",
" <td>1.946936</td>\n",
" <td>-0.479341</td>\n",
" <td>-1.531744</td>\n",
" <td>-0.403213</td>\n",
" <td>-0.683755</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9482</th>\n",
" <td>-1.192982</td>\n",
" <td>-0.879266</td>\n",
" <td>-0.887855</td>\n",
" <td>-0.130245</td>\n",
" <td>-0.403213</td>\n",
" <td>-1.190217</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6177</th>\n",
" <td>0.081576</td>\n",
" <td>0.642535</td>\n",
" <td>-0.479341</td>\n",
" <td>-0.651122</td>\n",
" <td>-0.403213</td>\n",
" <td>0.835631</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11756</th>\n",
" <td>-1.192982</td>\n",
" <td>-1.531466</td>\n",
" <td>0.473858</td>\n",
" <td>1.263152</td>\n",
" <td>-0.403213</td>\n",
" <td>-0.430524</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6557</th>\n",
" <td>1.703146</td>\n",
" <td>1.512135</td>\n",
" <td>0.473858</td>\n",
" <td>-0.739330</td>\n",
" <td>-0.403213</td>\n",
" <td>-0.683755</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11964</th>\n",
" <td>-1.192982</td>\n",
" <td>-0.879266</td>\n",
" <td>0.337687</td>\n",
" <td>0.318018</td>\n",
" <td>1.538421</td>\n",
" <td>-0.683755</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5191</th>\n",
" <td>0.476602</td>\n",
" <td>0.859935</td>\n",
" <td>-0.206998</td>\n",
" <td>-0.011145</td>\n",
" <td>-0.403213</td>\n",
" <td>1.342092</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5390</th>\n",
" <td>0.646834</td>\n",
" <td>-0.661866</td>\n",
" <td>-0.887855</td>\n",
" <td>1.155137</td>\n",
" <td>-0.403213</td>\n",
" <td>0.329169</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>860</th>\n",
" <td>-1.192982</td>\n",
" <td>1.077335</td>\n",
" <td>-0.751684</td>\n",
" <td>-0.697325</td>\n",
" <td>-0.403213</td>\n",
" <td>0.329169</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7270</th>\n",
" <td>0.446048</td>\n",
" <td>0.425135</td>\n",
" <td>0.337687</td>\n",
" <td>-0.465093</td>\n",
" <td>7.363324</td>\n",
" <td>0.329169</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8817 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" Levy Prod. year Engine volume Mileage Cylinders Airbags \\\n",
"10083 -1.192982 1.946936 -0.479341 -1.531744 -0.403213 -0.683755 \n",
"9482 -1.192982 -0.879266 -0.887855 -0.130245 -0.403213 -1.190217 \n",
"6177 0.081576 0.642535 -0.479341 -0.651122 -0.403213 0.835631 \n",
"11756 -1.192982 -1.531466 0.473858 1.263152 -0.403213 -0.430524 \n",
"6557 1.703146 1.512135 0.473858 -0.739330 -0.403213 -0.683755 \n",
"... ... ... ... ... ... ... \n",
"11964 -1.192982 -0.879266 0.337687 0.318018 1.538421 -0.683755 \n",
"5191 0.476602 0.859935 -0.206998 -0.011145 -0.403213 1.342092 \n",
"5390 0.646834 -0.661866 -0.887855 1.155137 -0.403213 0.329169 \n",
"860 -1.192982 1.077335 -0.751684 -0.697325 -0.403213 0.329169 \n",
"7270 0.446048 0.425135 0.337687 -0.465093 7.363324 0.329169 \n",
"\n",
" Category_Coupe Category_Goods wagon Category_Hatchback \\\n",
"10083 0.0 0.0 0.0 \n",
"9482 0.0 0.0 0.0 \n",
"6177 0.0 0.0 0.0 \n",
"11756 0.0 0.0 0.0 \n",
"6557 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"11964 0.0 0.0 0.0 \n",
"5191 0.0 0.0 0.0 \n",
"5390 0.0 0.0 1.0 \n",
"860 0.0 0.0 0.0 \n",
"7270 0.0 0.0 0.0 \n",
"\n",
" Category_Jeep ... Fuel type_Petrol Fuel type_Plug-in Hybrid \\\n",
"10083 0.0 ... 1.0 0.0 \n",
"9482 0.0 ... 1.0 0.0 \n",
"6177 0.0 ... 1.0 0.0 \n",
"11756 1.0 ... 0.0 0.0 \n",
"6557 0.0 ... 0.0 0.0 \n",
"... ... ... ... ... \n",
"11964 0.0 ... 1.0 0.0 \n",
"5191 0.0 ... 1.0 0.0 \n",
"5390 0.0 ... 0.0 0.0 \n",
"860 0.0 ... 1.0 0.0 \n",
"7270 0.0 ... 1.0 0.0 \n",
"\n",
" Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator \\\n",
"10083 0.0 0.0 0.0 \n",
"9482 0.0 1.0 0.0 \n",
"6177 0.0 1.0 0.0 \n",
"11756 0.0 0.0 0.0 \n",
"6557 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"11964 1.0 0.0 0.0 \n",
"5191 0.0 0.0 0.0 \n",
"5390 0.0 0.0 1.0 \n",
"860 0.0 0.0 0.0 \n",
"7270 0.0 1.0 0.0 \n",
"\n",
" Drive wheels_Front Drive wheels_Rear Doors_Многодверный \\\n",
"10083 1.0 0.0 0.0 \n",
"9482 1.0 0.0 0.0 \n",
"6177 1.0 0.0 0.0 \n",
"11756 0.0 0.0 0.0 \n",
"6557 1.0 0.0 0.0 \n",
"... ... ... ... \n",
"11964 0.0 0.0 0.0 \n",
"5191 1.0 0.0 0.0 \n",
"5390 1.0 0.0 0.0 \n",
"860 1.0 0.0 0.0 \n",
"7270 1.0 0.0 0.0 \n",
"\n",
" Doors_Четырехдверный Wheel_Right-hand drive \n",
"10083 1.0 0.0 \n",
"9482 1.0 1.0 \n",
"6177 1.0 0.0 \n",
"11756 1.0 0.0 \n",
"6557 1.0 0.0 \n",
"... ... ... \n",
"11964 1.0 0.0 \n",
"5191 1.0 0.0 \n",
"5390 1.0 0.0 \n",
"860 1.0 0.0 \n",
"7270 1.0 0.0 \n",
"\n",
"[8817 rows x 31 columns]"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the preprocessing pipeline on the training split and show the resulting features.\n",
"preprocessing_result = pipeline_end.fit_transform(train_df)\n",
"feature_names = pipeline_end.get_feature_names_out()\n",
"preprocessed_df = pd.DataFrame(preprocessing_result, columns=feature_names)\n",
"\n",
"preprocessed_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Формирование набора моделей"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Candidate regressors keyed by a short name.\n",
"estimators = {\n",
"    \"linear\": linear_model.LinearRegression(n_jobs=-1),\n",
"    \"linear_poly\": make_pipeline(\n",
"        PolynomialFeatures(degree=2),\n",
"        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
"    ),\n",
"    \"linear_interact\": make_pipeline(\n",
"        PolynomialFeatures(interaction_only=True),\n",
"        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
"    ),\n",
"    \"ridge\": linear_model.RidgeCV(),\n",
"    \"decision_tree\": tree.DecisionTreeRegressor(\n",
"        max_depth=7, random_state=random_state\n",
"    ),\n",
"    \"knn\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1),\n",
"    \"random_forest\": ensemble.RandomForestRegressor(\n",
"        max_depth=7, random_state=random_state, n_jobs=-1\n",
"    ),\n",
"}\n",
"\n",
"# Each entry is a dict so later cells can store the fitted pipeline,\n",
"# predictions and metrics next to the estimator.\n",
"models = {name: {\"model\": estimator} for name, estimator in estimators.items()}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Обучение и оценка моделей с помощью различных алгоритмов"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: linear\n",
"Model: linear_poly\n",
"Model: linear_interact\n",
"Model: ridge\n",
"Model: decision_tree\n",
"Model: knn\n",
"Model: random_forest\n"
]
}
],
"source": [
"# Fit every candidate on the training split and record train/test metrics\n",
"# for the Price target next to the estimator in `models`.\n",
"for model_name, entry in models.items():\n",
"    print(f\"Model: {model_name}\")\n",
"\n",
"    model = entry[\"model\"]\n",
"\n",
"    # Preprocessing and estimator are chained so both are fitted on train_df only.\n",
"    model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
"    model_pipeline = model_pipeline.fit(train_df, price_y_train.values.ravel())\n",
"\n",
"    y_train_pred = model_pipeline.predict(train_df)\n",
"    y_test_pred = model_pipeline.predict(test_df)\n",
"\n",
"    # Mean absolute error in price units.\n",
"    mae_test = metrics.mean_absolute_error(price_y_test, y_test_pred)\n",
"\n",
"    entry[\"fitted\"] = model_pipeline\n",
"    entry[\"train_preds\"] = y_train_pred\n",
"    entry[\"preds\"] = y_test_pred\n",
"    entry[\"RMSE_train\"] = math.sqrt(\n",
"        metrics.mean_squared_error(price_y_train, y_train_pred)\n",
"    )\n",
"    entry[\"RMSE_test\"] = math.sqrt(\n",
"        metrics.mean_squared_error(price_y_test, y_test_pred)\n",
"    )\n",
"    entry[\"MAE_test\"] = mae_test\n",
"    # sqrt(MAE) is not a standard metric and is not in price units; the key is\n",
"    # kept only so code that already reads \"RMAE_test\" keeps working.\n",
"    entry[\"RMAE_test\"] = math.sqrt(mae_test)\n",
"    entry[\"R2_test\"] = metrics.r2_score(price_y_test, y_test_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Вывод результатов оценки"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_96325_row0_col0, #T_96325_row0_col1, #T_96325_row1_col1, #T_96325_row2_col1, #T_96325_row3_col1, #T_96325_row4_col1 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_96325_row0_col2, #T_96325_row1_col2, #T_96325_row2_col2, #T_96325_row3_col2, #T_96325_row4_col2, #T_96325_row6_col3 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_96325_row0_col3, #T_96325_row1_col3, #T_96325_row2_col3, #T_96325_row3_col3, #T_96325_row4_col3, #T_96325_row5_col3, #T_96325_row6_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_96325_row1_col0 {\n",
" background-color: #1fa088;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_96325_row2_col0, #T_96325_row6_col0 {\n",
" background-color: #2db27d;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_96325_row3_col0, #T_96325_row4_col0, #T_96325_row6_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_96325_row5_col0 {\n",
" background-color: #26ad81;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_96325_row5_col1 {\n",
" background-color: #25838e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_96325_row5_col2 {\n",
" background-color: #6c00a8;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_96325\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_96325_level0_col0\" class=\"col_heading level0 col0\" >RMSE_train</th>\n",
" <th id=\"T_96325_level0_col1\" class=\"col_heading level0 col1\" >RMSE_test</th>\n",
" <th id=\"T_96325_level0_col2\" class=\"col_heading level0 col2\" >RMAE_test</th>\n",
" <th id=\"T_96325_level0_col3\" class=\"col_heading level0 col3\" >R2_test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_96325_level0_row0\" class=\"row_heading level0 row0\" >knn</th>\n",
" <td id=\"T_96325_row0_col0\" class=\"data row0 col0\" >5777.663053</td>\n",
" <td id=\"T_96325_row0_col1\" class=\"data row0 col1\" >6607.095563</td>\n",
" <td id=\"T_96325_row0_col2\" class=\"data row0 col2\" >67.047388</td>\n",
" <td id=\"T_96325_row0_col3\" class=\"data row0 col3\" >0.632511</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_96325_level0_row1\" class=\"row_heading level0 row1\" >random_forest</th>\n",
" <td id=\"T_96325_row1_col0\" class=\"data row1 col0\" >6567.406346</td>\n",
" <td id=\"T_96325_row1_col1\" class=\"data row1 col1\" >6852.474190</td>\n",
" <td id=\"T_96325_row1_col2\" class=\"data row1 col2\" >70.119860</td>\n",
" <td id=\"T_96325_row1_col3\" class=\"data row1 col3\" >0.604708</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_96325_level0_row2\" class=\"row_heading level0 row2\" >decision_tree</th>\n",
" <td id=\"T_96325_row2_col0\" class=\"data row2 col0\" >7022.066577</td>\n",
" <td id=\"T_96325_row2_col1\" class=\"data row2 col1\" >7393.444466</td>\n",
" <td id=\"T_96325_row2_col2\" class=\"data row2 col2\" >72.910390</td>\n",
" <td id=\"T_96325_row2_col3\" class=\"data row2 col3\" >0.539832</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_96325_level0_row3\" class=\"row_heading level0 row3\" >linear</th>\n",
" <td id=\"T_96325_row3_col0\" class=\"data row3 col0\" >8399.752941</td>\n",
" <td id=\"T_96325_row3_col1\" class=\"data row3 col1\" >8498.166215</td>\n",
" <td id=\"T_96325_row3_col2\" class=\"data row3 col2\" >80.676781</td>\n",
" <td id=\"T_96325_row3_col3\" class=\"data row3 col3\" >0.392042</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_96325_level0_row4\" class=\"row_heading level0 row4\" >ridge</th>\n",
" <td id=\"T_96325_row4_col0\" class=\"data row4 col0\" >8400.004465</td>\n",
" <td id=\"T_96325_row4_col1\" class=\"data row4 col1\" >8498.452033</td>\n",
" <td id=\"T_96325_row4_col2\" class=\"data row4 col2\" >80.683952</td>\n",
" <td id=\"T_96325_row4_col3\" class=\"data row4 col3\" >0.392001</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_96325_level0_row5\" class=\"row_heading level0 row5\" >linear_poly</th>\n",
" <td id=\"T_96325_row5_col0\" class=\"data row5 col0\" >6880.451269</td>\n",
" <td id=\"T_96325_row5_col1\" class=\"data row5 col1\" >3235067552304.437988</td>\n",
" <td id=\"T_96325_row5_col2\" class=\"data row5 col2\" >267308.588962</td>\n",
" <td id=\"T_96325_row5_col3\" class=\"data row5 col3\" >-88102783177967152.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_96325_level0_row6\" class=\"row_heading level0 row6\" >linear_interact</th>\n",
" <td id=\"T_96325_row6_col0\" class=\"data row6 col0\" >7037.525048</td>\n",
" <td id=\"T_96325_row6_col1\" class=\"data row6 col1\" >113842510019087.921875</td>\n",
" <td id=\"T_96325_row6_col2\" class=\"data row6 col2\" >1576050.007127</td>\n",
" <td id=\"T_96325_row6_col3\" class=\"data row6 col3\" >-109101782409976135680.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x18c02098860>"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The stored \"RMAE_test\" value is sqrt(MAE), which is not comparable with the\n",
"# RMSE columns. MAE is recomputed here from the stored test predictions so the\n",
"# table reports the absolute error in price units.\n",
"mae_test = pd.Series(\n",
"    {\n",
"        name: metrics.mean_absolute_error(price_y_test, entry[\"preds\"])\n",
"        for name, entry in models.items()\n",
"    }\n",
")\n",
"\n",
"reg_metrics = (\n",
"    pd.DataFrame.from_dict(models, \"index\")[[\"RMSE_train\", \"RMSE_test\", \"R2_test\"]]\n",
"    .assign(MAE_test=mae_test)  # aligned on the model-name index\n",
"    .loc[:, [\"RMSE_train\", \"RMSE_test\", \"MAE_test\", \"R2_test\"]]\n",
")\n",
"\n",
"# Best model (lowest test RMSE) first; one colour map per group of columns.\n",
"reg_metrics.sort_values(by=\"RMSE_test\").style.background_gradient(\n",
"    cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE_train\", \"RMSE_test\"]\n",
").background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"MAE_test\", \"R2_test\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}