{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\", sep=\",\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Price | \n",
" Levy | \n",
" Manufacturer | \n",
" Model | \n",
" Prod. year | \n",
" Category | \n",
" Leather interior | \n",
" Fuel type | \n",
" Engine volume | \n",
" Mileage | \n",
" Cylinders | \n",
" Gear box type | \n",
" Drive wheels | \n",
" Doors | \n",
" Wheel | \n",
" Color | \n",
" Airbags | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 45654403 | \n",
" 13328 | \n",
" 1399 | \n",
" LEXUS | \n",
" RX 450 | \n",
" 2010 | \n",
" Jeep | \n",
" Yes | \n",
" Hybrid | \n",
" 3.5 | \n",
" 186005 km | \n",
" 6.0 | \n",
" Automatic | \n",
" 4x4 | \n",
" 04-May | \n",
" Left wheel | \n",
" Silver | \n",
" 12 | \n",
"
\n",
" \n",
" 1 | \n",
" 44731507 | \n",
" 16621 | \n",
" 1018 | \n",
" CHEVROLET | \n",
" Equinox | \n",
" 2011 | \n",
" Jeep | \n",
" No | \n",
" Petrol | \n",
" 3 | \n",
" 192000 km | \n",
" 6.0 | \n",
" Tiptronic | \n",
" 4x4 | \n",
" 04-May | \n",
" Left wheel | \n",
" Black | \n",
" 8 | \n",
"
\n",
" \n",
" 2 | \n",
" 45774419 | \n",
" 8467 | \n",
" - | \n",
" HONDA | \n",
" FIT | \n",
" 2006 | \n",
" Hatchback | \n",
" No | \n",
" Petrol | \n",
" 1.3 | \n",
" 200000 km | \n",
" 4.0 | \n",
" Variator | \n",
" Front | \n",
" 04-May | \n",
" Right-hand drive | \n",
" Black | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 45769185 | \n",
" 3607 | \n",
" 862 | \n",
" FORD | \n",
" Escape | \n",
" 2011 | \n",
" Jeep | \n",
" Yes | \n",
" Hybrid | \n",
" 2.5 | \n",
" 168966 km | \n",
" 4.0 | \n",
" Automatic | \n",
" 4x4 | \n",
" 04-May | \n",
" Left wheel | \n",
" White | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 45809263 | \n",
" 11726 | \n",
" 446 | \n",
" HONDA | \n",
" FIT | \n",
" 2014 | \n",
" Hatchback | \n",
" Yes | \n",
" Petrol | \n",
" 1.3 | \n",
" 91901 km | \n",
" 4.0 | \n",
" Automatic | \n",
" Front | \n",
" 04-May | \n",
" Left wheel | \n",
" Silver | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Price Levy Manufacturer Model Prod. year Category \\\n",
"0 45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
"1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
"2 45774419 8467 - HONDA FIT 2006 Hatchback \n",
"3 45769185 3607 862 FORD Escape 2011 Jeep \n",
"4 45809263 11726 446 HONDA FIT 2014 Hatchback \n",
"\n",
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"0 Yes Hybrid 3.5 186005 km 6.0 \n",
"1 No Petrol 3 192000 km 6.0 \n",
"2 No Petrol 1.3 200000 km 4.0 \n",
"3 Yes Hybrid 2.5 168966 km 4.0 \n",
"4 Yes Petrol 1.3 91901 km 4.0 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
"2 Variator Front 04-May Right-hand drive Black 2 \n",
"3 Automatic 4x4 04-May Left wheel White 0 \n",
"4 Automatic Front 04-May Left wheel Silver 4 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Бизнес-цели\n",
"- **Задача регрессии**: Построить модель для предсказания цены автомобиля (`Price`) на основе его характеристик.\n",
"- **Задача классификации**: Определить категорию автомобиля (`Category`) по характеристикам."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ввиду того что я первый раз обучаю модель прогнозируемое качество предсказания - не выше 50 %"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ID int64\n",
"Price int64\n",
"Levy object\n",
"Manufacturer object\n",
"Model object\n",
"Prod. year int64\n",
"Category object\n",
"Leather interior object\n",
"Fuel type object\n",
"Engine volume object\n",
"Mileage object\n",
"Cylinders float64\n",
"Gear box type object\n",
"Drive wheels object\n",
"Doors object\n",
"Wheel object\n",
"Color object\n",
"Airbags int64\n",
"dtype: object"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# небольшая обработка данных"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"Q1 = df['Price'].quantile(0.25)\n",
"Q3 = df['Price'].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"df = df[(df['Price'] >= Q1 - 1.5 * IQR) & (df['Price'] <= Q3 + 1.5 * IQR)]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df['Levy']=pd.to_numeric(df['Levy'],errors='coerce')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df['Mileage'] = df['Mileage'].str.replace(' km', '').str.replace(',', '')\n",
"df['Mileage'] = df['Mileage'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"df['Engine volume'] = df['Engine volume'].apply(lambda x: float(re.match(r'\\d+(\\.\\d+)?', x).group()) if isinstance(x, str) else x)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df.drop_duplicates(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Разделим данные на признаки и целевые переменные\n",
"X = df.drop(columns=['Price','ID']) # Признаки\n",
"y = df['Price'] # Целевая переменная для регрессии"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# наполняем пайплайн обработчиками"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"\n",
"# Определение числовых и категориальных столбцов\n",
"numeric_features = ['Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']\n",
"categorical_features = ['Manufacturer', 'Model', 'Category', 'Fuel type', \n",
" 'Gear box type', 'Drive wheels', 'Doors', \n",
" 'Wheel', 'Color']\n",
"\n",
"# Обработка числовых данных\n",
"numeric_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')), # Заполнение пропусков медианой\n",
" ('scaler', StandardScaler()) # Нормализация данных\n",
"])\n",
"\n",
"# Обработка категориальных данных\n",
"categorical_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='most_frequent')), # Заполнение пропусков модой\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore')) # Преобразование в One-Hot Encoding\n",
"])\n",
"\n",
"# Комбинированный трансформер\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, numeric_features), # Применяем числовую обработку\n",
" ('cat', categorical_transformer, categorical_features) # Применяем категориальную обработку\n",
" ]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Transformed feature shape: (17869, 1610)\n"
]
}
],
"source": [
"X_transformed = preprocessor.fit_transform(X)\n",
"\n",
"print(f\"Transformed feature shape: {X_transformed.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# посмотрим результат пайплайна"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Prod. year | \n",
" Engine volume | \n",
" Mileage | \n",
" Cylinders | \n",
" Airbags | \n",
" Manufacturer_ACURA | \n",
" Manufacturer_ALFA ROMEO | \n",
" Manufacturer_AUDI | \n",
" Manufacturer_BMW | \n",
" Manufacturer_BUICK | \n",
" ... | \n",
" Color_Green | \n",
" Color_Grey | \n",
" Color_Orange | \n",
" Color_Pink | \n",
" Color_Purple | \n",
" Color_Red | \n",
" Color_Silver | \n",
" Color_Sky blue | \n",
" Color_White | \n",
" Color_Yellow | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.117071 | \n",
" 1.427610 | \n",
" -0.029000 | \n",
" 1.256101 | \n",
" 1.255022 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.060861 | \n",
" 0.844069 | \n",
" -0.028881 | \n",
" 1.256101 | \n",
" 0.330220 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.828800 | \n",
" -1.139970 | \n",
" -0.028722 | \n",
" -0.470989 | \n",
" -1.056983 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.060861 | \n",
" 0.260528 | \n",
" -0.029340 | \n",
" -0.470989 | \n",
" -1.519384 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.594657 | \n",
" -1.139970 | \n",
" -0.030874 | \n",
" -0.470989 | \n",
" -0.594582 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 17864 | \n",
" -2.074326 | \n",
" -0.323013 | \n",
" -0.026730 | \n",
" -0.470989 | \n",
" -0.363382 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 17865 | \n",
" 0.060861 | \n",
" 0.143820 | \n",
" -0.029486 | \n",
" -0.470989 | \n",
" 0.330220 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 17866 | \n",
" -0.117071 | \n",
" -0.323013 | \n",
" -0.030387 | \n",
" -0.470989 | \n",
" -0.594582 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 17867 | \n",
" -0.650868 | \n",
" -0.323013 | \n",
" -0.031684 | \n",
" -0.470989 | \n",
" -0.594582 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 17868 | \n",
" 0.238793 | \n",
" 0.143820 | \n",
" -0.028982 | \n",
" -0.470989 | \n",
" 1.255022 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
17869 rows × 1610 columns
\n",
"
"
],
"text/plain": [
" Prod. year Engine volume Mileage Cylinders Airbags \\\n",
"0 -0.117071 1.427610 -0.029000 1.256101 1.255022 \n",
"1 0.060861 0.844069 -0.028881 1.256101 0.330220 \n",
"2 -0.828800 -1.139970 -0.028722 -0.470989 -1.056983 \n",
"3 0.060861 0.260528 -0.029340 -0.470989 -1.519384 \n",
"4 0.594657 -1.139970 -0.030874 -0.470989 -0.594582 \n",
"... ... ... ... ... ... \n",
"17864 -2.074326 -0.323013 -0.026730 -0.470989 -0.363382 \n",
"17865 0.060861 0.143820 -0.029486 -0.470989 0.330220 \n",
"17866 -0.117071 -0.323013 -0.030387 -0.470989 -0.594582 \n",
"17867 -0.650868 -0.323013 -0.031684 -0.470989 -0.594582 \n",
"17868 0.238793 0.143820 -0.028982 -0.470989 1.255022 \n",
"\n",
" Manufacturer_ACURA Manufacturer_ALFA ROMEO Manufacturer_AUDI \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"17864 0.0 0.0 0.0 \n",
"17865 0.0 0.0 0.0 \n",
"17866 0.0 0.0 0.0 \n",
"17867 0.0 0.0 0.0 \n",
"17868 0.0 0.0 0.0 \n",
"\n",
" Manufacturer_BMW Manufacturer_BUICK ... Color_Green Color_Grey \\\n",
"0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 ... 0.0 0.0 \n",
"3 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 ... 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"17864 0.0 0.0 ... 0.0 0.0 \n",
"17865 0.0 0.0 ... 0.0 0.0 \n",
"17866 0.0 0.0 ... 0.0 1.0 \n",
"17867 0.0 0.0 ... 0.0 0.0 \n",
"17868 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" Color_Orange Color_Pink Color_Purple Color_Red Color_Silver \\\n",
"0 0.0 0.0 0.0 0.0 1.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 1.0 \n",
"... ... ... ... ... ... \n",
"17864 0.0 0.0 0.0 0.0 1.0 \n",
"17865 0.0 0.0 0.0 1.0 0.0 \n",
"17866 0.0 0.0 0.0 0.0 0.0 \n",
"17867 0.0 0.0 0.0 0.0 0.0 \n",
"17868 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" Color_Sky blue Color_White Color_Yellow \n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 1.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"17864 0.0 0.0 0.0 \n",
"17865 0.0 0.0 0.0 \n",
"17866 0.0 0.0 0.0 \n",
"17867 0.0 0.0 0.0 \n",
"17868 0.0 1.0 0.0 \n",
"\n",
"[17869 rows x 1610 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Получим имена категориальных признаков после OneHotEncoder\n",
"categorical_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)\n",
"\n",
"# Объединим их с именами числовых признаков\n",
"feature_names = list(numeric_features) + list(categorical_feature_names)\n",
"\n",
"# Создадим DataFrame для преобразованных данных\n",
"X_transformed_df = pd.DataFrame(X_transformed.toarray() if hasattr(X_transformed, 'toarray') else X_transformed, columns=feature_names)\n",
"\n",
"# Выведем пример 5 строк\n",
"X_transformed_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# обучим 3 разные модели с применением RandomizedSearchCV(для подора гиперпараметров)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training LinearRegression...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=10. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training RandomForestRegressor...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 9 is smaller than n_iter=10. Running 9 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training GradientBoostingRegressor...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
" _data = np.array(data, dtype=dtype, copy=copy,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Model: LinearRegression\n",
"Best Params: {}\n",
"MAE: 6722.0902357642335\n",
"RMSE: 8991.273616765677\n",
"R2: 0.3722951567248176\n",
"\n",
"Model: RandomForestRegressor\n",
"Best Params: {'model__n_estimators': 200, 'model__max_depth': None}\n",
"MAE: 3568.360497561258\n",
"RMSE: 6055.406570308487\n",
"R2: 0.7152920023310496\n",
"\n",
"Model: GradientBoostingRegressor\n",
"Best Params: {'model__n_estimators': 200, 'model__max_depth': 10, 'model__learning_rate': 0.2}\n",
"MAE: 3933.35109066405\n",
"RMSE: 6171.208466996527\n",
"R2: 0.7042985281049783\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"import matplotlib.pyplot as plt\n",
"\n",
"random_state = 42\n",
"\n",
"# Модели и параметры\n",
"models_regression = {\n",
" \"LinearRegression\": LinearRegression(),\n",
" \"RandomForestRegressor\": RandomForestRegressor(random_state=random_state),\n",
" \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=random_state)\n",
"}\n",
"\n",
"param_grids_regression = {\n",
" \"LinearRegression\": {},\n",
" \"RandomForestRegressor\": {\n",
" 'model__n_estimators': [50, 100, 200],\n",
" 'model__max_depth': [None, 10, 20],\n",
" },\n",
" \"GradientBoostingRegressor\": {\n",
" 'model__n_estimators': [50, 100, 200],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 10]\n",
" }\n",
"}\n",
"\n",
"# Результаты\n",
"results_regression = {}\n",
"\n",
"# Перебор моделей\n",
"for name, model in models_regression.items():\n",
" print(f\"Training {name}...\")\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" param_grid = param_grids_regression[name]\n",
" grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)\n",
" grid_search.fit(X_train, y_train)\n",
"\n",
" # Лучшая модель\n",
" best_model = grid_search.best_estimator_\n",
" y_pred = best_model.predict(X_test)\n",
"\n",
" # Метрики\n",
" mae = mean_absolute_error(y_test, y_pred)\n",
" rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
" r2 = r2_score(y_test, y_pred)\n",
"\n",
" # Сохранение результатов\n",
" results_regression[name] = {\n",
" \"Best Params\": grid_search.best_params_,\n",
" \"MAE\": mae,\n",
" \"RMSE\": rmse,\n",
" \"R2\": r2\n",
" }\n",
"\n",
"# Печать результатов\n",
"for name, metrics in results_regression.items():\n",
" print(f\"\\nModel: {name}\")\n",
" for metric, value in metrics.items():\n",
" print(f\"{metric}: {value}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" MAE | \n",
" RMSE | \n",
" R2 | \n",
"
\n",
" \n",
" \n",
" \n",
" RandomForestRegressor | \n",
" 3568.360498 | \n",
" 6055.406570 | \n",
" 0.715292 | \n",
"
\n",
" \n",
" GradientBoostingRegressor | \n",
" 3933.351091 | \n",
" 6171.208467 | \n",
" 0.704299 | \n",
"
\n",
" \n",
" LinearRegression | \n",
" 6722.090236 | \n",
" 8991.273617 | \n",
" 0.372295 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Импортируем pandas для работы с таблицами\n",
"import pandas as pd\n",
"\n",
"# Формируем таблицу метрик\n",
"reg_metrics = pd.DataFrame.from_dict(results_regression, orient=\"index\")[\n",
" [\"MAE\", \"RMSE\", \"R2\"]\n",
"]\n",
"\n",
"# Визуализация результатов с помощью стилизации\n",
"styled_metrics = (\n",
" reg_metrics.sort_values(by=\"RMSE\")\n",
" .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
")\n",
"\n",
"styled_metrics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# невероятно\n",
"Модель может что-то даже предсказать, с погрешностью в 3к$ конечно и ошибкой 70% но всё же. Я думал и 50% не будет.\n",
"Возможно если сузить сильнее входные данные, потому что выбросы очень большие, результат будет лучше. Линейная регрессия кстати вообще не справилась с данными а вот 2 другие ещё более менее"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# приступим к задаче классификации"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training LogisticRegression...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 3 is smaller than n_iter=10. Running 3 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training RandomForestClassifier...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
" _data = np.array(data, dtype=dtype, copy=copy,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training KNN...\n",
"\n",
"Model: LogisticRegression\n",
"Best Params: {'model__C': 10}\n",
"Accuracy: 0.8612199216564074\n",
"F1 Score: 0.9032383925087788\n",
"Confusion_matrix: [[ 763 303]\n",
" [ 193 2315]]\n",
"\n",
"Model: RandomForestClassifier\n",
"Best Params: {'model__n_estimators': 500, 'model__max_features': 'log2', 'model__max_depth': 20, 'model__criterion': 'gini'}\n",
"Accuracy: 0.802182428651371\n",
"F1 Score: 0.874800779174783\n",
"Confusion_matrix: [[ 397 669]\n",
" [ 38 2470]]\n",
"\n",
"Model: KNN\n",
"Best Params: {'model__weights': 'uniform', 'model__n_neighbors': 5}\n",
"Accuracy: 0.8718522663682149\n",
"F1 Score: 0.9082532051282052\n",
"Confusion_matrix: [[ 849 217]\n",
" [ 241 2267]]\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, f1_score\n",
"\n",
"X = df.drop(columns=['Leather interior','ID']) # Признаки\n",
"# Целевая переменная для классификации\n",
"y_class = df['Leather interior'].map({'Yes': 1, 'No': 0}) # Преобразуем в 0/1\n",
"\n",
"# Разделение данных\n",
"X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Модели и параметры\n",
"models_classification = {\n",
" \"LogisticRegression\": LogisticRegression(max_iter=1000),\n",
" \"RandomForestClassifier\": RandomForestClassifier(random_state=42),\n",
" \"KNN\": KNeighborsClassifier()\n",
"}\n",
"\n",
"param_grids_classification = {\n",
" \"LogisticRegression\": {\n",
" 'model__C': [0.1, 1, 10]\n",
" },\n",
" \"RandomForestClassifier\": {\n",
" \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
" \"model__max_features\": [\"sqrt\", \"log2\", 2],\n",
" \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9 ,10, 20],\n",
" \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
" },\n",
" \"KNN\": {\n",
" 'model__n_neighbors': [3, 5, 7, 9, 11],\n",
" 'model__weights': ['uniform', 'distance']\n",
" }\n",
"}\n",
"\n",
"# Результаты\n",
"results_classification = {}\n",
"\n",
"# Перебор моделей\n",
"for name, model in models_classification.items():\n",
" print(f\"Training {name}...\")\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" param_grid = param_grids_classification[name]\n",
" grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)\n",
" grid_search.fit(X_train_clf, y_train_clf)\n",
"\n",
" # Лучшая модель\n",
" best_model = grid_search.best_estimator_\n",
" y_pred = best_model.predict(X_test_clf)\n",
"\n",
" # Метрики\n",
" acc = accuracy_score(y_test_clf, y_pred)\n",
" f1 = f1_score(y_test_clf, y_pred)\n",
"\n",
" # Вычисление матрицы ошибок\n",
" c_matrix = confusion_matrix(y_test_clf, y_pred)\n",
"\n",
" # Сохранение результатов\n",
" results_classification[name] = {\n",
" \"Best Params\": grid_search.best_params_,\n",
" \"Accuracy\": acc,\n",
" \"F1 Score\": f1,\n",
" \"Confusion_matrix\": c_matrix\n",
" }\n",
"\n",
"# Печать результатов\n",
"for name, metrics in results_classification.items():\n",
" print(f\"\\nModel: {name}\")\n",
" for metric, value in metrics.items():\n",
" print(f\"{metric}: {value}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# отресуем красивые квадратики\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"\n",
"\n",
"num_models = len(results_classification)\n",
"num_rows = (num_models // 2) + (num_models % 2) # Количество строк для подграфиков\n",
"_, ax = plt.subplots(num_rows, 2, figsize=(12, 10), sharex=False, sharey=False)\n",
"\n",
"for index, (name, metrics) in enumerate(results_classification.items()):\n",
" c_matrix = metrics[\"Confusion_matrix\"]\n",
" disp = ConfusionMatrixDisplay(\n",
" confusion_matrix=c_matrix, display_labels=[\"No\", \"Yes\"]\n",
" ).plot(ax=ax.flat[index])\n",
" disp.ax_.set_title(name)\n",
"\n",
"# Корректировка расположения графиков\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Accuracy | \n",
" F1 Score | \n",
"
\n",
" \n",
" \n",
" \n",
" KNN | \n",
" 0.871852 | \n",
" 0.908253 | \n",
"
\n",
" \n",
" LogisticRegression | \n",
" 0.861220 | \n",
" 0.903238 | \n",
"
\n",
" \n",
" RandomForestClassifier | \n",
" 0.802182 | \n",
" 0.874801 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Формируем таблицу метрик классификации\n",
"clf_metrics = pd.DataFrame.from_dict(results_classification, orient=\"index\")[[\"Accuracy\", \"F1 Score\"]]\n",
"\n",
"# Визуализация результатов с помощью стилизации\n",
"styled_metrics_clf = (\n",
" clf_metrics.sort_values(by=\"F1 Score\", ascending=False) # Сортировка по F1 Score\n",
" .style.background_gradient(cmap=\"viridis\", low=0, high=1, subset=[\"F1 Score\", \"Accuracy\"]) # Стилизация столбцов\n",
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"Accuracy\"])\n",
")\n",
"\n",
"styled_metrics_clf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В итоге KNN и LogisticRegression выдали точность в 90% что я считаю весьма неплохо. RandomForestClassifier близко, но не так хорошо"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "laba",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}