diff --git a/lab_4/laba4.ipynb b/lab_4/laba4.ipynb
new file mode 100644
index 0000000..ba34768
--- /dev/null
+++ b/lab_4/laba4.ipynb
@@ -0,0 +1,1381 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\", sep=\",\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ID | \n",
+ " Price | \n",
+ " Levy | \n",
+ " Manufacturer | \n",
+ " Model | \n",
+ " Prod. year | \n",
+ " Category | \n",
+ " Leather interior | \n",
+ " Fuel type | \n",
+ " Engine volume | \n",
+ " Mileage | \n",
+ " Cylinders | \n",
+ " Gear box type | \n",
+ " Drive wheels | \n",
+ " Doors | \n",
+ " Wheel | \n",
+ " Color | \n",
+ " Airbags | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 45654403 | \n",
+ " 13328 | \n",
+ " 1399 | \n",
+ " LEXUS | \n",
+ " RX 450 | \n",
+ " 2010 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 3.5 | \n",
+ " 186005 km | \n",
+ " 6.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 44731507 | \n",
+ " 16621 | \n",
+ " 1018 | \n",
+ " CHEVROLET | \n",
+ " Equinox | \n",
+ " 2011 | \n",
+ " Jeep | \n",
+ " No | \n",
+ " Petrol | \n",
+ " 3 | \n",
+ " 192000 km | \n",
+ " 6.0 | \n",
+ " Tiptronic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Black | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 45774419 | \n",
+ " 8467 | \n",
+ " - | \n",
+ " HONDA | \n",
+ " FIT | \n",
+ " 2006 | \n",
+ " Hatchback | \n",
+ " No | \n",
+ " Petrol | \n",
+ " 1.3 | \n",
+ " 200000 km | \n",
+ " 4.0 | \n",
+ " Variator | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Right-hand drive | \n",
+ " Black | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 45769185 | \n",
+ " 3607 | \n",
+ " 862 | \n",
+ " FORD | \n",
+ " Escape | \n",
+ " 2011 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 2.5 | \n",
+ " 168966 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " White | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 45809263 | \n",
+ " 11726 | \n",
+ " 446 | \n",
+ " HONDA | \n",
+ " FIT | \n",
+ " 2014 | \n",
+ " Hatchback | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 1.3 | \n",
+ " 91901 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID Price Levy Manufacturer Model Prod. year Category \\\n",
+ "0 45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
+ "1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
+ "2 45774419 8467 - HONDA FIT 2006 Hatchback \n",
+ "3 45769185 3607 862 FORD Escape 2011 Jeep \n",
+ "4 45809263 11726 446 HONDA FIT 2014 Hatchback \n",
+ "\n",
+ " Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
+ "0 Yes Hybrid 3.5 186005 km 6.0 \n",
+ "1 No Petrol 3 192000 km 6.0 \n",
+ "2 No Petrol 1.3 200000 km 4.0 \n",
+ "3 Yes Hybrid 2.5 168966 km 4.0 \n",
+ "4 Yes Petrol 1.3 91901 km 4.0 \n",
+ "\n",
+ " Gear box type Drive wheels Doors Wheel Color Airbags \n",
+ "0 Automatic 4x4 04-May Left wheel Silver 12 \n",
+ "1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
+ "2 Variator Front 04-May Right-hand drive Black 2 \n",
+ "3 Automatic 4x4 04-May Left wheel White 0 \n",
+ "4 Automatic Front 04-May Left wheel Silver 4 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Бизнес-цели\n",
+ "- **Задача регрессии**: Построить модель для предсказания цены автомобиля (`Price`) на основе его характеристик.\n",
+ "- **Задача классификации**: Определить категорию автомобиля (`Category`) по характеристикам."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Ввиду того что я первый раз обучаю модель прогнозируемое качество предсказания - не выше 50 %"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ID int64\n",
+ "Price int64\n",
+ "Levy object\n",
+ "Manufacturer object\n",
+ "Model object\n",
+ "Prod. year int64\n",
+ "Category object\n",
+ "Leather interior object\n",
+ "Fuel type object\n",
+ "Engine volume object\n",
+ "Mileage object\n",
+ "Cylinders float64\n",
+ "Gear box type object\n",
+ "Drive wheels object\n",
+ "Doors object\n",
+ "Wheel object\n",
+ "Color object\n",
+ "Airbags int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# небольшая обработка данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Q1 = df['Price'].quantile(0.25)\n",
+ "Q3 = df['Price'].quantile(0.75)\n",
+ "IQR = Q3 - Q1\n",
+ "df = df[(df['Price'] >= Q1 - 1.5 * IQR) & (df['Price'] <= Q3 + 1.5 * IQR)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['Levy']=pd.to_numeric(df['Levy'],errors='coerce')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['Mileage'] = df['Mileage'].str.replace(' km', '').str.replace(',', '')\n",
+ "df['Mileage'] = df['Mileage'].astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "df['Engine volume'] = df['Engine volume'].apply(lambda x: float(re.match(r'\\d+(\\.\\d+)?', x).group()) if isinstance(x, str) else x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.drop_duplicates(inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Разделим данные на признаки и целевые переменные\n",
+ "X = df.drop(columns=['Price','ID']) # Признаки\n",
+ "y = df['Price'] # Целевая переменная для регрессии"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# наполняем пайплайн обработчиками"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
+ "\n",
+ "# Определение числовых и категориальных столбцов\n",
+ "numeric_features = ['Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']\n",
+ "categorical_features = ['Manufacturer', 'Model', 'Category', 'Fuel type', \n",
+ " 'Gear box type', 'Drive wheels', 'Doors', \n",
+ " 'Wheel', 'Color']\n",
+ "\n",
+ "# Обработка числовых данных\n",
+ "numeric_transformer = Pipeline(steps=[\n",
+ " ('imputer', SimpleImputer(strategy='median')), # Заполнение пропусков медианой\n",
+ " ('scaler', StandardScaler()) # Нормализация данных\n",
+ "])\n",
+ "\n",
+ "# Обработка категориальных данных\n",
+ "categorical_transformer = Pipeline(steps=[\n",
+ " ('imputer', SimpleImputer(strategy='most_frequent')), # Заполнение пропусков модой\n",
+ " ('onehot', OneHotEncoder(handle_unknown='ignore')) # Преобразование в One-Hot Encoding\n",
+ "])\n",
+ "\n",
+ "# Комбинированный трансформер\n",
+ "preprocessor = ColumnTransformer(\n",
+ " transformers=[\n",
+ " ('num', numeric_transformer, numeric_features), # Применяем числовую обработку\n",
+ " ('cat', categorical_transformer, categorical_features) # Применяем категориальную обработку\n",
+ " ]\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Transformed feature shape: (17869, 1610)\n"
+ ]
+ }
+ ],
+ "source": [
+ "X_transformed = preprocessor.fit_transform(X)\n",
+ "\n",
+ "print(f\"Transformed feature shape: {X_transformed.shape}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# посмотрим результат пайплайна"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Prod. year | \n",
+ " Engine volume | \n",
+ " Mileage | \n",
+ " Cylinders | \n",
+ " Airbags | \n",
+ " Manufacturer_ACURA | \n",
+ " Manufacturer_ALFA ROMEO | \n",
+ " Manufacturer_AUDI | \n",
+ " Manufacturer_BMW | \n",
+ " Manufacturer_BUICK | \n",
+ " ... | \n",
+ " Color_Green | \n",
+ " Color_Grey | \n",
+ " Color_Orange | \n",
+ " Color_Pink | \n",
+ " Color_Purple | \n",
+ " Color_Red | \n",
+ " Color_Silver | \n",
+ " Color_Sky blue | \n",
+ " Color_White | \n",
+ " Color_Yellow | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " -0.117071 | \n",
+ " 1.427610 | \n",
+ " -0.029000 | \n",
+ " 1.256101 | \n",
+ " 1.255022 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.060861 | \n",
+ " 0.844069 | \n",
+ " -0.028881 | \n",
+ " 1.256101 | \n",
+ " 0.330220 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " -0.828800 | \n",
+ " -1.139970 | \n",
+ " -0.028722 | \n",
+ " -0.470989 | \n",
+ " -1.056983 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.060861 | \n",
+ " 0.260528 | \n",
+ " -0.029340 | \n",
+ " -0.470989 | \n",
+ " -1.519384 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.594657 | \n",
+ " -1.139970 | \n",
+ " -0.030874 | \n",
+ " -0.470989 | \n",
+ " -0.594582 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 17864 | \n",
+ " -2.074326 | \n",
+ " -0.323013 | \n",
+ " -0.026730 | \n",
+ " -0.470989 | \n",
+ " -0.363382 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 17865 | \n",
+ " 0.060861 | \n",
+ " 0.143820 | \n",
+ " -0.029486 | \n",
+ " -0.470989 | \n",
+ " 0.330220 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 17866 | \n",
+ " -0.117071 | \n",
+ " -0.323013 | \n",
+ " -0.030387 | \n",
+ " -0.470989 | \n",
+ " -0.594582 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 17867 | \n",
+ " -0.650868 | \n",
+ " -0.323013 | \n",
+ " -0.031684 | \n",
+ " -0.470989 | \n",
+ " -0.594582 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 17868 | \n",
+ " 0.238793 | \n",
+ " 0.143820 | \n",
+ " -0.028982 | \n",
+ " -0.470989 | \n",
+ " 1.255022 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
17869 rows × 1610 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Prod. year Engine volume Mileage Cylinders Airbags \\\n",
+ "0 -0.117071 1.427610 -0.029000 1.256101 1.255022 \n",
+ "1 0.060861 0.844069 -0.028881 1.256101 0.330220 \n",
+ "2 -0.828800 -1.139970 -0.028722 -0.470989 -1.056983 \n",
+ "3 0.060861 0.260528 -0.029340 -0.470989 -1.519384 \n",
+ "4 0.594657 -1.139970 -0.030874 -0.470989 -0.594582 \n",
+ "... ... ... ... ... ... \n",
+ "17864 -2.074326 -0.323013 -0.026730 -0.470989 -0.363382 \n",
+ "17865 0.060861 0.143820 -0.029486 -0.470989 0.330220 \n",
+ "17866 -0.117071 -0.323013 -0.030387 -0.470989 -0.594582 \n",
+ "17867 -0.650868 -0.323013 -0.031684 -0.470989 -0.594582 \n",
+ "17868 0.238793 0.143820 -0.028982 -0.470989 1.255022 \n",
+ "\n",
+ " Manufacturer_ACURA Manufacturer_ALFA ROMEO Manufacturer_AUDI \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "17864 0.0 0.0 0.0 \n",
+ "17865 0.0 0.0 0.0 \n",
+ "17866 0.0 0.0 0.0 \n",
+ "17867 0.0 0.0 0.0 \n",
+ "17868 0.0 0.0 0.0 \n",
+ "\n",
+ " Manufacturer_BMW Manufacturer_BUICK ... Color_Green Color_Grey \\\n",
+ "0 0.0 0.0 ... 0.0 0.0 \n",
+ "1 0.0 0.0 ... 0.0 0.0 \n",
+ "2 0.0 0.0 ... 0.0 0.0 \n",
+ "3 0.0 0.0 ... 0.0 0.0 \n",
+ "4 0.0 0.0 ... 0.0 0.0 \n",
+ "... ... ... ... ... ... \n",
+ "17864 0.0 0.0 ... 0.0 0.0 \n",
+ "17865 0.0 0.0 ... 0.0 0.0 \n",
+ "17866 0.0 0.0 ... 0.0 1.0 \n",
+ "17867 0.0 0.0 ... 0.0 0.0 \n",
+ "17868 0.0 0.0 ... 0.0 0.0 \n",
+ "\n",
+ " Color_Orange Color_Pink Color_Purple Color_Red Color_Silver \\\n",
+ "0 0.0 0.0 0.0 0.0 1.0 \n",
+ "1 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 1.0 \n",
+ "... ... ... ... ... ... \n",
+ "17864 0.0 0.0 0.0 0.0 1.0 \n",
+ "17865 0.0 0.0 0.0 1.0 0.0 \n",
+ "17866 0.0 0.0 0.0 0.0 0.0 \n",
+ "17867 0.0 0.0 0.0 0.0 0.0 \n",
+ "17868 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Color_Sky blue Color_White Color_Yellow \n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 1.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "17864 0.0 0.0 0.0 \n",
+ "17865 0.0 0.0 0.0 \n",
+ "17866 0.0 0.0 0.0 \n",
+ "17867 0.0 0.0 0.0 \n",
+ "17868 0.0 1.0 0.0 \n",
+ "\n",
+ "[17869 rows x 1610 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Получим имена категориальных признаков после OneHotEncoder\n",
+ "categorical_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)\n",
+ "\n",
+ "# Объединим их с именами числовых признаков\n",
+ "feature_names = list(numeric_features) + list(categorical_feature_names)\n",
+ "\n",
+ "# Создадим DataFrame для преобразованных данных\n",
+ "X_transformed_df = pd.DataFrame(X_transformed.toarray() if hasattr(X_transformed, 'toarray') else X_transformed, columns=feature_names)\n",
+ "\n",
+ "# Выведем пример 5 строк\n",
+ "X_transformed_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# обучим 3 разные модели с применением RandomizedSearchCV(для подора гиперпараметров)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training LinearRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=10. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForestRegressor...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 9 is smaller than n_iter=10. Running 9 iterations. For exhaustive searches, use GridSearchCV.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training GradientBoostingRegressor...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
+ " _data = np.array(data, dtype=dtype, copy=copy,\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Model: LinearRegression\n",
+ "Best Params: {}\n",
+ "MAE: 6722.0902357642335\n",
+ "RMSE: 8991.273616765677\n",
+ "R2: 0.3722951567248176\n",
+ "\n",
+ "Model: RandomForestRegressor\n",
+ "Best Params: {'model__n_estimators': 200, 'model__max_depth': None}\n",
+ "MAE: 3568.360497561258\n",
+ "RMSE: 6055.406570308487\n",
+ "R2: 0.7152920023310496\n",
+ "\n",
+ "Model: GradientBoostingRegressor\n",
+ "Best Params: {'model__n_estimators': 200, 'model__max_depth': 10, 'model__learning_rate': 0.2}\n",
+ "MAE: 3933.35109066405\n",
+ "RMSE: 6171.208466996527\n",
+ "R2: 0.7042985281049783\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.ensemble import GradientBoostingRegressor\n",
+ "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
+ "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "random_state = 42\n",
+ "\n",
+ "# Модели и параметры\n",
+ "models_regression = {\n",
+ " \"LinearRegression\": LinearRegression(),\n",
+ " \"RandomForestRegressor\": RandomForestRegressor(random_state=random_state),\n",
+ " \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=random_state)\n",
+ "}\n",
+ "\n",
+ "param_grids_regression = {\n",
+ " \"LinearRegression\": {},\n",
+ " \"RandomForestRegressor\": {\n",
+ " 'model__n_estimators': [50, 100, 200],\n",
+ " 'model__max_depth': [None, 10, 20],\n",
+ " },\n",
+ " \"GradientBoostingRegressor\": {\n",
+ " 'model__n_estimators': [50, 100, 200],\n",
+ " 'model__learning_rate': [0.01, 0.1, 0.2],\n",
+ " 'model__max_depth': [3, 5, 10]\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "# Результаты\n",
+ "results_regression = {}\n",
+ "\n",
+ "# Перебор моделей\n",
+ "for name, model in models_regression.items():\n",
+ " print(f\"Training {name}...\")\n",
+ " pipeline = Pipeline(steps=[\n",
+ " ('preprocessor', preprocessor),\n",
+ " ('model', model)\n",
+ " ])\n",
+ " param_grid = param_grids_regression[name]\n",
+ " grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)\n",
+ " grid_search.fit(X_train, y_train)\n",
+ "\n",
+ " # Лучшая модель\n",
+ " best_model = grid_search.best_estimator_\n",
+ " y_pred = best_model.predict(X_test)\n",
+ "\n",
+ " # Метрики\n",
+ " mae = mean_absolute_error(y_test, y_pred)\n",
+ " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
+ " r2 = r2_score(y_test, y_pred)\n",
+ "\n",
+ " # Сохранение результатов\n",
+ " results_regression[name] = {\n",
+ " \"Best Params\": grid_search.best_params_,\n",
+ " \"MAE\": mae,\n",
+ " \"RMSE\": rmse,\n",
+ " \"R2\": r2\n",
+ " }\n",
+ "\n",
+ "# Печать результатов\n",
+ "for name, metrics in results_regression.items():\n",
+ " print(f\"\\nModel: {name}\")\n",
+ " for metric, value in metrics.items():\n",
+ " print(f\"{metric}: {value}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " MAE | \n",
+ " RMSE | \n",
+ " R2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " RandomForestRegressor | \n",
+ " 3568.360498 | \n",
+ " 6055.406570 | \n",
+ " 0.715292 | \n",
+ "
\n",
+ " \n",
+ " GradientBoostingRegressor | \n",
+ " 3933.351091 | \n",
+ " 6171.208467 | \n",
+ " 0.704299 | \n",
+ "
\n",
+ " \n",
+ " LinearRegression | \n",
+ " 6722.090236 | \n",
+ " 8991.273617 | \n",
+ " 0.372295 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Импортируем pandas для работы с таблицами\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Формируем таблицу метрик\n",
+ "reg_metrics = pd.DataFrame.from_dict(results_regression, orient=\"index\")[\n",
+ " [\"MAE\", \"RMSE\", \"R2\"]\n",
+ "]\n",
+ "\n",
+ "# Визуализация результатов с помощью стилизации\n",
+ "styled_metrics = (\n",
+ " reg_metrics.sort_values(by=\"RMSE\")\n",
+ " .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
+ " .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
+ ")\n",
+ "\n",
+ "styled_metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# невероятно\n",
+ "Модель может что-то даже предсказать, с погрешностью в 3к$ конечно и ошибкой 70% но всё же. Я думал и 50% не будет.\n",
+ "Возможно если сузить сильнее входные данные, потому что выбросы очень большие, результат будет лучше. Линейная регрессия кстати вообще не справилась с данными а вот 2 другие ещё более менее"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# приступим к задаче классификации"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 3 is smaller than n_iter=10. Running 3 iterations. For exhaustive searches, use GridSearchCV.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForestClassifier...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\МИИ\\AIM-PIbd-31-Kouvshinoff-T-A\\laba\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
+ " _data = np.array(data, dtype=dtype, copy=copy,\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training KNN...\n",
+ "\n",
+ "Model: LogisticRegression\n",
+ "Best Params: {'model__C': 10}\n",
+ "Accuracy: 0.8612199216564074\n",
+ "F1 Score: 0.9032383925087788\n",
+ "Confusion_matrix: [[ 763 303]\n",
+ " [ 193 2315]]\n",
+ "\n",
+ "Model: RandomForestClassifier\n",
+ "Best Params: {'model__n_estimators': 500, 'model__max_features': 'log2', 'model__max_depth': 20, 'model__criterion': 'gini'}\n",
+ "Accuracy: 0.802182428651371\n",
+ "F1 Score: 0.874800779174783\n",
+ "Confusion_matrix: [[ 397 669]\n",
+ " [ 38 2470]]\n",
+ "\n",
+ "Model: KNN\n",
+ "Best Params: {'model__weights': 'uniform', 'model__n_neighbors': 5}\n",
+ "Accuracy: 0.8718522663682149\n",
+ "F1 Score: 0.9082532051282052\n",
+ "Confusion_matrix: [[ 849 217]\n",
+ " [ 241 2267]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.metrics import accuracy_score, confusion_matrix, f1_score\n",
+ "\n",
+ "X = df.drop(columns=['Leather interior','ID']) # Признаки\n",
+ "# Целевая переменная для классификации\n",
+ "y_class = df['Leather interior'].map({'Yes': 1, 'No': 0}) # Преобразуем в 0/1\n",
+ "\n",
+ "# Разделение данных\n",
+ "X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_class, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Модели и параметры\n",
+ "models_classification = {\n",
+ " \"LogisticRegression\": LogisticRegression(max_iter=1000),\n",
+ " \"RandomForestClassifier\": RandomForestClassifier(random_state=42),\n",
+ " \"KNN\": KNeighborsClassifier()\n",
+ "}\n",
+ "\n",
+ "param_grids_classification = {\n",
+ " \"LogisticRegression\": {\n",
+ " 'model__C': [0.1, 1, 10]\n",
+ " },\n",
+ " \"RandomForestClassifier\": {\n",
+ " \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
+ " \"model__max_features\": [\"sqrt\", \"log2\", 2],\n",
+ " \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9 ,10, 20],\n",
+ " \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
+ " },\n",
+ " \"KNN\": {\n",
+ " 'model__n_neighbors': [3, 5, 7, 9, 11],\n",
+ " 'model__weights': ['uniform', 'distance']\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "# Результаты\n",
+ "results_classification = {}\n",
+ "\n",
+ "# Перебор моделей\n",
+ "for name, model in models_classification.items():\n",
+ " print(f\"Training {name}...\")\n",
+ " pipeline = Pipeline(steps=[\n",
+ " ('preprocessor', preprocessor),\n",
+ " ('model', model)\n",
+ " ])\n",
+ " param_grid = param_grids_classification[name]\n",
+ " grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)\n",
+ " grid_search.fit(X_train_clf, y_train_clf)\n",
+ "\n",
+ " # Лучшая модель\n",
+ " best_model = grid_search.best_estimator_\n",
+ " y_pred = best_model.predict(X_test_clf)\n",
+ "\n",
+ " # Метрики\n",
+ " acc = accuracy_score(y_test_clf, y_pred)\n",
+ " f1 = f1_score(y_test_clf, y_pred)\n",
+ "\n",
+ " # Вычисление матрицы ошибок\n",
+ " c_matrix = confusion_matrix(y_test_clf, y_pred)\n",
+ "\n",
+ " # Сохранение результатов\n",
+ " results_classification[name] = {\n",
+ " \"Best Params\": grid_search.best_params_,\n",
+ " \"Accuracy\": acc,\n",
+ " \"F1 Score\": f1,\n",
+ " \"Confusion_matrix\": c_matrix\n",
+ " }\n",
+ "\n",
+ "# Печать результатов\n",
+ "for name, metrics in results_classification.items():\n",
+ " print(f\"\\nModel: {name}\")\n",
+ " for metric, value in metrics.items():\n",
+ " print(f\"{metric}: {value}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# отресуем красивые квадратики\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import ConfusionMatrixDisplay\n",
+ "\n",
+ "\n",
+ "num_models = len(results_classification)\n",
+ "num_rows = (num_models // 2) + (num_models % 2) # Количество строк для подграфиков\n",
+ "_, ax = plt.subplots(num_rows, 2, figsize=(12, 10), sharex=False, sharey=False)\n",
+ "\n",
+ "for index, (name, metrics) in enumerate(results_classification.items()):\n",
+ " c_matrix = metrics[\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ " confusion_matrix=c_matrix, display_labels=[\"No\", \"Yes\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ " disp.ax_.set_title(name)\n",
+ "\n",
+ "# Корректировка расположения графиков\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Accuracy | \n",
+ " F1 Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " KNN | \n",
+ " 0.871852 | \n",
+ " 0.908253 | \n",
+ "
\n",
+ " \n",
+ " LogisticRegression | \n",
+ " 0.861220 | \n",
+ " 0.903238 | \n",
+ "
\n",
+ " \n",
+ " RandomForestClassifier | \n",
+ " 0.802182 | \n",
+ " 0.874801 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Формируем таблицу метрик классификации\n",
+ "clf_metrics = pd.DataFrame.from_dict(results_classification, orient=\"index\")[[\"Accuracy\", \"F1 Score\"]]\n",
+ "\n",
+ "# Визуализация результатов с помощью стилизации\n",
+ "styled_metrics_clf = (\n",
+ " clf_metrics.sort_values(by=\"F1 Score\", ascending=False) # Сортировка по F1 Score\n",
+ " .style.background_gradient(cmap=\"viridis\", low=0, high=1, subset=[\"F1 Score\", \"Accuracy\"]) # Стилизация столбцов\n",
+ " .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"Accuracy\"])\n",
+ ")\n",
+ "\n",
+ "styled_metrics_clf"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "В итоге KNN и LogisticRegression выдали точность в 90% что я считаю весьма неплохо. RandomForestClassifier близко, но не так хорошо"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "laba",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}