diff --git a/notebooks/lab4.ipynb b/notebooks/lab4.ipynb
new file mode 100644
index 0000000..659ae62
--- /dev/null
+++ b/notebooks/lab4.ipynb
@@ -0,0 +1,1074 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Загрузка набора данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 219,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Price | \n",
+ " Levy | \n",
+ " Manufacturer | \n",
+ " Model | \n",
+ " Prod. year | \n",
+ " Category | \n",
+ " Leather interior | \n",
+ " Fuel type | \n",
+ " Engine volume | \n",
+ " Mileage | \n",
+ " Cylinders | \n",
+ " Gear box type | \n",
+ " Drive wheels | \n",
+ " Doors | \n",
+ " Wheel | \n",
+ " Color | \n",
+ " Airbags | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13328 | \n",
+ " 1399 | \n",
+ " LEXUS | \n",
+ " RX 450 | \n",
+ " 2010 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 3.5 | \n",
+ " 186005 | \n",
+ " 6 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " Четырехдверный | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 16621 | \n",
+ " 1018 | \n",
+ " CHEVROLET | \n",
+ " Equinox | \n",
+ " 2011 | \n",
+ " Jeep | \n",
+ " No | \n",
+ " Petrol | \n",
+ " 3.0 | \n",
+ " 192000 | \n",
+ " 6 | \n",
+ " Tiptronic | \n",
+ " 4x4 | \n",
+ " Четырехдверный | \n",
+ " Left wheel | \n",
+ " Black | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 8467 | \n",
+ " 0 | \n",
+ " HONDA | \n",
+ " FIT | \n",
+ " 2006 | \n",
+ " Hatchback | \n",
+ " No | \n",
+ " Petrol | \n",
+ " 1.3 | \n",
+ " 200000 | \n",
+ " 4 | \n",
+ " Variator | \n",
+ " Front | \n",
+ " Четырехдверный | \n",
+ " Right-hand drive | \n",
+ " Black | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3607 | \n",
+ " 862 | \n",
+ " FORD | \n",
+ " Escape | \n",
+ " 2011 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 2.5 | \n",
+ " 168966 | \n",
+ " 4 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " Четырехдверный | \n",
+ " Left wheel | \n",
+ " White | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11726 | \n",
+ " 446 | \n",
+ " HONDA | \n",
+ " FIT | \n",
+ " 2014 | \n",
+ " Hatchback | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 1.3 | \n",
+ " 91901 | \n",
+ " 4 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " Четырехдверный | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 12592 | \n",
+ " 8781 | \n",
+ " 1107 | \n",
+ " OPEL | \n",
+ " Combo | \n",
+ " 2007 | \n",
+ " Goods wagon | \n",
+ " No | \n",
+ " Diesel | \n",
+ " 1.7 | \n",
+ " 236000 | \n",
+ " 4 | \n",
+ " Manual | \n",
+ " Front | \n",
+ " Четырехдверный | \n",
+ " Left wheel | \n",
+ " Beige | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 12593 | \n",
+ " 7840 | \n",
+ " 0 | \n",
+ " NISSAN | \n",
+ " Skyline | \n",
+ " 2003 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 3.0 | \n",
+ " 220000 | \n",
+ " 6 | \n",
+ " Tiptronic | \n",
+ " Rear | \n",
+ " Четырехдверный | \n",
+ " Right-hand drive | \n",
+ " White | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 12594 | \n",
+ " 8467 | \n",
+ " 0 | \n",
+ " MERCEDES-BENZ | \n",
+ " CLK 200 | \n",
+ " 1999 | \n",
+ " Coupe | \n",
+ " Yes | \n",
+ " CNG | \n",
+ " 2.0 | \n",
+ " 300000 | \n",
+ " 4 | \n",
+ " Manual | \n",
+ " Rear | \n",
+ " Двухдверный | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 12595 | \n",
+ " 15681 | \n",
+ " 831 | \n",
+ " HYUNDAI | \n",
+ " Sonata | \n",
+ " 2011 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 2.4 | \n",
+ " 161600 | \n",
+ " 4 | \n",
+ " Tiptronic | \n",
+ " Front | \n",
+ " Четырехдверный | \n",
+ " Left wheel | \n",
+ " Red | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 12596 | \n",
+ " 26108 | \n",
+ " 836 | \n",
+ " HYUNDAI | \n",
+ " Tucson | \n",
+ " 2010 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Diesel | \n",
+ " 2.0 | \n",
+ " 116365 | \n",
+ " 4 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " Четырехдверный | \n",
+ " Left wheel | \n",
+ " Grey | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
12597 rows × 17 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Price Levy Manufacturer Model Prod. year Category \\\n",
+ "0 13328 1399 LEXUS RX 450 2010 Jeep \n",
+ "1 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
+ "2 8467 0 HONDA FIT 2006 Hatchback \n",
+ "3 3607 862 FORD Escape 2011 Jeep \n",
+ "4 11726 446 HONDA FIT 2014 Hatchback \n",
+ "... ... ... ... ... ... ... \n",
+ "12592 8781 1107 OPEL Combo 2007 Goods wagon \n",
+ "12593 7840 0 NISSAN Skyline 2003 Sedan \n",
+ "12594 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe \n",
+ "12595 15681 831 HYUNDAI Sonata 2011 Sedan \n",
+ "12596 26108 836 HYUNDAI Tucson 2010 Jeep \n",
+ "\n",
+ " Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
+ "0 Yes Hybrid 3.5 186005 6 \n",
+ "1 No Petrol 3.0 192000 6 \n",
+ "2 No Petrol 1.3 200000 4 \n",
+ "3 Yes Hybrid 2.5 168966 4 \n",
+ "4 Yes Petrol 1.3 91901 4 \n",
+ "... ... ... ... ... ... \n",
+ "12592 No Diesel 1.7 236000 4 \n",
+ "12593 Yes Petrol 3.0 220000 6 \n",
+ "12594 Yes CNG 2.0 300000 4 \n",
+ "12595 Yes Petrol 2.4 161600 4 \n",
+ "12596 Yes Diesel 2.0 116365 4 \n",
+ "\n",
+ " Gear box type Drive wheels Doors Wheel Color \\\n",
+ "0 Automatic 4x4 Четырехдверный Left wheel Silver \n",
+ "1 Tiptronic 4x4 Четырехдверный Left wheel Black \n",
+ "2 Variator Front Четырехдверный Right-hand drive Black \n",
+ "3 Automatic 4x4 Четырехдверный Left wheel White \n",
+ "4 Automatic Front Четырехдверный Left wheel Silver \n",
+ "... ... ... ... ... ... \n",
+ "12592 Manual Front Четырехдверный Left wheel Beige \n",
+ "12593 Tiptronic Rear Четырехдверный Right-hand drive White \n",
+ "12594 Manual Rear Двухдверный Left wheel Silver \n",
+ "12595 Tiptronic Front Четырехдверный Left wheel Red \n",
+ "12596 Automatic Front Четырехдверный Left wheel Grey \n",
+ "\n",
+ " Airbags \n",
+ "0 12 \n",
+ "1 8 \n",
+ "2 2 \n",
+ "3 0 \n",
+ "4 4 \n",
+ "... ... \n",
+ "12592 4 \n",
+ "12593 0 \n",
+ "12594 5 \n",
+ "12595 8 \n",
+ "12596 4 \n",
+ "\n",
+ "[12597 rows x 17 columns]"
+ ]
+ },
+ "execution_count": 219,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import featuretools as ft\n",
+ "import re\n",
+ "\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from imblearn.over_sampling import RandomOverSampler\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "from transformers import CarsFeatures\n",
+ "\n",
+ "set_config(transform_output=\"pandas\")\n",
+ "\n",
+ "random_state = 9\n",
+ "\n",
+ "df = pd.read_csv(\"../data/car-price-prediction.csv\")\n",
+ "\n",
+ "df = df.drop(columns=[\"Unnamed: 0\"])\n",
+ "\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 220,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Price int64\n",
+ "Levy int64\n",
+ "Manufacturer object\n",
+ "Model object\n",
+ "Prod. year int64\n",
+ "Category object\n",
+ "Leather interior object\n",
+ "Fuel type object\n",
+ "Engine volume float64\n",
+ "Mileage int64\n",
+ "Cylinders int64\n",
+ "Gear box type object\n",
+ "Drive wheels object\n",
+ "Doors object\n",
+ "Wheel object\n",
+ "Color object\n",
+ "Airbags int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 220,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Разбиение на выборки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 221,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Размеры выборок:\n",
+ "Обучающая выборка: 8817 записей\n",
+ "Category\n",
+ "Sedan 3954\n",
+ "Jeep 2263\n",
+ "Hatchback 1554\n",
+ "Minivan 312\n",
+ "Coupe 251\n",
+ "Universal 180\n",
+ "Microbus 143\n",
+ "Goods wagon 120\n",
+ "Pickup 22\n",
+ "Cabriolet 16\n",
+ "Limousine 2\n",
+ "Name: count, dtype: int64\n",
+ "Тестовая выборка: 3780 записей\n",
+ "Category\n",
+ "Sedan 1692\n",
+ "Jeep 990\n",
+ "Hatchback 636\n",
+ "Minivan 151\n",
+ "Coupe 117\n",
+ "Universal 82\n",
+ "Goods wagon 52\n",
+ "Microbus 46\n",
+ "Pickup 8\n",
+ "Cabriolet 5\n",
+ "Limousine 1\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "X = df\n",
+ "y = df[\"Category\"]\n",
+ "\n",
+ "train_df, test_df, y_train, y_test = train_test_split(\n",
+ " X, y, test_size=0.3, random_state=42\n",
+ ")\n",
+ "\n",
+ "print(\"Размеры выборок:\")\n",
+ "print(f\"Обучающая выборка: {train_df.shape[0]} записей\")\n",
+ "print(train_df.Category.value_counts())\n",
+ "print(f\"Тестовая выборка: {test_df.shape[0]} записей\")\n",
+ "print(test_df.Category.value_counts())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Oversampling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 222,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Размеры выборок:\n",
+ "Обучающая выборка: 43494 записей\n",
+ "Category\n",
+ "Sedan 3954\n",
+ "Jeep 3954\n",
+ "Universal 3954\n",
+ "Hatchback 3954\n",
+ "Coupe 3954\n",
+ "Goods wagon 3954\n",
+ "Minivan 3954\n",
+ "Microbus 3954\n",
+ "Pickup 3954\n",
+ "Limousine 3954\n",
+ "Cabriolet 3954\n",
+ "Name: count, dtype: int64\n",
+ "Тестовая выборка: 18612 записей\n",
+ "Category\n",
+ "Hatchback 1692\n",
+ "Sedan 1692\n",
+ "Universal 1692\n",
+ "Jeep 1692\n",
+ "Coupe 1692\n",
+ "Minivan 1692\n",
+ "Goods wagon 1692\n",
+ "Microbus 1692\n",
+ "Pickup 1692\n",
+ "Cabriolet 1692\n",
+ "Limousine 1692\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "def oversample(df):\n",
+ " X = df.drop(\"Category\", axis=1)\n",
+ " y = df[\"Category\"]\n",
+ "\n",
+ " oversampler = RandomOverSampler(random_state=42)\n",
+ " X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
+ "\n",
+ " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
+ " return resampled_df\n",
+ "\n",
+ "\n",
+ "train_df_overs = oversample(train_df)\n",
+ "test_df_overs = oversample(test_df)\n",
+ "\n",
+ "print(\"Размеры выборок:\")\n",
+ "print(f\"Обучающая выборка: {train_df_overs.shape[0]} записей\")\n",
+ "print(train_df_overs.Category.value_counts())\n",
+ "print(f\"Тестовая выборка: {test_df_overs.shape[0]} записей\")\n",
+ "print(test_df_overs.Category.value_counts())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Формирование конвейера для классификации данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 223,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "columns_to_drop = [\"Prod. year\"]\n",
+ "num_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype != \"object\"\n",
+ "]\n",
+ "cat_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype == \"object\"\n",
+ "]\n",
+ "\n",
+ "num_imputer = SimpleImputer(strategy=\"median\")\n",
+ "num_scaler = StandardScaler()\n",
+ "preprocessing_num = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", num_imputer),\n",
+ " (\"scaler\", num_scaler),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
+ "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
+ "preprocessing_cat = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", cat_imputer),\n",
+ " (\"encoder\", cat_encoder),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "features_preprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_num\", preprocessing_num, num_columns),\n",
+ " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
+ " (\"prepocessing_features\", num_imputer, [\"Prod. year\"]),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "features_engineering = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"add_features\", CarsFeatures(), [\"Prod. year\"]),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "drop_columns = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"drop_columns\", \"drop\", columns_to_drop),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "features_postprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_num\", preprocessing_num, [\"Age\"]),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "pipeline_end = Pipeline(\n",
+ " [\n",
+ " (\"features_preprocessing\", features_preprocessing),\n",
+ " (\"drop_columns\", drop_columns),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Демонстрация работы конвейера для предобработки данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 224,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Price | \n",
+ " Levy | \n",
+ " Engine volume | \n",
+ " Mileage | \n",
+ " Cylinders | \n",
+ " Airbags | \n",
+ " Manufacturer_ALFA ROMEO | \n",
+ " Manufacturer_AUDI | \n",
+ " Manufacturer_BMW | \n",
+ " Manufacturer_BUICK | \n",
+ " ... | \n",
+ " Color_Green | \n",
+ " Color_Grey | \n",
+ " Color_Orange | \n",
+ " Color_Pink | \n",
+ " Color_Purple | \n",
+ " Color_Red | \n",
+ " Color_Silver | \n",
+ " Color_Sky blue | \n",
+ " Color_White | \n",
+ " Color_Yellow | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 10083 | \n",
+ " 0.153774 | \n",
+ " -1.192982 | \n",
+ " -0.479341 | \n",
+ " -1.531744 | \n",
+ " -0.403213 | \n",
+ " -0.683755 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9482 | \n",
+ " -0.658018 | \n",
+ " -1.192982 | \n",
+ " -0.887855 | \n",
+ " -0.130245 | \n",
+ " -0.403213 | \n",
+ " -1.190217 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 6177 | \n",
+ " -0.136145 | \n",
+ " 0.081576 | \n",
+ " -0.479341 | \n",
+ " -0.651122 | \n",
+ " -0.403213 | \n",
+ " 0.835631 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 11756 | \n",
+ " -0.455093 | \n",
+ " -1.192982 | \n",
+ " 0.473858 | \n",
+ " 1.263152 | \n",
+ " -0.403213 | \n",
+ " -0.430524 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 6557 | \n",
+ " 2.780795 | \n",
+ " 1.703146 | \n",
+ " 0.473858 | \n",
+ " -0.739330 | \n",
+ " -0.403213 | \n",
+ " -0.683755 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 11964 | \n",
+ " -0.600053 | \n",
+ " -1.192982 | \n",
+ " 0.337687 | \n",
+ " 0.318018 | \n",
+ " 1.538421 | \n",
+ " -0.683755 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 5191 | \n",
+ " -1.368394 | \n",
+ " 0.476602 | \n",
+ " -0.206998 | \n",
+ " -0.011145 | \n",
+ " -0.403213 | \n",
+ " 1.342092 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 5390 | \n",
+ " -0.310134 | \n",
+ " 0.646834 | \n",
+ " -0.887855 | \n",
+ " 1.155137 | \n",
+ " -0.403213 | \n",
+ " 0.329169 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 860 | \n",
+ " -0.107486 | \n",
+ " -1.192982 | \n",
+ " -0.751684 | \n",
+ " -0.697325 | \n",
+ " -0.403213 | \n",
+ " 0.329169 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 7270 | \n",
+ " -0.049151 | \n",
+ " 0.446048 | \n",
+ " 0.337687 | \n",
+ " -0.465093 | \n",
+ " 7.363324 | \n",
+ " 0.329169 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8817 rows × 1179 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Price Levy Engine volume Mileage Cylinders Airbags \\\n",
+ "10083 0.153774 -1.192982 -0.479341 -1.531744 -0.403213 -0.683755 \n",
+ "9482 -0.658018 -1.192982 -0.887855 -0.130245 -0.403213 -1.190217 \n",
+ "6177 -0.136145 0.081576 -0.479341 -0.651122 -0.403213 0.835631 \n",
+ "11756 -0.455093 -1.192982 0.473858 1.263152 -0.403213 -0.430524 \n",
+ "6557 2.780795 1.703146 0.473858 -0.739330 -0.403213 -0.683755 \n",
+ "... ... ... ... ... ... ... \n",
+ "11964 -0.600053 -1.192982 0.337687 0.318018 1.538421 -0.683755 \n",
+ "5191 -1.368394 0.476602 -0.206998 -0.011145 -0.403213 1.342092 \n",
+ "5390 -0.310134 0.646834 -0.887855 1.155137 -0.403213 0.329169 \n",
+ "860 -0.107486 -1.192982 -0.751684 -0.697325 -0.403213 0.329169 \n",
+ "7270 -0.049151 0.446048 0.337687 -0.465093 7.363324 0.329169 \n",
+ "\n",
+ " Manufacturer_ALFA ROMEO Manufacturer_AUDI Manufacturer_BMW \\\n",
+ "10083 0.0 0.0 0.0 \n",
+ "9482 0.0 0.0 0.0 \n",
+ "6177 0.0 0.0 0.0 \n",
+ "11756 0.0 0.0 0.0 \n",
+ "6557 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "11964 0.0 1.0 0.0 \n",
+ "5191 0.0 0.0 0.0 \n",
+ "5390 0.0 0.0 0.0 \n",
+ "860 0.0 0.0 0.0 \n",
+ "7270 0.0 0.0 0.0 \n",
+ "\n",
+ " Manufacturer_BUICK ... Color_Green Color_Grey Color_Orange \\\n",
+ "10083 0.0 ... 0.0 0.0 0.0 \n",
+ "9482 0.0 ... 0.0 0.0 0.0 \n",
+ "6177 0.0 ... 0.0 0.0 0.0 \n",
+ "11756 0.0 ... 1.0 0.0 0.0 \n",
+ "6557 0.0 ... 0.0 1.0 0.0 \n",
+ "... ... ... ... ... ... \n",
+ "11964 0.0 ... 0.0 1.0 0.0 \n",
+ "5191 0.0 ... 0.0 0.0 0.0 \n",
+ "5390 0.0 ... 0.0 0.0 0.0 \n",
+ "860 0.0 ... 0.0 0.0 0.0 \n",
+ "7270 0.0 ... 0.0 0.0 0.0 \n",
+ "\n",
+ " Color_Pink Color_Purple Color_Red Color_Silver Color_Sky blue \\\n",
+ "10083 0.0 0.0 0.0 0.0 0.0 \n",
+ "9482 0.0 0.0 0.0 0.0 0.0 \n",
+ "6177 0.0 0.0 0.0 0.0 0.0 \n",
+ "11756 0.0 0.0 0.0 0.0 0.0 \n",
+ "6557 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... \n",
+ "11964 0.0 0.0 0.0 0.0 0.0 \n",
+ "5191 0.0 0.0 0.0 1.0 0.0 \n",
+ "5390 0.0 0.0 0.0 1.0 0.0 \n",
+ "860 0.0 0.0 0.0 1.0 0.0 \n",
+ "7270 0.0 0.0 0.0 1.0 0.0 \n",
+ "\n",
+ " Color_White Color_Yellow \n",
+ "10083 1.0 0.0 \n",
+ "9482 0.0 0.0 \n",
+ "6177 0.0 0.0 \n",
+ "11756 0.0 0.0 \n",
+ "6557 0.0 0.0 \n",
+ "... ... ... \n",
+ "11964 0.0 0.0 \n",
+ "5191 0.0 0.0 \n",
+ "5390 0.0 0.0 \n",
+ "860 0.0 0.0 \n",
+ "7270 0.0 0.0 \n",
+ "\n",
+ "[8817 rows x 1179 columns]"
+ ]
+ },
+ "execution_count": 224,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.fit_transform(train_df)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "preprocessed_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Формирование набора моделей"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/transformers.py b/notebooks/transformers.py
new file mode 100644
index 0000000..29e87ef
--- /dev/null
+++ b/notebooks/transformers.py
@@ -0,0 +1,17 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class CarsFeatures(BaseEstimator, TransformerMixin):
+ def __init__(self):
+ pass
+
+ def fit(self, X, y=None):
+ return self
+
+ def transform(self, X, y=None):
+ X["Age"] = 2020 - X["Prod. year"]
+ return X
+
+ def get_feature_names_out(self, features_in):
+ return np.append(features_in, ["Age"], axis=0)