From 5ab313468c4fd77d05bdcb99c6f833ce9bd863ae Mon Sep 17 00:00:00 2001 From: Zakharov_Rostislav Date: Sat, 7 Dec 2024 13:00:14 +0400 Subject: [PATCH] feat(lab-4): make pipeline --- notebooks/lab4.ipynb | 1074 +++++++++++++++++++++++++++++++++++++ notebooks/transformers.py | 17 + 2 files changed, 1091 insertions(+) create mode 100644 notebooks/lab4.ipynb create mode 100644 notebooks/transformers.py diff --git a/notebooks/lab4.ipynb b/notebooks/lab4.ipynb new file mode 100644 index 0000000..659ae62 --- /dev/null +++ b/notebooks/lab4.ipynb @@ -0,0 +1,1074 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Загрузка набора данных" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyManufacturerModelProd. yearCategoryLeather interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
0133281399LEXUSRX 4502010JeepYesHybrid3.51860056Automatic4x4ЧетырехдверныйLeft wheelSilver12
1166211018CHEVROLETEquinox2011JeepNoPetrol3.01920006Tiptronic4x4ЧетырехдверныйLeft wheelBlack8
284670HONDAFIT2006HatchbackNoPetrol1.32000004VariatorFrontЧетырехдверныйRight-hand driveBlack2
33607862FORDEscape2011JeepYesHybrid2.51689664Automatic4x4ЧетырехдверныйLeft wheelWhite0
411726446HONDAFIT2014HatchbackYesPetrol1.3919014AutomaticFrontЧетырехдверныйLeft wheelSilver4
......................................................
1259287811107OPELCombo2007Goods wagonNoDiesel1.72360004ManualFrontЧетырехдверныйLeft wheelBeige4
1259378400NISSANSkyline2003SedanYesPetrol3.02200006TiptronicRearЧетырехдверныйRight-hand driveWhite0
1259484670MERCEDES-BENZCLK 2001999CoupeYesCNG2.03000004ManualRearДвухдверныйLeft wheelSilver5
1259515681831HYUNDAISonata2011SedanYesPetrol2.41616004TiptronicFrontЧетырехдверныйLeft wheelRed8
1259626108836HYUNDAITucson2010JeepYesDiesel2.01163654AutomaticFrontЧетырехдверныйLeft wheelGrey4
\n", + "

12597 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Manufacturer Model Prod. year Category \\\n", + "0 13328 1399 LEXUS RX 450 2010 Jeep \n", + "1 16621 1018 CHEVROLET Equinox 2011 Jeep \n", + "2 8467 0 HONDA FIT 2006 Hatchback \n", + "3 3607 862 FORD Escape 2011 Jeep \n", + "4 11726 446 HONDA FIT 2014 Hatchback \n", + "... ... ... ... ... ... ... \n", + "12592 8781 1107 OPEL Combo 2007 Goods wagon \n", + "12593 7840 0 NISSAN Skyline 2003 Sedan \n", + "12594 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe \n", + "12595 15681 831 HYUNDAI Sonata 2011 Sedan \n", + "12596 26108 836 HYUNDAI Tucson 2010 Jeep \n", + "\n", + " Leather interior Fuel type Engine volume Mileage Cylinders \\\n", + "0 Yes Hybrid 3.5 186005 6 \n", + "1 No Petrol 3.0 192000 6 \n", + "2 No Petrol 1.3 200000 4 \n", + "3 Yes Hybrid 2.5 168966 4 \n", + "4 Yes Petrol 1.3 91901 4 \n", + "... ... ... ... ... ... \n", + "12592 No Diesel 1.7 236000 4 \n", + "12593 Yes Petrol 3.0 220000 6 \n", + "12594 Yes CNG 2.0 300000 4 \n", + "12595 Yes Petrol 2.4 161600 4 \n", + "12596 Yes Diesel 2.0 116365 4 \n", + "\n", + " Gear box type Drive wheels Doors Wheel Color \\\n", + "0 Automatic 4x4 Четырехдверный Left wheel Silver \n", + "1 Tiptronic 4x4 Четырехдверный Left wheel Black \n", + "2 Variator Front Четырехдверный Right-hand drive Black \n", + "3 Automatic 4x4 Четырехдверный Left wheel White \n", + "4 Automatic Front Четырехдверный Left wheel Silver \n", + "... ... ... ... ... ... \n", + "12592 Manual Front Четырехдверный Left wheel Beige \n", + "12593 Tiptronic Rear Четырехдверный Right-hand drive White \n", + "12594 Manual Rear Двухдверный Left wheel Silver \n", + "12595 Tiptronic Front Четырехдверный Left wheel Red \n", + "12596 Automatic Front Четырехдверный Left wheel Grey \n", + "\n", + " Airbags \n", + "0 12 \n", + "1 8 \n", + "2 2 \n", + "3 0 \n", + "4 4 \n", + "... ... \n", + "12592 4 \n", + "12593 0 \n", + "12594 5 \n", + "12595 8 \n", + "12596 4 \n", + "\n", + "[12597 rows x 17 columns]" + ] + }, + "execution_count": 219, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import featuretools as ft\n", + "import re\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import set_config\n", + "\n", + "from transformers import CarsFeatures\n", + "\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "random_state = 9\n", + "\n", + "df = pd.read_csv(\"../data/car-price-prediction.csv\")\n", + "\n", + "df = df.drop(columns=[\"Unnamed: 0\"])\n", + "\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Price int64\n", + "Levy int64\n", + "Manufacturer object\n", + "Model object\n", + "Prod. year int64\n", + "Category object\n", + "Leather interior object\n", + "Fuel type object\n", + "Engine volume float64\n", + "Mileage int64\n", + "Cylinders int64\n", + "Gear box type object\n", + "Drive wheels object\n", + "Doors object\n", + "Wheel object\n", + "Color object\n", + "Airbags int64\n", + "dtype: object" + ] + }, + "execution_count": 220, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разбиение на выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размеры выборок:\n", + "Обучающая выборка: 8817 записей\n", + "Category\n", + "Sedan 3954\n", + "Jeep 2263\n", + "Hatchback 1554\n", + "Minivan 312\n", + "Coupe 251\n", + "Universal 180\n", + "Microbus 143\n", + "Goods wagon 120\n", + "Pickup 22\n", + "Cabriolet 16\n", + "Limousine 2\n", + "Name: count, dtype: int64\n", + "Тестовая выборка: 3780 записей\n", + "Category\n", + "Sedan 1692\n", + "Jeep 990\n", + "Hatchback 636\n", + "Minivan 151\n", + "Coupe 117\n", + "Universal 82\n", + "Goods wagon 52\n", + "Microbus 46\n", + "Pickup 8\n", + "Cabriolet 5\n", + "Limousine 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "X = df\n", + "y = df[\"Category\"]\n", + "\n", + "train_df, test_df, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.3, random_state=42\n", + ")\n", + "\n", + "print(\"Размеры выборок:\")\n", + "print(f\"Обучающая выборка: {train_df.shape[0]} записей\")\n", + "print(train_df.Category.value_counts())\n", + "print(f\"Тестовая выборка: {test_df.shape[0]} записей\")\n", + "print(test_df.Category.value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Oversampling" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размеры выборок:\n", + "Обучающая выборка: 43494 записей\n", + "Category\n", + "Sedan 3954\n", + "Jeep 3954\n", + "Universal 3954\n", + "Hatchback 3954\n", + "Coupe 3954\n", + "Goods wagon 3954\n", + "Minivan 3954\n", + "Microbus 3954\n", + "Pickup 3954\n", + "Limousine 3954\n", + "Cabriolet 3954\n", + "Name: count, dtype: int64\n", + "Тестовая выборка: 18612 записей\n", + "Category\n", + "Hatchback 1692\n", + "Sedan 1692\n", + "Universal 1692\n", + "Jeep 1692\n", + "Coupe 1692\n", + "Minivan 1692\n", + "Goods wagon 1692\n", + "Microbus 1692\n", + "Pickup 1692\n", + "Cabriolet 1692\n", + "Limousine 1692\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "def oversample(df):\n", + " X = df.drop(\"Category\", axis=1)\n", + " y = df[\"Category\"]\n", + "\n", + " oversampler = RandomOverSampler(random_state=42)\n", + " X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n", + "\n", + " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "\n", + "train_df_overs = oversample(train_df)\n", + "test_df_overs = oversample(test_df)\n", + "\n", + "print(\"Размеры выборок:\")\n", + "print(f\"Обучающая выборка: {train_df_overs.shape[0]} записей\")\n", + "print(train_df_overs.Category.value_counts())\n", + "print(f\"Тестовая выборка: {test_df_overs.shape[0]} записей\")\n", + "print(test_df_overs.Category.value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование конвейера для классификации данных" + ] + }, + { + "cell_type": "code", + "execution_count": 223, + "metadata": {}, + "outputs": [], + "source": [ + "columns_to_drop = [\"Prod. year\"]\n", + "num_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype != \"object\"\n", + "]\n", + "cat_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype == \"object\"\n", + "]\n", + "\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, num_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " (\"prepocessing_features\", num_imputer, [\"Prod. year\"]),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "features_engineering = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"add_features\", CarsFeatures(), [\"Prod. year\"]),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "features_postprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, [\"Age\"]),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " (\"drop_columns\", drop_columns),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Демонстрация работы конвейера для предобработки данных" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyEngine volumeMileageCylindersAirbagsManufacturer_ALFA ROMEOManufacturer_AUDIManufacturer_BMWManufacturer_BUICK...Color_GreenColor_GreyColor_OrangeColor_PinkColor_PurpleColor_RedColor_SilverColor_Sky blueColor_WhiteColor_Yellow
100830.153774-1.192982-0.479341-1.531744-0.403213-0.6837550.00.00.00.0...0.00.00.00.00.00.00.00.01.00.0
9482-0.658018-1.192982-0.887855-0.130245-0.403213-1.1902170.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
6177-0.1361450.081576-0.479341-0.651122-0.4032130.8356310.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
11756-0.455093-1.1929820.4738581.263152-0.403213-0.4305240.00.00.00.0...1.00.00.00.00.00.00.00.00.00.0
65572.7807951.7031460.473858-0.739330-0.403213-0.6837550.00.00.00.0...0.01.00.00.00.00.00.00.00.00.0
..................................................................
11964-0.600053-1.1929820.3376870.3180181.538421-0.6837550.01.00.00.0...0.01.00.00.00.00.00.00.00.00.0
5191-1.3683940.476602-0.206998-0.011145-0.4032131.3420920.00.00.00.0...0.00.00.00.00.00.01.00.00.00.0
5390-0.3101340.646834-0.8878551.155137-0.4032130.3291690.00.00.00.0...0.00.00.00.00.00.01.00.00.00.0
860-0.107486-1.192982-0.751684-0.697325-0.4032130.3291690.00.00.00.0...0.00.00.00.00.00.01.00.00.00.0
7270-0.0491510.4460480.337687-0.4650937.3633240.3291690.00.00.00.0...0.00.00.00.00.00.01.00.00.00.0
\n", + "

8817 rows × 1179 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Engine volume Mileage Cylinders Airbags \\\n", + "10083 0.153774 -1.192982 -0.479341 -1.531744 -0.403213 -0.683755 \n", + "9482 -0.658018 -1.192982 -0.887855 -0.130245 -0.403213 -1.190217 \n", + "6177 -0.136145 0.081576 -0.479341 -0.651122 -0.403213 0.835631 \n", + "11756 -0.455093 -1.192982 0.473858 1.263152 -0.403213 -0.430524 \n", + "6557 2.780795 1.703146 0.473858 -0.739330 -0.403213 -0.683755 \n", + "... ... ... ... ... ... ... \n", + "11964 -0.600053 -1.192982 0.337687 0.318018 1.538421 -0.683755 \n", + "5191 -1.368394 0.476602 -0.206998 -0.011145 -0.403213 1.342092 \n", + "5390 -0.310134 0.646834 -0.887855 1.155137 -0.403213 0.329169 \n", + "860 -0.107486 -1.192982 -0.751684 -0.697325 -0.403213 0.329169 \n", + "7270 -0.049151 0.446048 0.337687 -0.465093 7.363324 0.329169 \n", + "\n", + " Manufacturer_ALFA ROMEO Manufacturer_AUDI Manufacturer_BMW \\\n", + "10083 0.0 0.0 0.0 \n", + "9482 0.0 0.0 0.0 \n", + "6177 0.0 0.0 0.0 \n", + "11756 0.0 0.0 0.0 \n", + "6557 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "11964 0.0 1.0 0.0 \n", + "5191 0.0 0.0 0.0 \n", + "5390 0.0 0.0 0.0 \n", + "860 0.0 0.0 0.0 \n", + "7270 0.0 0.0 0.0 \n", + "\n", + " Manufacturer_BUICK ... Color_Green Color_Grey Color_Orange \\\n", + "10083 0.0 ... 0.0 0.0 0.0 \n", + "9482 0.0 ... 0.0 0.0 0.0 \n", + "6177 0.0 ... 0.0 0.0 0.0 \n", + "11756 0.0 ... 1.0 0.0 0.0 \n", + "6557 0.0 ... 0.0 1.0 0.0 \n", + "... ... ... ... ... ... \n", + "11964 0.0 ... 0.0 1.0 0.0 \n", + "5191 0.0 ... 0.0 0.0 0.0 \n", + "5390 0.0 ... 0.0 0.0 0.0 \n", + "860 0.0 ... 0.0 0.0 0.0 \n", + "7270 0.0 ... 0.0 0.0 0.0 \n", + "\n", + " Color_Pink Color_Purple Color_Red Color_Silver Color_Sky blue \\\n", + "10083 0.0 0.0 0.0 0.0 0.0 \n", + "9482 0.0 0.0 0.0 0.0 0.0 \n", + "6177 0.0 0.0 0.0 0.0 0.0 \n", + "11756 0.0 0.0 0.0 0.0 0.0 \n", + "6557 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... \n", + "11964 0.0 0.0 0.0 0.0 0.0 \n", + "5191 0.0 0.0 0.0 1.0 0.0 \n", + "5390 0.0 0.0 0.0 1.0 0.0 \n", + "860 0.0 0.0 0.0 1.0 0.0 \n", + "7270 0.0 0.0 0.0 1.0 0.0 \n", + "\n", + " Color_White Color_Yellow \n", + "10083 1.0 0.0 \n", + "9482 0.0 0.0 \n", + "6177 0.0 0.0 \n", + "11756 0.0 0.0 \n", + "6557 0.0 0.0 \n", + "... ... ... \n", + "11964 0.0 0.0 \n", + "5191 0.0 0.0 \n", + "5390 0.0 0.0 \n", + "860 0.0 0.0 \n", + "7270 0.0 0.0 \n", + "\n", + "[8817 rows x 1179 columns]" + ] + }, + "execution_count": 224, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessing_result = pipeline_end.fit_transform(train_df)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "preprocessed_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование набора моделей" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/transformers.py b/notebooks/transformers.py new file mode 100644 index 0000000..29e87ef --- /dev/null +++ b/notebooks/transformers.py @@ -0,0 +1,17 @@ +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin + + +class CarsFeatures(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + X["Age"] = 2020 - X["Prod. year"] + return X + + def get_feature_names_out(self, features_in): + return np.append(features_in, ["Age"], axis=0)