{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Выбор бизнес-целей\n", "### Задача регрессии:\n", "\n", "Цель: Предсказать цену автомобиля (Price) на основе других характеристик.\n", "\n", "Применение: Это может быть полезно для автосалонов, онлайн-площадок по продаже автомобилей, а также для частных лиц, которые хотят оценить рыночную стоимость своего автомобиля.\n", "\n", "### Задача классификации:\n", "\n", "Цель: Классифицировать автомобили по ценовым категориям (например, \"Эконом\", \"Средний\", \"Премиум\") на основе их характеристик; сама категория определяется диапазоном цены.\n", "\n", "Применение: Это может быть полезно для маркетинговых кампаний, определения целевой аудитории, а также для анализа рынка автомобилей." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ID Price Levy Manufacturer Model Prod. year Category \\\n", "0 45654403 13328 1399 LEXUS RX 450 2010 Jeep \n", "1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n", "2 45774419 8467 - HONDA FIT 2006 Hatchback \n", "3 45769185 3607 862 FORD Escape 2011 Jeep \n", "4 45809263 11726 446 HONDA FIT 2014 Hatchback \n", "\n", " Leather interior Fuel type Engine volume Mileage Cylinders \\\n", "0 Yes Hybrid 3.5 186005 km 6.0 \n", "1 No Petrol 3 192000 km 6.0 \n", "2 No Petrol 1.3 200000 km 4.0 \n", "3 Yes Hybrid 2.5 168966 km 4.0 \n", "4 Yes Petrol 1.3 91901 km 4.0 \n", "\n", " Gear box type Drive wheels Doors Wheel Color Airbags \n", "0 Automatic 4x4 04-May Left wheel Silver 12 \n", "1 Tiptronic 4x4 04-May Left wheel Black 8 \n", "2 Variator Front 04-May Right-hand drive Black 2 \n", "3 Automatic 4x4 04-May Left wheel White 0 \n", "4 Automatic Front 04-May Left wheel Silver 4 \n", "Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. 
year',\n", " 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n", " 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n", " 'Airbags'],\n", " dtype='object')\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import LinearRegression, LogisticRegression\n", "from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier\n", "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", "from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report\n", "\n", "# Load the car listings, then print the first rows followed by the column names\n", "DATA_PATH = \"./static/csv/car_price_prediction.csv\"\n", "df = pd.read_csv(DATA_PATH)\n", "print(df.head(), df.columns, sep='\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Предобработка данных" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ID 0\n", "Price 0\n", "Levy 0\n", "Manufacturer 0\n", "Model 0\n", "Prod. year 0\n", "Category 0\n", "Leather interior 0\n", "Fuel type 0\n", "Engine volume 0\n", "Mileage 0\n", "Cylinders 0\n", "Gear box type 0\n", "Drive wheels 0\n", "Doors 0\n", "Wheel 0\n", "Color 0\n", "Airbags 0\n", "dtype: int64\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " df['Levy'].fillna(df['Levy'].median(), inplace=True)\n", "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " df['Mileage'].fillna(df['Mileage'].median(), inplace=True)\n" ] } ], "source": [ "# Count missing values per column (text placeholders such as '-' in 'Levy' are not NaN yet)\n", "print(df.isnull().sum())\n", "\n", "# 'Levy' marks missing entries with '-': coerce anything non-numeric to NaN\n", "df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')\n", "\n", "# 'Mileage' is stored as text such as '186005 km': strip the unit so the column's own values parse as numbers\n", "mileage_text = df['Mileage'].astype(str).str.replace('km', '', regex=False).str.strip()\n", "df['Mileage'] = pd.to_numeric(mileage_text, errors='coerce')\n", "\n", "# Fill the gaps with the column median; assigning the result back avoids the\n", "# chained inplace fillna that stops working in pandas 3.0\n", "df['Levy'] = df['Levy'].fillna(df['Levy'].median())\n", "df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Lists of numeric and categorical feature columns\n", "numeric_features = ['Levy', 'Prod. 
year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']\n", "categorical_features = ['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color']\n", "\n", "# One-hot encode the categorical features (the first level of each is dropped)\n", "df = pd.get_dummies(df, columns=categorical_features, drop_first=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Разделение данных на тренировочный и тестовый наборы" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Regression task: predict 'Price' from the remaining columns.\n", "# 'Category' (the price class added below) is dropped when it is present, so\n", "# re-running this cell does not put the price-derived label into the features.\n", "X_reg = df.drop(columns=['ID', 'Price', 'Category'], errors='ignore')\n", "y_reg = df['Price']\n", "\n", "# Classification task: bin 'Price' into three classes.\n", "# include_lowest=True keeps a price of exactly 0 in the first class instead of NaN.\n", "df['Category'] = pd.cut(df['Price'], bins=[0, 10000, 20000, np.inf], labels=['Эконом', 'Средний', 'Премиум'], include_lowest=True)\n", "X_class = df.drop(columns=['ID', 'Price', 'Category'])\n", "y_class = df['Category']\n", "\n", "# Both tasks use the same test_size and random_state\n", "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n", "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }