{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Выбор бизнес-целей\n",
    "### Задача регрессии:\n",
    "\n",
    "Цель: Предсказать цену автомобиля (Price) на основе других характеристик.\n",
    "\n",
    "Применение: Это может быть полезно для автосалонов, онлайн-площадок по продаже автомобилей, а также для частных лиц, которые хотят оценить рыночную стоимость своего автомобиля.\n",
    "\n",
    "Задача классификации:\n",
    "\n",
    "Цель: Классифицировать автомобили по категориям (например, \"Эконом\", \"Средний\", \"Премиум\") на основе цены и других характеристик.\n",
    "\n",
    "Применение: Это может быть полезно для маркетинговых кампаний, определения целевой аудитории, а также для анализа рынка автомобилей."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         ID  Price  Levy Manufacturer    Model  Prod. year   Category  \\\n",
      "0  45654403  13328  1399        LEXUS   RX 450        2010       Jeep   \n",
      "1  44731507  16621  1018    CHEVROLET  Equinox        2011       Jeep   \n",
      "2  45774419   8467     -        HONDA      FIT        2006  Hatchback   \n",
      "3  45769185   3607   862         FORD   Escape        2011       Jeep   \n",
      "4  45809263  11726   446        HONDA      FIT        2014  Hatchback   \n",
      "\n",
      "  Leather interior Fuel type Engine volume    Mileage  Cylinders  \\\n",
      "0              Yes    Hybrid           3.5  186005 km        6.0   \n",
      "1               No    Petrol             3  192000 km        6.0   \n",
      "2               No    Petrol           1.3  200000 km        4.0   \n",
      "3              Yes    Hybrid           2.5  168966 km        4.0   \n",
      "4              Yes    Petrol           1.3   91901 km        4.0   \n",
      "\n",
      "  Gear box type Drive wheels   Doors             Wheel   Color  Airbags  \n",
      "0     Automatic          4x4  04-May        Left wheel  Silver       12  \n",
      "1     Tiptronic          4x4  04-May        Left wheel   Black        8  \n",
      "2      Variator        Front  04-May  Right-hand drive   Black        2  \n",
      "3     Automatic          4x4  04-May        Left wheel   White        0  \n",
      "4     Automatic        Front  04-May        Left wheel  Silver        4  \n",
      "Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n",
      "       'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n",
      "       'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
      "       'Airbags'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
    "from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
    "from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report\n",
    "df = pd.read_csv(\"./static/csv/car_price_prediction.csv\")\n",
    "print(df.head())\n",
    "print(df.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Предобработка данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID                  0\n",
      "Price               0\n",
      "Levy                0\n",
      "Manufacturer        0\n",
      "Model               0\n",
      "Prod. year          0\n",
      "Category            0\n",
      "Leather interior    0\n",
      "Fuel type           0\n",
      "Engine volume       0\n",
      "Mileage             0\n",
      "Cylinders           0\n",
      "Gear box type       0\n",
      "Drive wheels        0\n",
      "Doors               0\n",
      "Wheel               0\n",
      "Color               0\n",
      "Airbags             0\n",
      "dtype: int64\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  df['Levy'].fillna(df['Levy'].median(), inplace=True)\n",
      "C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  df['Mileage'].fillna(df['Mileage'].median(), inplace=True)\n"
     ]
    }
   ],
   "source": [
    "# Проверка наличия пропущенных значений\n",
    "print(df.isnull().sum())\n",
    "\n",
    "# Очистка столбца 'Levy' от нечисловых значений\n",
    "df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')\n",
    "df['Mileage'] = pd.to_numeric(df['Levy'], errors='coerce')\n",
    "\n",
    "# Заполнение пропущенных значений\n",
    "df['Levy'].fillna(df['Levy'].median(), inplace=True)\n",
    "df['Mileage'].fillna(df['Mileage'].median(), inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Определение числовых и категориальных признаков\n",
    "numeric_features = ['Levy', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']\n",
    "categorical_features = ['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color']\n",
    "\n",
    "# Преобразование категориальных признаков в числовые\n",
    "df = pd.get_dummies(df, columns=categorical_features, drop_first=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Разделение данных на тренировочный и тестовый наборы"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Задача регрессии\n",
    "X_reg = df.drop(['ID', 'Price'], axis=1)\n",
    "y_reg = df['Price']\n",
    "\n",
    "# Задача классификации\n",
    "df['Category'] = pd.cut(df['Price'], bins=[0, 10000, 20000, np.inf], labels=['Эконом', 'Средний', 'Премиум'])\n",
    "X_class = df.drop(['ID', 'Price', 'Category'], axis=1)\n",
    "y_class = df['Category']\n",
    "\n",
    "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
    "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}