210 lines
9.1 KiB
Plaintext
210 lines
9.1 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Выбор бизнес-целей\n",
|
||
"### Задача регрессии:\n",
|
||
"\n",
|
||
"Цель: Предсказать цену автомобиля (Price) на основе других характеристик.\n",
|
||
"\n",
|
||
"Применение: Это может быть полезно для автосалонов, онлайн-площадок по продаже автомобилей, а также для частных лиц, которые хотят оценить рыночную стоимость своего автомобиля.\n",
|
||
"\n",
|
||
"### Задача классификации:\n",
|
||
"\n",
|
||
"Цель: Классифицировать автомобили по категориям (например, \"Эконом\", \"Средний\", \"Премиум\") на основе цены и других характеристик.\n",
|
||
"\n",
|
||
"Применение: Это может быть полезно для маркетинговых кампаний, определения целевой аудитории, а также для анализа рынка автомобилей."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ID Price Levy Manufacturer Model Prod. year Category \\\n",
|
||
"0 45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
|
||
"1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
|
||
"2 45774419 8467 - HONDA FIT 2006 Hatchback \n",
|
||
"3 45769185 3607 862 FORD Escape 2011 Jeep \n",
|
||
"4 45809263 11726 446 HONDA FIT 2014 Hatchback \n",
|
||
"\n",
|
||
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
|
||
"0 Yes Hybrid 3.5 186005 km 6.0 \n",
|
||
"1 No Petrol 3 192000 km 6.0 \n",
|
||
"2 No Petrol 1.3 200000 km 4.0 \n",
|
||
"3 Yes Hybrid 2.5 168966 km 4.0 \n",
|
||
"4 Yes Petrol 1.3 91901 km 4.0 \n",
|
||
"\n",
|
||
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
|
||
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
|
||
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
|
||
"2 Variator Front 04-May Right-hand drive Black 2 \n",
|
||
"3 Automatic 4x4 04-May Left wheel White 0 \n",
|
||
"4 Automatic Front 04-May Left wheel Silver 4 \n",
|
||
"Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n",
|
||
" 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n",
|
||
" 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
|
||
" 'Airbags'],\n",
|
||
" dtype='object')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\n",
|
||
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||
"from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report\n",
|
||
"# Load the CSV into the working DataFrame used by the following cells.\n",
|
||
"df = pd.read_csv(\n",
|
||
"    \"./static/csv/car_price_prediction.csv\"\n",
|
||
")\n",
|
||
"\n",
|
||
"# Preview the first rows, then list the column labels (one print, same output).\n",
|
||
"print(df.head(), df.columns, sep=\"\\n\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Предобработка данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"ID 0\n",
|
||
"Price 0\n",
|
||
"Levy 0\n",
|
||
"Manufacturer 0\n",
|
||
"Model 0\n",
|
||
"Prod. year 0\n",
|
||
"Category 0\n",
|
||
"Leather interior 0\n",
|
||
"Fuel type 0\n",
|
||
"Engine volume 0\n",
|
||
"Mileage 0\n",
|
||
"Cylinders 0\n",
|
||
"Gear box type 0\n",
|
||
"Drive wheels 0\n",
|
||
"Doors 0\n",
|
||
"Wheel 0\n",
|
||
"Color 0\n",
|
||
"Airbags 0\n",
|
||
"dtype: int64\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
|
||
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
|
||
"\n",
|
||
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
|
||
"\n",
|
||
"\n",
|
||
" df['Levy'].fillna(df['Levy'].median(), inplace=True)\n",
|
||
"C:\\Users\\Egor\\AppData\\Local\\Temp\\ipykernel_16964\\3618217033.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
|
||
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
|
||
"\n",
|
||
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
|
||
"\n",
|
||
"\n",
|
||
" df['Mileage'].fillna(df['Mileage'].median(), inplace=True)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Count missing values per column before any type conversion.\n",
|
||
"print(df.isnull().sum())\n",
|
||
"\n",
|
||
"# 'Levy' contains non-numeric placeholders (e.g. '-'); coerce them to NaN.\n",
|
||
"df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce')\n",
|
||
"\n",
|
||
"# 'Mileage' is text such as '186005 km': strip the unit and parse the column\n",
|
||
"# itself (it used to be overwritten with the 'Levy' values by mistake).\n",
|
||
"# astype(str) keeps the cell re-runnable once the column is already numeric.\n",
|
||
"df['Mileage'] = pd.to_numeric(\n",
|
||
"    df['Mileage'].astype(str).str.replace(' km', '', regex=False),\n",
|
||
"    errors='coerce',\n",
|
||
")\n",
|
||
"\n",
|
||
"# Fill the gaps with each column's median. Assigning the result back avoids\n",
|
||
"# chained in-place assignment, which stops working in pandas 3.0.\n",
|
||
"df['Levy'] = df['Levy'].fillna(df['Levy'].median())\n",
|
||
"df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Column groups: features listed as numeric vs. those to be one-hot encoded.\n",
|
||
"numeric_features = [\n",
|
||
"    'Levy', 'Prod. year', 'Engine volume',\n",
|
||
"    'Mileage', 'Cylinders', 'Airbags',\n",
|
||
"]\n",
|
||
"categorical_features = [\n",
|
||
"    'Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',\n",
|
||
"    'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Replace every categorical column with indicator columns; drop_first=True\n",
|
||
"# omits the first level of each feature.\n",
|
||
"df = pd.get_dummies(data=df, columns=categorical_features, drop_first=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Разделение данных на тренировочный и тестовый наборы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Regression task: predict 'Price' from the remaining columns.\n",
|
||
"# 'Category' is dropped as well (errors='ignore' because it is absent on the\n",
|
||
"# first run, after one-hot encoding): re-running this cell would otherwise leak\n",
|
||
"# the price-derived class label created below into the regression features.\n",
|
||
"X_reg = df.drop(columns=['ID', 'Price', 'Category'], errors='ignore')\n",
|
||
"y_reg = df['Price']\n",
|
||
"\n",
|
||
"# Classification task: bucket 'Price' into three classes.\n",
|
||
"# include_lowest=True keeps a price of exactly 0 in the first bucket instead\n",
|
||
"# of turning it into a NaN label.\n",
|
||
"df['Category'] = pd.cut(\n",
|
||
"    df['Price'],\n",
|
||
"    bins=[0, 10000, 20000, np.inf],\n",
|
||
"    labels=['Эконом', 'Средний', 'Премиум'],\n",
|
||
"    include_lowest=True,\n",
|
||
")\n",
|
||
"X_class = df.drop(columns=['ID', 'Price', 'Category'])\n",
|
||
"y_class = df['Category']\n",
|
||
"\n",
|
||
"# Hold out 20% of the rows for testing; the fixed seed keeps both splits reproducible.\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|