1466 lines
399 KiB
Plaintext
1466 lines
399 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Загрузка данных из файла"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 97,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Unnamed: 0\n",
|
|||
|
"Name\n",
|
|||
|
"Rating\n",
|
|||
|
"Spec_score\n",
|
|||
|
"No_of_sim\n",
|
|||
|
"Ram\n",
|
|||
|
"Battery\n",
|
|||
|
"Display\n",
|
|||
|
"Camera\n",
|
|||
|
"External_Memory\n",
|
|||
|
"Android_version\n",
|
|||
|
"Price\n",
|
|||
|
"company\n",
|
|||
|
"Inbuilt_memory\n",
|
|||
|
"fast_charging\n",
|
|||
|
"Screen_resolution\n",
|
|||
|
"Processor\n",
|
|||
|
"Processor_name\n",
|
|||
|
" Unnamed: 0 Name Rating Spec_score \\\n",
|
|||
|
"0 0 Samsung Galaxy F14 5G 4.65 68 \n",
|
|||
|
"1 1 Samsung Galaxy A11 4.20 63 \n",
|
|||
|
"2 2 Samsung Galaxy A13 4.30 75 \n",
|
|||
|
"3 3 Samsung Galaxy F23 4.10 73 \n",
|
|||
|
"4 4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 \n",
|
|||
|
"5 5 Samsung Galaxy M13 5G 4.40 75 \n",
|
|||
|
"6 6 Samsung Galaxy M21 2021 4.10 76 \n",
|
|||
|
"7 7 Samsung Galaxy A12 4.10 71 \n",
|
|||
|
"8 8 Samsung Galaxy A14 5G 4.05 75 \n",
|
|||
|
"9 9 Samsung Galaxy M13 4.50 75 \n",
|
|||
|
"\n",
|
|||
|
" No_of_sim Ram Battery Display \\\n",
|
|||
|
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
|
|||
|
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
|
|||
|
"2 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
|
|||
|
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
|
|||
|
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
|
|||
|
"5 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.5 inches \n",
|
|||
|
"6 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
|
|||
|
"7 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
|
|||
|
"8 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
|
|||
|
"9 Dual Sim, 3G, 4G, VoLTE, 6 GB RAM 6000 mAh Battery 6.6 inches \n",
|
|||
|
"\n",
|
|||
|
" Camera \\\n",
|
|||
|
"0 50 MP + 2 MP Dual Rear & 13 MP Front Camera \n",
|
|||
|
"1 13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
|||
|
"2 50 MP Quad Rear & 8 MP Front Camera \n",
|
|||
|
"3 48 MP Quad Rear & 13 MP Front Camera \n",
|
|||
|
"4 13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro... \n",
|
|||
|
"5 50 MP + 2 MP Dual Rear & 5 MP Front Camera \n",
|
|||
|
"6 48 MP + 8 MP + 5 MP Triple Rear & 20 MP Fr... \n",
|
|||
|
"7 48 MP Quad Rear & 8 MP Front Camera \n",
|
|||
|
"8 50 MP + 2 MP + 2 MP Triple Rear & 13 MP Fr... \n",
|
|||
|
"9 50 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
|||
|
"\n",
|
|||
|
" External_Memory Android_version Price company \\\n",
|
|||
|
"0 Memory Card Supported, upto 1 TB 13 9,999 Samsung \n",
|
|||
|
"1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n",
|
|||
|
"2 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
|
|||
|
"3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
|
|||
|
"4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n",
|
|||
|
"5 Memory Card Supported, upto 1 TB 12 11,990 Samsung \n",
|
|||
|
"6 Memory Card Supported, upto 512 GB 11 11,990 Samsung \n",
|
|||
|
"7 Memory Card Supported 10 11,990 Samsung \n",
|
|||
|
"8 Memory Card Supported, upto 1 TB 13 11,599 Samsung \n",
|
|||
|
"9 Memory Card Supported, upto 1 TB 12 12,298 Samsung \n",
|
|||
|
"\n",
|
|||
|
" Inbuilt_memory fast_charging \\\n",
|
|||
|
"0 128 GB inbuilt 25W Fast Charging \n",
|
|||
|
"1 32 GB inbuilt 15W Fast Charging \n",
|
|||
|
"2 64 GB inbuilt 25W Fast Charging \n",
|
|||
|
"3 64 GB inbuilt NaN \n",
|
|||
|
"4 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"5 128 GB inbuilt 15W Fast Charging \n",
|
|||
|
"6 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"7 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"8 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"9 128 GB inbuilt 15W Fast Charging \n",
|
|||
|
"\n",
|
|||
|
" Screen_resolution Processor \\\n",
|
|||
|
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
|
|||
|
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
|
|||
|
"2 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor \n",
|
|||
|
"3 720 x 1600 px Octa Core \n",
|
|||
|
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"5 720 x 1600 px Octa Core \n",
|
|||
|
"6 1080 x 2340 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"7 720 x 1560 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"8 1080 x 2408 px Octa Core \n",
|
|||
|
"9 1080 x 2400 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"\n",
|
|||
|
" Processor_name \n",
|
|||
|
"0 Exynos 1330 \n",
|
|||
|
"1 Octa Core \n",
|
|||
|
"2 Octa Core \n",
|
|||
|
"3 Helio G88 \n",
|
|||
|
"4 Helio P35 \n",
|
|||
|
"5 Dimensity 700 \n",
|
|||
|
"6 Exynos 9611 \n",
|
|||
|
"7 Helio P35 \n",
|
|||
|
"8 Exynos 1330 \n",
|
|||
|
"9 Exynos 850 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"df = pd.read_csv(\"../static/csv/mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"attributes = df.columns\n",
|
|||
|
"for attribute in attributes:\n",
|
|||
|
" print(attribute)\n",
|
|||
|
" # Вывод первых 10 строк\n",
|
|||
|
"print(df.head(10))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Бизнес-цели\n",
|
|||
|
"1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n",
|
|||
|
"2. Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Подготовка данных."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пропущенные данные по каждому столбцу:\n",
|
|||
|
"Unnamed: 0 0\n",
|
|||
|
"Name 0\n",
|
|||
|
"Rating 0\n",
|
|||
|
"Spec_score 0\n",
|
|||
|
"No_of_sim 0\n",
|
|||
|
"Ram 0\n",
|
|||
|
"Battery 0\n",
|
|||
|
"Display 0\n",
|
|||
|
"Camera 0\n",
|
|||
|
"External_Memory 0\n",
|
|||
|
"Android_version 443\n",
|
|||
|
"Price 0\n",
|
|||
|
"company 0\n",
|
|||
|
"Inbuilt_memory 19\n",
|
|||
|
"fast_charging 89\n",
|
|||
|
"Screen_resolution 2\n",
|
|||
|
"Processor 28\n",
|
|||
|
"Processor_name 0\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на пропущенные значения\n",
|
|||
|
"missing_data = df.isnull().sum()\n",
|
|||
|
"print(\"Пропущенные данные по каждому столбцу:\")\n",
|
|||
|
"print(missing_data)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"При проверке на шум можно заметить выброс при значении Spec_score, равном 75: цена там запредельная.\n",
|
|||
|
"\n",
|
|||
|
"Для удаления выбросов из датасета можно использовать метод межквартильного размаха. Зашумленность не очень высокая. Покрытие данных высокое, и по актуальности данные подходят для поставленной задачи.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" Name Rating Spec_score \\\n",
|
|||
|
"0 Samsung Galaxy F14 5G 4.65 68 \n",
|
|||
|
"1 Samsung Galaxy A11 4.20 63 \n",
|
|||
|
"2 Samsung Galaxy A13 4.30 75 \n",
|
|||
|
"3 Samsung Galaxy F23 4.10 73 \n",
|
|||
|
"4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 \n",
|
|||
|
"5 Samsung Galaxy M13 5G 4.40 75 \n",
|
|||
|
"6 Samsung Galaxy M21 2021 4.10 76 \n",
|
|||
|
"7 Samsung Galaxy A12 4.10 71 \n",
|
|||
|
"8 Samsung Galaxy A14 5G 4.05 75 \n",
|
|||
|
"9 Samsung Galaxy M13 4.50 75 \n",
|
|||
|
"\n",
|
|||
|
" No_of_sim Ram Battery Display \\\n",
|
|||
|
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
|
|||
|
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
|
|||
|
"2 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
|
|||
|
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
|
|||
|
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
|
|||
|
"5 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.5 inches \n",
|
|||
|
"6 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
|
|||
|
"7 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
|
|||
|
"8 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
|
|||
|
"9 Dual Sim, 3G, 4G, VoLTE, 6 GB RAM 6000 mAh Battery 6.6 inches \n",
|
|||
|
"\n",
|
|||
|
" Camera \\\n",
|
|||
|
"0 50 MP + 2 MP Dual Rear & 13 MP Front Camera \n",
|
|||
|
"1 13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
|||
|
"2 50 MP Quad Rear & 8 MP Front Camera \n",
|
|||
|
"3 48 MP Quad Rear & 13 MP Front Camera \n",
|
|||
|
"4 13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro... \n",
|
|||
|
"5 50 MP + 2 MP Dual Rear & 5 MP Front Camera \n",
|
|||
|
"6 48 MP + 8 MP + 5 MP Triple Rear & 20 MP Fr... \n",
|
|||
|
"7 48 MP Quad Rear & 8 MP Front Camera \n",
|
|||
|
"8 50 MP + 2 MP + 2 MP Triple Rear & 13 MP Fr... \n",
|
|||
|
"9 50 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
|||
|
"\n",
|
|||
|
" External_Memory Android_version Price company \\\n",
|
|||
|
"0 Memory Card Supported, upto 1 TB 13 9999.0 Samsung \n",
|
|||
|
"1 Memory Card Supported, upto 512 GB 10 9990.0 Samsung \n",
|
|||
|
"2 Memory Card Supported, upto 1 TB 12 11999.0 Samsung \n",
|
|||
|
"3 Memory Card Supported, upto 1 TB 12 11999.0 Samsung \n",
|
|||
|
"4 Memory Card Supported, upto 1 TB 11 11999.0 Samsung \n",
|
|||
|
"5 Memory Card Supported, upto 1 TB 12 11990.0 Samsung \n",
|
|||
|
"6 Memory Card Supported, upto 512 GB 11 11990.0 Samsung \n",
|
|||
|
"7 Memory Card Supported 10 11990.0 Samsung \n",
|
|||
|
"8 Memory Card Supported, upto 1 TB 13 11599.0 Samsung \n",
|
|||
|
"9 Memory Card Supported, upto 1 TB 12 12298.0 Samsung \n",
|
|||
|
"\n",
|
|||
|
" Inbuilt_memory fast_charging \\\n",
|
|||
|
"0 128 GB inbuilt 25W Fast Charging \n",
|
|||
|
"1 32 GB inbuilt 15W Fast Charging \n",
|
|||
|
"2 64 GB inbuilt 25W Fast Charging \n",
|
|||
|
"3 64 GB inbuilt NaN \n",
|
|||
|
"4 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"5 128 GB inbuilt 15W Fast Charging \n",
|
|||
|
"6 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"7 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"8 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"9 128 GB inbuilt 15W Fast Charging \n",
|
|||
|
"\n",
|
|||
|
" Screen_resolution Processor \\\n",
|
|||
|
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
|
|||
|
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
|
|||
|
"2 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor \n",
|
|||
|
"3 720 x 1600 px Octa Core \n",
|
|||
|
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"5 720 x 1600 px Octa Core \n",
|
|||
|
"6 1080 x 2340 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"7 720 x 1560 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"8 1080 x 2408 px Octa Core \n",
|
|||
|
"9 1080 x 2400 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"\n",
|
|||
|
" Processor_name \n",
|
|||
|
"0 Exynos 1330 \n",
|
|||
|
"1 Octa Core \n",
|
|||
|
"2 Octa Core \n",
|
|||
|
"3 Helio G88 \n",
|
|||
|
"4 Helio P35 \n",
|
|||
|
"5 Dimensity 700 \n",
|
|||
|
"6 Exynos 9611 \n",
|
|||
|
"7 Helio P35 \n",
|
|||
|
"8 Exynos 1330 \n",
|
|||
|
"9 Exynos 850 \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA3AAAAINCAYAAAB/IZ18AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAClRElEQVR4nOzdfXxT9dk/8E/SJE3SNmltpFClWG0FhBZQFLWFTWUiohvYuYluN09u95ygTjcBFUURBJx7AHVucz7s/glObwab+LBbZYogQ8Uqz0iVAa5QaGmT5ql5/P2BiSTn5KTntElPks/79fJ13+ac0+/35KQuV6/v97o04XA4DCIiIiIiIlI9bV9PgIiIiIiIiLqHARwREREREVGGYABHRERERESUIRjAERERERERZQgGcERERERERBmCARwREREREVGGYABHRERERESUIRjAERERERERZQgGcGkUDofhcDjA3ulERERERKQEA7g06uzshNVqRWdnZ19PhYiIiIiIMhADOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMgQDOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMgQDOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMgQDOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMoSurydARERERETZw+72odXpg8Prh8Wkh63AAKvZ0NfTyhoM4IiIiIiIqFc0d3gwd812vLe/NfrauGobljbUorzY1Iczyx5cQklERERERD1md/sEwRsAbNzfinlrtsPu9vXRzLILAzgiIiIiIuqxVqdPELxFbNzfilYnA7jewACOiIiIiIh6zOH1Sx7vTHKcuocBHBERERER9ZjFqJc8XpTkOHUPAzgiIiIiIuoxW6EB46ptosfGVdtgK2Qlyt7AAI6IiIiIiHrMajZgaUOtIIgbV23DsoZathLoJZpwOBzu60nkCofDAavVCrvdDovF0tfTISIiIiLqdZE+cJ1eP4qMetgK2QeuN7EPHBERERER9RqrmQFbKnEJJRERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGaJPA7hHHnkEF154IYqKitCvXz9MnjwZ+/btiznH6/Xi1ltvRWlpKQoLC9HQ0ICWlpaYcw4dOoRJkybBbDajX79++MUvfoFAIBBzzjvvvIPzzz8f+fn5qKqqwnPPPSeYzxNPPIGzzjoLRqMRY8aMwQcffCB7LkREREREJI/d7cPnx5xoPNSOz487YXf7+npKqtWnAdy7776LW2+9Ff/617/w5ptvwu/348orr4TL5Yqe87Of/QyvvPIKXn75Zbz77rtobm7GddddFz0eDAYxadIk+Hw+vP/++3j++efx3HPP4f7774+ec+DAAUyaNAmXXXYZPvnkE9xxxx24+eab8Y9//CN6zl/+8hfceeedeOCBB/Dxxx9jxIgRmDBhAo4dO9btuRARERERkTzNHR7MXt2IK371LqY8+T6ueOxdzFndiOYOT19PTZU04XA43NeTiDh+/Dj69euHd999F+PGjYPdbsfpp5+OVatW4bvf/S4AYO/evRg6dCi2bNmCiy++GK+//jquueYaNDc3o6ysDADw1FNPYe7cuTh+/DgMBgPmzp2LV199FTt37oyOdcMNN6CjowNvvPEGAGDMmDG48MIL8fjjjwMAQqEQBg4ciDlz5mDevHndmksyDo
cDVqsVdrsdFoulV987IiIiIqJMY3f7MHt1I97b3yo4Nq7ahpVTR8FqNvTBzNRLVXvg7HY7AOC0004DAGzbtg1+vx/jx4+PnjNkyBBUVFRgy5YtAIAtW7agpqYmGrwBwIQJE+BwOLBr167oOaf+jMg5kZ/h8/mwbdu2mHO0Wi3Gjx8fPac7c4nX1dUFh8MR8w8REREREZ3U6vSJBm8AsHF/K1qdXEoZTzUBXCgUwh133IG6ujoMHz4cAHD06FEYDAYUFxfHnFtWVoajR49Gzzk1eIscjxyTOsfhcMDj8aC1tRXBYFD0nFN/RrK5xHvkkUdgtVqj/wwcOLCb7wYRERERUfZzeP2SxzuTHM9Fqgngbr31VuzcuRMvvvhiX0+l18yfPx92uz36z+HDh/t6SkREREREqmEx6iWPFyU5notUEcDNnj0b69evxz//+U+ceeaZ0df79+8Pn8+Hjo6OmPNbWlrQv3//6DnxlSAj/57sHIvFApPJBJvNhry8PNFzTv0ZyeYSLz8/HxaLJeYfIiIiIiI6yVZowLhqm+ixcdU22Aq5/y1enwZw4XAYs2fPxtq1a7FhwwZUVlbGHL/gggug1+vx9ttvR1/bt28fDh06hEsuuQQAcMkll2DHjh0x1SLffPNNWCwWnHfeedFzTv0ZkXMiP8NgMOCCCy6IOScUCuHtt9+OntOduRARERERUfdZzQYsbagVBHHjqm1Y1lDLAiYi+rQK5U9/+lOsWrUKf/vb3zB48ODo61arFSaTCQBwyy234LXXXsNzzz0Hi8WCOXPmAADef/99ACfbCIwcORLl5eVYvnw5jh49ih/+8Ie4+eabsWTJEgAn2wgMHz4ct956K2bOnIkNGzbgtttuw6uvvooJEyYAONlGYNq0afj973+Piy66CL/5zW/w0ksvYe/evdG9ccnmkgyrUBIRERERCdndPrQ6fej0+lFk1MNWaGDwlkCfBnAajUb09WeffRbTp08HcLJ59l133YXVq1ejq6sLEyZMwJNPPhmzbPHgwYO45ZZb8M4776CgoADTpk3D0qVLodPpoue88847+NnPfobdu3fjzDPPxIIFC6JjRDz++ON49NFHcfToUYwcORIrVqzAmDFjose7MxcpDOCIiIiIiKgnVNUHLtsxgCMiIiIiop5QRRETIiIiIiIiSo4BHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGUKX/BQiIiIiIspFkf5sDq8fFpMetgL2Z+trDOCIiIiIiEigucODuWu24739rdHXxlXbsLShFuXFpj6cWW7jEkoiIiIiIophd/sEwRsAbNzfinlrtsPu9vXRzIgBHBERERERxWh1+gTBW8TG/a1odTKA6ysM4IiIiIiIKIbD65c83pnkOKUOAzgiIiIiIophMeoljxclOU6pwwCOiIiIiIhi2AoNGFdtEz02rtoGWyErUfYVBnBERERERBTDajZgaUOtIIgbV23DsoZathLoQ5pwOBzu60nkCofDAavVCrvdDovF0tfTISIiIqIcI7evW+T8Tq8fRUY9bIXsA9fX2AeOiIiIiCgHKOnrZjUzYFMbLqEkIiIiIspy7OuWPRjAERERERFlOfZ1yx4M4IiIiIiIshz7umUPBnBERERERFmOfd2yBwM4IiIiIqIsx75u2YMBHBERERFRlrOaDXh48nDUV5XGvF5fVYqHJw9npckMwj5wacQ+cERERETUF+xuH+56+VMMGWDBqIHF6AqEkK/TovFwB/YdceCX149gEJch2AeOiIiIiCjLtTp9eGvPMby151jC4wzgMgOXUBIRERERZTlWocweDOCIiIiIiLIcq1BmDwZwRERERERZjlUoswcDOCIiIiKiLGc1G7C0oVYQxI2rtmFZQy33v2UQVqFMI1ahJCIiIqK+ZHf70Or0odPrR5FRD1uhgcFbhmEVSiIiIiKiHGE1M2DLdAzgiIiIiIgyUCSb5vD6YTHpYStgcJYLGM
AREREREWWY5g4P5q7Zjvf2t0ZfG1dtw9KGWpQXm/pwZpRqLGJCRERERJRB7G6fIHgDgI37WzFvzXbY3b4+mhmlAwM
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"../static/csv/mobile phone price prediction.csv\",delimiter=',')\n",
|
|||
|
"df.drop(['Unnamed: 0'], axis=1, inplace=True)\n",
|
|||
|
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
|
|||
|
"df.describe(include='all')\n",
|
|||
|
"f, ax = plt.subplots(figsize=(10,6))\n",
|
|||
|
"sns.despine(f)\n",
|
|||
|
"sns.scatterplot(data=df, x='Spec_score', y='Price')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXwU9f0/8NfM7M7e2dwH4QiEI1wtBREBKVaRSD3qfVYBr9YC1qtVf16oX0WlilIVqlVAaivaqq22CAiiFtCieAUJBCQm5IQcu5u9Zmfm8/sj7pLNQXaTSXaSvJ+PRx6a3eGz77k2+97P5/P+cIwxBkIIIYQQQgghvY5PdACEEEIIIYQQMlBRQkYIIYQQQgghCUIJGSGEEEIIIYQkCCVkhBBCCCGEEJIglJARQgghhBBCSIJQQkYIIYQQQgghCUIJGSGEEEIIIYQkCCVkhBBCCCGEEJIglJARQgghhBBCSIJQQkYI0aXS0lJwHIe1a9cmOpQo7733HiZNmgSz2QyO49DY2JjokEg/wHEcli5d2uuvq6oqJkyYgEceeaTLbaxfvx4FBQUwGo1ITk7WLjjSxmmnnYbTTjst0WH0qry8PCxYsCDy+/bt28FxHLZv397rsZxyyin4/e9/3+uvS/o/SsgI6WXffPMNLr74YgwbNgxmsxm5ubk488wz8cc//rHHXvOvf/0rnn766TaPV1ZWYunSpfjyyy977LVbC/8xDf8YjUaMGDEC11xzDb777jtNXmPnzp1YunSp5slSXV0dLr30UlgsFjz33HNYv349bDZbh9sn4lyT3nfaaadFXdOpqamYOnUqXn75ZaiqmujwTuhvf/sbysvLsXjx4naff/7558FxHKZNm9bu88XFxViwYAHy8/Px4osv4oUXXoDP58PSpUt79QNz+Auc8A/P80hNTcW8efOwa9euXoujvwuFQli5ciWmTp0Kh8MBu92OqVOnYuXKlQiFQl1ut6fes7V255134rnnnkN1dXWiQyH9DSOE9JodO3YwURTZyJEj2cMPP8xefPFFdv/997O5c+ey/Pz8Hnvds88+mw0bNqzN47t372YA2Jo1a3rstVv74IMPGAB28803s/Xr17OXX36ZLV68mImiyFJTU1lFRQVjjLHDhw93Obbly5czAOzw4cOaxr5x40YGgG3ZsqXTbRN1rknvmz17Nhs8eDBbv349W79+PXvqqafYpEmTGAB25513xtSG3+9noVCohyNt68c//jG78cYbO3x+xowZLC8vjwFgJSUlbZ5ftWpVm+eOHj3KALAHHnigJ0JuV/j94oorrmDr169na9euZf/v//0/lpyczEwmE/v66697LZaeFAwGWTAYTMhrNzU1sdmzZzMA7JxzzmHPPvsse/7559l5553HALDZs2ezpqamLrV9ovfsYcOGsfnz50d+VxSF+f1+pihKF/ek6xRFYdnZ2ey+++7r9dcm/ZshIVkgIQPUI488AqfTid27d7cZ2lNbW5uYoHqA1+s9Yc8RAMyaNQsXX3wxAGDhwoUYPXo0br75Zqxbtw533313b4QZt/A5imVY1kA51z3N5/PBarUmOoxOOZ1O/PKXv4z8/qtf/QpjxozBs88+i4cffhhGo7HNv1FVFZIkwWw2w2w292a4AIAvvvgCX331FZ588sl2nz98+DB27tyJN998E7/61a/w6quv4oEHHojaJp57ortieV+ZPHly1HmYNWsW5s2bh1WrVuH555/v6RB7nCiKCXvt2267DR9++CH++Mc/RvWo3nTTTXjuueewePFi3HHHHVi1alWPxsHzvKb3SyzXVcvXvvjii/HKK6/gwQcfBMdxmsVBBrhEZ4SEDCRjxoxhp512Wszbr1+/nk2dOpVZLBaWnJzMZs2axTZt2hR5/u2332Y///nPWU5ODhNFkY0YMYI99NBDTJblyDbhbzRb/gwbNizSU9X6p2WP1CeffMIKCwtZUlISs1gs7Kc//S
n773//GxXjAw88wACwvXv3siuuuIIlJyezSZMmdbhP4dd94403oh4vKipiANgNN9zAGOu4h2zr1q3s1FNPZVarlTmdTnbeeeexb7/9tk08rX866y17/fXX2eTJk5nZbGZpaWnsqquuYkeOHDnhcWz5rW1r8ZxrAGzRokXsL3/5Cxs9ejQzmUxs8uTJ7MMPP2yz7ZEjR9jChQtZZmYmE0WRjRs3jr300ktttvP7/eyBBx5go0aNYiaTiWVnZ7MLLriAHTx4MKaYGGPswIED7MILL2RZWVnMZDKx3Nxcdtlll7HGxsao7Tq7Thlj7LnnnmPjxo1joiiynJwc9pvf/IY1NDREbTN79mw2fvx49tlnn7FZs2Yxi8XCfvvb3zLGGAsEAuz+++9n+fn5TBRFNnjwYPa73/2OBQKBE+7DokWLmM1mY16vt81zl19+OcvKyorcL7t372Zz585laWlpzGw2s7y8PLZw4cJOj1M47tYuvvhiBiDS69vyPI8bN44ZDAb21ltvRZ5r3aN05MgRdu2110bu77y8PPbrX/86qoekoaGB/fa3v2WDBw9moiiy/Px89thjj8XUe3D//fczURSZJEntPv/www+zlJQUFgwG2U033cRGjRoV9fywYcPavSfau/9a7tu+ffvYRRddxFJSUpjJZGJTpkxh//znP6PaXrNmDQPAtm/fzm666SaWkZHBkpOTO9yX8PvF8uXLox5vampiANjcuXOjHo/luLVs89lnn2XDhw9nFouFnXnmmaysrIypqsoeeughlpuby8xmMzvvvPNYXV1dm9g6u/bjuUZnz57NZs+eHXk+/H66YcMG9n//938sNzeXmUwmdvrpp7fboxneD7PZzKZOnco++uijNm22p7y8nAmCwE4//fQOt/nZz37GDAYDKy8vjzp+7Y1yaHlNdPae3bqHLLzPH3zwQVSb3f17VVVVxRYsWMByc3OZKIosOzubnXfeeW3+dvzzn/9kANiePXtOeMwIiQf1kBHSi4YNG4Zdu3ahqKgIEyZMOOG2Dz74IJYuXYoZM2bgoYcegiiK+PTTT7Ft2zbMnTsXALB27VrY7XbcdtttsNvt2LZtG+6//3643W4sX74cAHDPPffA5XLhyJEjWLFiBQDAbrdj7NixeOihh3D//ffjxhtvxKxZswAAM2bMAABs27YN8+bNw5QpU/DAAw+A53msWbMGp59+Oj7++GOcfPLJUfFecsklGDVqFB599FEwxuI+NocOHQIApKWldbjN+++/j3nz5mHEiBFYunQp/H4//vjHP2LmzJnYs2cP8vLycOGFF+LAgQP429/+hhUrViA9PR0AkJGR0WG7a9euxcKFCzF16lQsW7YMNTU1eOaZZ7Bjxw588cUXSE5Oxj333IMxY8bghRdewEMPPYThw4cjPz+/wzbjOdcA8OGHH2LDhg24+eabYTKZ8Pzzz+Oss87C//73v8i/r6mpwSmnnAKO47B48WJkZGRg48aNuO666+B2u3HLLbcAABRFwTnnnIOtW7fi8ssvx29/+1t4PB5s2bIFRUVFJ4w7TJIkFBYWIhgMYsmSJcjOzkZFRQXeffddNDY2wul0AojtOl26dCkefPBBzJkzBzfddBP279+PVatWYffu3dixY0dU71FdXR3mzZuHyy+/HL/85S+RlZUFVVVx3nnn4b///S9uvPFGjB07Ft988w1WrFiBAwcO4O233+5wPy677DI899xz+Pe//41LLrkk8rjP58M777yDBQsWQBAE1NbWYu7cucjIyMBdd92F5ORklJaW4s033+z0WHXku+++gyAIUb1H27Ztw+uvv47FixcjPT0deXl57f7byspKnHzyyWhsbMSNN96IgoICVFRU4O9//zt8Ph9EUYTP58Ps2bNRUVGBX/3qVxg6dCh27tyJu+++G1VVVe3OG21p586dmDBhQru9dwDw6quv4sILL4Qoirjiiisi52zq1KkAgKeffhqvvPIK3nrrLaxatQp2ux
0TJ07EKaecgptuugkXXHABLrzwQgDAj370IwDA3r17MXPmTOTm5uKuu+6CzWbD66+/jvPPPx//+Mc/cMEFF0TF8Jv
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Количество строк до удаления выбросов: 1370\n",
|
|||
|
"Количество строк после удаления выбросов: 1256\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"df['Spec_score'] = df['Spec_score'].astype(int)\n",
|
|||
|
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
|
|||
|
"# Выбор столбцов для анализа\n",
|
|||
|
"column1 = 'Spec_score'\n",
|
|||
|
"column2 = 'Price'\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Функция для удаления выбросов\n",
|
|||
|
"def remove_outliers(df, column):\n",
|
|||
|
" Q1 = df[column].quantile(0.25)\n",
|
|||
|
" Q3 = df[column].quantile(0.75)\n",
|
|||
|
" IQR = Q3 - Q1\n",
|
|||
|
" lower_bound = Q1 - 1.5 * IQR\n",
|
|||
|
" upper_bound = Q3 + 1.5 * IQR\n",
|
|||
|
" return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n",
|
|||
|
"\n",
|
|||
|
"# Удаление выбросов для каждого столбца\n",
|
|||
|
"df_cleaned = df.copy()\n",
|
|||
|
"for column in [column1, column2]:\n",
|
|||
|
" df_cleaned = remove_outliers(df_cleaned, column)\n",
|
|||
|
"\n",
|
|||
|
"# Построение точечной диаграммы после удаления выбросов\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df_cleaned[column1], df_cleaned[column2], alpha=0.5)\n",
|
|||
|
"plt.xlabel(column1)\n",
|
|||
|
"plt.ylabel(column2)\n",
|
|||
|
"plt.title(f'Scatter Plot of {column1} vs {column2} (After Removing Outliers)')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Вывод количества строк до и после удаления выбросов\n",
|
|||
|
"print(f\"Количество строк до удаления выбросов: {len(df)}\")\n",
|
|||
|
"print(f\"Количество строк после удаления выбросов: {len(df_cleaned)}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Теперь очистим датасет от пустых строк"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 52,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"(817, 18)\n",
|
|||
|
"Unnamed: 0 False\n",
|
|||
|
"Name False\n",
|
|||
|
"Rating False\n",
|
|||
|
"Spec_score False\n",
|
|||
|
"No_of_sim False\n",
|
|||
|
"Ram False\n",
|
|||
|
"Battery False\n",
|
|||
|
"Display False\n",
|
|||
|
"Camera False\n",
|
|||
|
"External_Memory False\n",
|
|||
|
"Android_version False\n",
|
|||
|
"Price False\n",
|
|||
|
"company False\n",
|
|||
|
"Inbuilt_memory False\n",
|
|||
|
"fast_charging False\n",
|
|||
|
"Screen_resolution False\n",
|
|||
|
"Processor False\n",
|
|||
|
"Processor_name False\n",
|
|||
|
"dtype: bool\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.dropna(inplace=True)\n",
|
|||
|
"\n",
|
|||
|
"print(df.shape)\n",
|
|||
|
"\n",
|
|||
|
"print(df.isnull().any())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Разбиение данных на обучающую, контрольную и тестовую выборки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размеры выборок:\n",
|
|||
|
"Обучающая выборка: 490 записей\n",
|
|||
|
"Контрольная выборка: 163 записей\n",
|
|||
|
"Тестовая выборка: 164 записей\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+QAAAJNCAYAAAC4Ob+DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACnW0lEQVR4nOzdd3QU9dvG4WfpAZLQe++9F0PovUgVELDQlKIIiIAgCoIFBQsWQEFAUFBEBKSKoCAWOtJ7kV4DhF6S+/2Ds/NmCfiDZMMAfq5zOJrJZOfZ3dnZuedbxiNJBgAAAAAA7ql4bhcAAAAAAMB/EYEcAAAAAAAXEMgBAAAAAHABgRwAAAAAABcQyAEAAAAAcAGBHAAAAAAAFxDIAQAAAABwAYEcAAAAAAAXEMgBAP9pFy5csAMHDtjp06fdLgV+xPsKAHgQEMgBAP8506ZNsxo1alhgYKAlT57csmXLZsOGDXO7LMQS7ysA4EGTwO0CAACIjc2bN9vQoUPt119/tZMnT1rq1KmtWrVq9sorr1jhwoWjrd+vXz979913rXHjxjZ27FhLkyaNeTwey5cvnwvVw194XwEADyKPJLldBAAAMfHDDz9Y69atLVWqVNaxY0fLmTOn7du3z8aNG2enTp2yb7/91po2beqsv3TpUqtataoNHTrU+vXr52Ll8CfeVwDAg4pADgB4IO3evduKFStm2bJls99++83Spk3r/O7kyZNWqVIlO3DggG3YsMFy5cplZmYNGza0sLAw++OPP9wqG3GA9xUA8KBiDDkA4IE0fPhwu3jxoo0ZM8YnjJuZpUmTxj7//HO7cOGCzxji5cuXW5EiRaxVq1aWKlUqCwgIsLJly9rMmTOddc6fP2/JkiWzHj16RNvmwYMHLX78+DZ06FAzM2vXrp3lyJEj2noej8def/115+d//vnHnnvuOcufP78FBARY6tSprUWLFrZv3z6fv1uyZIl5PB5bsmSJs2zVqlVWq1YtCwwMtGTJklnVqlVt2bJlPn/35ZdfmsfjsdWrVzvLTp48Ga0OM7NHH300Ws3Lli2zFi1aWLZs2Sxx4sSWNWtWe/HFF+3SpUvRntv3339vZcqUscDAQPN4PM6/9957L9q6t6rR+y9p0qRWtGhR++KLL3zWa9eunSVPnvxfH+vm53Un76vX8ePHrWPHjpY+fXpLkiSJFS9e3CZOnOizzr59+5zn9OGHH1r27NktICDAqlSpYps2bYpW782v59dff23x4sWzd955x1m2YcMGa9euneXKlcuSJEliGTJksA4dOtipU6f+9bkCAB5ujCEHADyQZs+ebTly5LBKlSrd8veVK1e2HDly2Ny5c51lp06dsjFjxljy5Mmte/fuljZtWvv666+tWbNmNnnyZGvdurUlT57cmjZtalOnTrUPPvjA4seP7/z9N998Y5LsiSeeuKtaV61aZX/++ae1atXKsmTJYvv27bPRo0db1apVbcuWLZY0adJb/t2uXbusatWqljRpUuvTp48lTZrUxo4dazVr1rSff/7ZKleufFd13M60adPs4sWL1rVrV0udOrWtXLnSPvnkEzt48KBNmzbNWe+vv/6yli1bWvHixe2dd96x4OBgO3nypL344ot3vK0PP/zQ0qRJY+Hh4TZ+/Hh79tlnLUeOHFazZs0Y138n76uZ2aVLl6xq1aq2a9cu69atm+XMmdOmTZtm7dq1szNnzkS7CDNp0iQ7d+6cPf/883b58mX76KOPrHr16rZx40ZLnz79LWtZuHChdejQwbp16+bTff7nn3+2PXv2WPv27S1Dhgy2efNmGzNmjG3evNmWL19uHo8nxs8fAPAAEwAAD5gzZ87IzNS4ceN/Xa9Ro0YyM4WHh0uSzExmpiVLljjrXLx4UQULFlSGDBl09epVSdJPP/0kM9P8+fN9Hq9YsWKqUqWK83P79u2VLVu2aNs1Mw0aNMhnGzf766+/ZGaaNGmSs+zXX3+Vme
nXX3+VJD322GOKHz++Nm3a5Kxz8uRJpU6dWqVLl3aWTZgwQWamVatWOctOnDgRrQ5JatCggbJnz+6z7Fb1DR06VB6PR//884+zrH///jIzHTlyxFm2d+9emZmGDx8e7TGi8ta4d+9eZ9mOHTtkZho2bJizrG3btkqWLNm/PtbNz+tO39cRI0bIzPT111876129elUhISFKnjy5s594n1NAQIAOHjzorLtixQqZmV588UWfer2v5+rVq5U8eXK1aNFCERERPjXf6jX+5ptvZGb67bff/vX5AgAeXnRZBwA8cM6dO2dmZoGBgf+6nvf34eHhzrKyZctalSpVnJ8DAgLsueees6NHj9ratWvNzKxmzZqWKVMmmzx5srPepk2bbMOGDfbkk086y9KlS2fHjx+3q1ev/msdAQEBzv9fu3bNTp06ZXny5LEUKVI424zq7Nmzdvz4cfv555+tTp06PrPFp06d2tq1a2dr1qyxY8eO/et271TU+i5cuGAnT560ChUqmCRbt26d87tz585ZvHjxLEWKFDHe1unTp+3kyZO2Z88e+/DDDy1+/Pg+74fXyZMn7eTJk3b58uU7etw7eV/nzZtnGTJkcFrMzcwSJkxo3bt3t/Pnz9vSpUt9HrNJkyaWOXNm5+dy5cpZ+fLlbd68edG2v2fPHmvQoIGVKFHCvvrqK4sXz/cUK+prfPnyZTt58qQ98sgjZma33AcAAP8NBHIAwAPHG7S9wfx2bhXcCxQoEG29ggULmpk5Y7rjxYtnTzzxhM2cOdMuXrxoZmaTJ0+2JEmSWIsWLZy/q1Chgl2+fNleffVVO3jwoBMib3bp0iUbOHCgZc2a1RInTmxp0qSxtGnT2pkzZ+zs2bPR1m/SpImlT5/ewsPDLX/+/P+z3tjav3+/tWvXzlKlSmXJkye3tGnTOuE2an0hISEWGRlpPXr0sN27d9vJkyft9OnTd7WtUqVKWdq0aS137tw2fvx4+/TTT61cuXI+61y4cMHSpk1radOmtYCAAMuWLZt99NFH//q4d/K+/vPPP5Y3b95oYdm73j///OOzPG/evNEeM1++fNFe9wsXLlidOnXs2LFjFhYWdsvu52FhYdajRw9Lnz69BQQEWNq0aS1nzpxmZrfcBwAA/w2MIQcAPHCCg4MtY8aMtmHDhn9db8OGDZY5c2YLCgoyM99Wyv/l6aeftuHDh9vMmTOtdevWNmXKFHv00UctODjYWadRo0bWoUMHGz58uA0fPvy2j/XCCy/YhAkTrGfPnhYSEmLBwcHm8XisVatWFhkZGW399957z/LmzWuNGze+43pjKiIiwmrVqmVhYWH28ssvW4ECBSxZsmR26NAha9eunU99rVq1srVr19onn3xiY8aMidH2vv76a0ufPr1dvnzZfvnlF3v++ectSZIk1q5dO2edJEmS2OzZs83sxkWV8ePHW8+ePS1jxozWsmXLaI95N+9rXDh58qQlS5bMZs+ebU2aNLGhQ4faoEGDfNZp2bKl/fnnn9anTx8rUaKEJU+e3CIjI61u3bq33AcAAP8NBHIAwAPp0UcftbFjx9rvv/9uFStWjPb7ZcuW2b59+6xz587Ospw5c9r27dujrbtt2zYzM5/ZsosUKWIlS5a0yZMnW5YsWWz//v32ySefRPvbcePG2cCBA2337t1OsKpVq5bPOt9//721bdvW3n//fWfZ5cuX7cyZM7d8bqVLl7YqVapY8uTJ77jemNq4caPt2LHDJk6caE8//bSz/Oeff462brx48ey9996zjRs32t69e23UqFF27Ngxn278/0toaKhT96OPPmqbN2+2oUOH+gTy+PHj+0zy1qBBA0uVKpUtWLDgloH8Tt/X7Nmz24YNGywyMtKnldy7Xvbs2X3+fufOndEec8eOHdFe96RJk9qCBQusQIEC9uKLL9rbb79tLVu2dFreT58+bYsXL7bBgwfbwIED//XxAQD/LXRZBwA8kPr06WMBAQHWuXPnaLeOCg
sLsy5dujizk3vVr1/fVq5caX/++aez7PLlyzZ69GjLkCGDlS5d2udxnnrqKVu4cKGNGDHCUqdObfXq1btlLdmzZ7f
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Разделение признаков (features) и целевой переменной (target)\n",
|
|||
|
"X = df.drop(columns=['company']) # Признаки (все столбцы, кроме 'сompany')\n",
|
|||
|
"y = df['company'] # Целевая переменная (сompany)\n",
|
|||
|
"\n",
|
|||
|
"# Разбиение на обучающую (60%), валидационную (20%) и тестовую (20%) выборки\n",
|
|||
|
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n",
|
|||
|
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(f\"Размеры выборок:\")\n",
|
|||
|
"print(f\"Обучающая выборка: {X_train.shape[0]} записей\")\n",
|
|||
|
"print(f\"Контрольная выборка: {X_val.shape[0]} записей\")\n",
|
|||
|
"print(f\"Тестовая выборка: {X_test.shape[0]} записей\")\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация распределения марок в каждой выборке\n",
|
|||
|
"plt.figure(figsize=(12, 6))\n",
|
|||
|
"plt.subplot(1, 1, 1)\n",
|
|||
|
"plt.xticks(rotation=45) \n",
|
|||
|
"plt.hist(y_train, bins=20, color='blue', alpha=0.7)\n",
|
|||
|
"plt.title('Обучающая выборка')\n",
|
|||
|
"plt.xlabel('Марка')\n",
|
|||
|
"plt.ylabel('Количество')\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Данные не сбалансированы: количество наблюдений для разных марок сильно различается. Из-за этого модель может хуже предсказывать цены для марок, которых в выборке мало, и лучше — для остальных. Применим методы приращения данных (oversampling и undersampling)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение company в обучающей выборке после oversampling:\n",
|
|||
|
"company\n",
|
|||
|
"POCO 82\n",
|
|||
|
"Vivo 82\n",
|
|||
|
"OPPO 82\n",
|
|||
|
"LG 82\n",
|
|||
|
"Realme 82\n",
|
|||
|
"Motorola 82\n",
|
|||
|
"Samsung 82\n",
|
|||
|
"Xiaomi 82\n",
|
|||
|
"Lava 82\n",
|
|||
|
"itel 82\n",
|
|||
|
"iQOO 82\n",
|
|||
|
"Poco 82\n",
|
|||
|
"Honor 82\n",
|
|||
|
"OnePlus 82\n",
|
|||
|
"Huawei 82\n",
|
|||
|
"TCL 82\n",
|
|||
|
"Google 82\n",
|
|||
|
"Nothing 82\n",
|
|||
|
"Asus 82\n",
|
|||
|
"Coolpad 82\n",
|
|||
|
"Itel 82\n",
|
|||
|
"Oppo 82\n",
|
|||
|
"Lenovo 82\n",
|
|||
|
"IQOO 82\n",
|
|||
|
"Gionee 82\n",
|
|||
|
"Tecno 82\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение company в контрольной выборке после oversampling:\n",
|
|||
|
"company\n",
|
|||
|
"Motorola 37\n",
|
|||
|
"Samsung 37\n",
|
|||
|
"TCL 37\n",
|
|||
|
"Poco 37\n",
|
|||
|
"itel 37\n",
|
|||
|
"Realme 37\n",
|
|||
|
"Vivo 37\n",
|
|||
|
"Xiaomi 37\n",
|
|||
|
"Oppo 37\n",
|
|||
|
"iQOO 37\n",
|
|||
|
"OPPO 37\n",
|
|||
|
"LG 37\n",
|
|||
|
"POCO 37\n",
|
|||
|
"Honor 37\n",
|
|||
|
"OnePlus 37\n",
|
|||
|
"Huawei 37\n",
|
|||
|
"Lava 37\n",
|
|||
|
"Google 37\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение company в тестовой выборке после oversampling:\n",
|
|||
|
"company\n",
|
|||
|
"Realme 30\n",
|
|||
|
"Samsung 30\n",
|
|||
|
"OPPO 30\n",
|
|||
|
"TCL 30\n",
|
|||
|
"Xiaomi 30\n",
|
|||
|
"iQOO 30\n",
|
|||
|
"Motorola 30\n",
|
|||
|
"Lenovo 30\n",
|
|||
|
"Vivo 30\n",
|
|||
|
"Honor 30\n",
|
|||
|
"Poco 30\n",
|
|||
|
"Huawei 30\n",
|
|||
|
"Oppo 30\n",
|
|||
|
"OnePlus 30\n",
|
|||
|
"Google 30\n",
|
|||
|
"Lava 30\n",
|
|||
|
"itel 30\n",
|
|||
|
"POCO 30\n",
|
|||
|
"Tecno 30\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение company в обучающей выборке после undersampling:\n",
|
|||
|
"company\n",
|
|||
|
"Asus 1\n",
|
|||
|
"Coolpad 1\n",
|
|||
|
"Gionee 1\n",
|
|||
|
"Google 1\n",
|
|||
|
"Honor 1\n",
|
|||
|
"Huawei 1\n",
|
|||
|
"IQOO 1\n",
|
|||
|
"Itel 1\n",
|
|||
|
"LG 1\n",
|
|||
|
"Lava 1\n",
|
|||
|
"Lenovo 1\n",
|
|||
|
"Motorola 1\n",
|
|||
|
"Nothing 1\n",
|
|||
|
"OPPO 1\n",
|
|||
|
"OnePlus 1\n",
|
|||
|
"Oppo 1\n",
|
|||
|
"POCO 1\n",
|
|||
|
"Poco 1\n",
|
|||
|
"Realme 1\n",
|
|||
|
"Samsung 1\n",
|
|||
|
"TCL 1\n",
|
|||
|
"Tecno 1\n",
|
|||
|
"Vivo 1\n",
|
|||
|
"Xiaomi 1\n",
|
|||
|
"iQOO 1\n",
|
|||
|
"itel 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение company в контрольной выборке после undersampling:\n",
|
|||
|
"company\n",
|
|||
|
"Google 1\n",
|
|||
|
"Honor 1\n",
|
|||
|
"Huawei 1\n",
|
|||
|
"LG 1\n",
|
|||
|
"Lava 1\n",
|
|||
|
"Motorola 1\n",
|
|||
|
"OPPO 1\n",
|
|||
|
"OnePlus 1\n",
|
|||
|
"Oppo 1\n",
|
|||
|
"POCO 1\n",
|
|||
|
"Poco 1\n",
|
|||
|
"Realme 1\n",
|
|||
|
"Samsung 1\n",
|
|||
|
"TCL 1\n",
|
|||
|
"Vivo 1\n",
|
|||
|
"Xiaomi 1\n",
|
|||
|
"iQOO 1\n",
|
|||
|
"itel 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение company в тестовой выборке после undersampling:\n",
|
|||
|
"company\n",
|
|||
|
"Google 1\n",
|
|||
|
"Honor 1\n",
|
|||
|
"Huawei 1\n",
|
|||
|
"Lava 1\n",
|
|||
|
"Lenovo 1\n",
|
|||
|
"Motorola 1\n",
|
|||
|
"OPPO 1\n",
|
|||
|
"OnePlus 1\n",
|
|||
|
"Oppo 1\n",
|
|||
|
"POCO 1\n",
|
|||
|
"Poco 1\n",
|
|||
|
"Realme 1\n",
|
|||
|
"Samsung 1\n",
|
|||
|
"TCL 1\n",
|
|||
|
"Tecno 1\n",
|
|||
|
"Vivo 1\n",
|
|||
|
"Xiaomi 1\n",
|
|||
|
"iQOO 1\n",
|
|||
|
"itel 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['company'].value_counts()\n",
|
|||
|
" print(f\"Распределение company в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop('company', axis=1)\n",
|
|||
|
" y = df['company']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")\n",
|
|||
|
"\n",
|
|||
|
"def undersample(df):\n",
|
|||
|
" X = df.drop('company', axis=1)\n",
|
|||
|
" y = df['company']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42) # type: ignore\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n",
|
|||
|
"1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n",
|
|||
|
"2. Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 81,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Rating', 'Ram',\n",
|
|||
|
" 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n",
|
|||
|
" 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n",
|
|||
|
" 'Screen_resolution', 'Processor'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_resampled_encoded = pd.get_dummies(train_df_undersampled, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df_undersampled, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df_undersampled, columns=categorical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Дискретизация числовых признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки после балансировки: (5600, 22)\n",
|
|||
|
"Размер контрольной выборки: (288, 22)\n",
|
|||
|
"Размер тестовой выборки: (411, 22)\n",
|
|||
|
"Unnamed: 0\n",
|
|||
|
"Name\n",
|
|||
|
"Rating\n",
|
|||
|
"Spec_score\n",
|
|||
|
"No_of_sim\n",
|
|||
|
"Ram\n",
|
|||
|
"Battery\n",
|
|||
|
"Display\n",
|
|||
|
"Camera\n",
|
|||
|
"External_Memory\n",
|
|||
|
"Android_version\n",
|
|||
|
"Price\n",
|
|||
|
"company\n",
|
|||
|
"Inbuilt_memory\n",
|
|||
|
"fast_charging\n",
|
|||
|
"Screen_resolution\n",
|
|||
|
"Processor\n",
|
|||
|
"Processor_name\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Извлечение числовых значений из столбца Battery\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"df['Ram'] = df['Ram'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"df['Camera'] = df['Camera'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление запятых из столбца Price и преобразование в числовой формат\n",
|
|||
|
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
|
|||
|
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['Price'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для дискретизации\n",
|
|||
|
"numerical_features = ['Spec_score', 'Battery', 'Ram', 'Camera' ]\n",
|
|||
|
"\n",
|
|||
|
"# Функция для дискретизации числовых признаков\n",
|
|||
|
"def discretize_features(df, features, bins=5, labels=False):\n",
|
|||
|
" for feature in features:\n",
|
|||
|
" try:\n",
|
|||
|
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
|
|||
|
" except Exception as e:\n",
|
|||
|
" print(f\"Ошибка при дискретизации признака {feature}: {e}\")\n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
|
|||
|
"train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n",
|
|||
|
"val_df = discretize_features(val_df, numerical_features)\n",
|
|||
|
"test_df = discretize_features(test_df, numerical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", test_df.shape)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Ручной синтез. Создание новых признаков на основе экспертных знаний и логики предметной области."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 93,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки после балансировки: (5600, 19)\n",
|
|||
|
"Размер контрольной выборки: (288, 19)\n",
|
|||
|
"Размер тестовой выборки: (411, 19)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Battery в числовой формат\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбцов Camera и Display в числовой формат\n",
|
|||
|
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
|
|||
|
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Удаление запятых из столбца Price и преобразование в числовой формат\n",
|
|||
|
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке (если это необходимо)\n",
|
|||
|
"X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['Price'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n",
|
|||
|
"train_df_resampled['Camera_to_Display_Ratio'] = train_df_resampled['Camera'] / train_df_resampled['Display']\n",
|
|||
|
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n",
|
|||
|
"test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", test_df.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 101,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки после балансировки: (5600, 19)\n",
|
|||
|
"Размер контрольной выборки: (288, 19)\n",
|
|||
|
"Размер тестовой выборки: (411, 19)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1137: RuntimeWarning: invalid value encountered in divide\n",
|
|||
|
" updated_mean = (last_sum + new_sum) / updated_sample_count\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1142: RuntimeWarning: invalid value encountered in divide\n",
|
|||
|
" T = new_sum / new_sample_count\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1162: RuntimeWarning: invalid value encountered in divide\n",
|
|||
|
" new_unnormalized_variance -= correction**2 / new_sample_count\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для масштабирования\n",
|
|||
|
"numerical_features_to_scale = ['Spec_score', 'No_of_sim', 'Ram', 'Battery', 'Display', 'Camera', 'Inbuilt_memory', 'Screen_resolution', 'Camera_to_Display_Ratio']\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строковых значений из числовых признаков\n",
|
|||
|
"for feature in numerical_features_to_scale:\n",
|
|||
|
" train_df_resampled[feature] = pd.to_numeric(train_df_resampled[feature], errors='coerce')\n",
|
|||
|
" val_df[feature] = pd.to_numeric(val_df[feature], errors='coerce')\n",
|
|||
|
" test_df[feature] = pd.to_numeric(test_df[feature], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация StandardScaler\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в обучающей выборке\n",
|
|||
|
"train_df_resampled[numerical_features_to_scale] = scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в контрольной и тестовой выборках\n",
|
|||
|
"val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n",
|
|||
|
"test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\", val_df.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", test_df.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Конструирование признаков с применением фреймворка Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка после конструирования признаков:\n",
|
|||
|
" Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n",
|
|||
|
"id \n",
|
|||
|
"0 305 4.70 86 Dual Sim, 3G, 4G, 5G, VoLTE, 12 GB RAM \n",
|
|||
|
"1 941 4.45 71 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
|
|||
|
"2 800 4.20 68 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
|
|||
|
"3 97 4.25 69 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n",
|
|||
|
"4 1339 4.30 74 Dual Sim, 3G, 4G, VoLTE, 6 GB RAM \n",
|
|||
|
"\n",
|
|||
|
" Battery External_Memory Android_version Price \\\n",
|
|||
|
"id \n",
|
|||
|
"0 5000 Android v12 NaN 30999.0 \n",
|
|||
|
"1 5000 Memory Card Supported, upto 1 TB 12 6999.0 \n",
|
|||
|
"2 5000 Memory Card Supported 12 8999.0 \n",
|
|||
|
"3 5000 Memory Card Supported 12 9999.0 \n",
|
|||
|
"4 5000 Memory Card Supported, upto 256 GB 12 8499.0 \n",
|
|||
|
"\n",
|
|||
|
" company Inbuilt_memory fast_charging \\\n",
|
|||
|
"id \n",
|
|||
|
"0 Realme 256 GB inbuilt 65W Fast Charging \n",
|
|||
|
"1 Motorola 64 GB inbuilt 10W Fast Charging \n",
|
|||
|
"2 Vivo 64 GB inbuilt 10W Fast Charging \n",
|
|||
|
"3 Vivo 128 GB inbuilt 10W Fast Charging \n",
|
|||
|
"4 Lava 128 GB inbuilt NaN \n",
|
|||
|
"\n",
|
|||
|
" Screen_resolution Processor \n",
|
|||
|
"id \n",
|
|||
|
"0 1080 x 2400 px Octa Core \n",
|
|||
|
"1 720 x 1600 px Octa Core \n",
|
|||
|
"2 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"3 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"4 1600 x 720 px Octa Core \n",
|
|||
|
"Контрольная выборка после конструирования признаков:\n",
|
|||
|
" Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n",
|
|||
|
"id \n",
|
|||
|
"1028 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"825 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"900 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"702 <NA> NaN <NA> NaN NaN \n",
|
|||
|
"230 1050 4.05 90 Dual Sim, 3G, 4G, 5G, VoLTE, 8 GB RAM \n",
|
|||
|
"\n",
|
|||
|
" Battery External_Memory Android_version Price company \\\n",
|
|||
|
"id \n",
|
|||
|
"1028 <NA> NaN NaN NaN NaN \n",
|
|||
|
"825 <NA> NaN NaN NaN NaN \n",
|
|||
|
"900 <NA> NaN NaN NaN NaN \n",
|
|||
|
"702 <NA> NaN NaN NaN NaN \n",
|
|||
|
"230 4500 Android v12 NaN 62990.0 Motorola \n",
|
|||
|
"\n",
|
|||
|
" Inbuilt_memory fast_charging Screen_resolution Processor \n",
|
|||
|
"id \n",
|
|||
|
"1028 NaN NaN NaN NaN \n",
|
|||
|
"825 NaN NaN NaN NaN \n",
|
|||
|
"900 NaN NaN NaN NaN \n",
|
|||
|
"702 NaN NaN NaN NaN \n",
|
|||
|
"230 128 GB inbuilt 125W Fast Charging 1080 x 2400 px Octa Core \n",
|
|||
|
"Тестовая выборка после конструирования признаков:\n",
|
|||
|
" Unnamed: 0 Rating Spec_score No_of_sim \\\n",
|
|||
|
"id \n",
|
|||
|
"427 187 4.40 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n",
|
|||
|
"1088 <NA> NaN <NA> NaN \n",
|
|||
|
"668 592 4.45 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n",
|
|||
|
"572 1130 4.60 75 Dual Sim, 3G, 4G, VoLTE, \n",
|
|||
|
"115 117 4.60 72 Dual Sim, 3G, 4G, VoLTE, \n",
|
|||
|
"\n",
|
|||
|
" Ram Battery External_Memory Android_version \\\n",
|
|||
|
"id \n",
|
|||
|
"427 12 GB RAM 5000 Memory Card Not Supported 14 \n",
|
|||
|
"1088 NaN <NA> NaN NaN \n",
|
|||
|
"668 12 GB RAM 4500 Android v12 NaN \n",
|
|||
|
"572 6 GB RAM 5000 Memory Card Supported, upto 1 TB 13 \n",
|
|||
|
"115 4 GB RAM 5000 Memory Card Supported, upto 1 TB 12 \n",
|
|||
|
"\n",
|
|||
|
" Price company Inbuilt_memory fast_charging \\\n",
|
|||
|
"id \n",
|
|||
|
"427 63999.0 Vivo 256 GB inbuilt 120W Fast Charging \n",
|
|||
|
"1088 NaN NaN NaN NaN \n",
|
|||
|
"668 54990.0 Honor 256 GB inbuilt 100W Fast Charging \n",
|
|||
|
"572 8499.0 Xiaomi 128 GB inbuilt 18W Fast Charging \n",
|
|||
|
"115 11580.0 Vivo 64 GB inbuilt 18W Fast Charging \n",
|
|||
|
"\n",
|
|||
|
" Screen_resolution Processor \n",
|
|||
|
"id \n",
|
|||
|
"427 1260 x 2800 px Octa Core \n",
|
|||
|
"1088 NaN NaN \n",
|
|||
|
"668 1200 x 2652 px Octa Core \n",
|
|||
|
"572 720 x 1600 px Octa Core \n",
|
|||
|
"115 720 x 1612 px Display with Water Drop Notch Octa Core \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='mobile_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод первых нескольких строк для проверки\n",
|
|||
|
"print(\"Обучающая выборка после конструирования признаков:\")\n",
|
|||
|
"print(feature_matrix.head())\n",
|
|||
|
"print(\"Контрольная выборка после конструирования признаков:\")\n",
|
|||
|
"print(val_feature_matrix.head())\n",
|
|||
|
"print(\"Тестовая выборка после конструирования признаков:\")\n",
|
|||
|
"print(test_feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Оценка качества"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 110,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Feature Importance:\n",
|
|||
|
" feature importance\n",
|
|||
|
"4 Price 0.999443\n",
|
|||
|
"2 Spec_score 0.000227\n",
|
|||
|
"3 Battery 0.000146\n",
|
|||
|
"0 Unnamed: 0 0.000146\n",
|
|||
|
"1 Rating 0.000039\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|||
|
"import re\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Оценка важности признаков\n",
|
|||
|
"X = feature_matrix\n",
|
|||
|
"y = train_df_resampled['Price']\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model = RandomForestRegressor(n_estimators=100, random_state=42)\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Получение важности признаков\n",
|
|||
|
"importances = model.feature_importances_\n",
|
|||
|
"feature_names = feature_matrix.columns\n",
|
|||
|
"\n",
|
|||
|
"# Сортировка признаков по важности\n",
|
|||
|
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
|||
|
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Feature Importance:\")\n",
|
|||
|
"print(feature_importance)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 115,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 671\n",
|
|||
|
"Размер контрольной выборки: 288\n",
|
|||
|
"Размер тестовой выборки: 411\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\Алина\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Mean Squared Error: 53834536.21488374\n",
|
|||
|
"R2 Score: 0.9445638071244045\n",
|
|||
|
"Cross-validated Mean Squared Error: 311290473.964474\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA40AAAIjCAYAAACnNf4TAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABDwUlEQVR4nO3dd3RU1f7//9ckIRNIpaWAISGGqnQEA1IFQ5GiXhBESqRcpShy4QJSQxfhQxGJSIsFQUBFUUERQbwUQZqAEBBCESlKCxFJIDm/P/wxX8dkAwkhA+H5WGvWyuyzzz7vM5sseLHPOWOzLMsSAAAAAACZcHN1AQAAAACAOxehEQAAAABgRGgEAAAAABgRGgEAAAAARoRGAAAAAIARoREAAAAAYERoBAAAAAAYERoBAAAAAEaERgAAAACAEaERAAAAAGBEaAQA3HXi4+Nls9kyfQ0aNOi2HHPDhg0aOXKkzp8/f1vGvxXXPo8ffvjB1aVk28yZMxUfH+/qMgAAmfBwdQEAAGTXqFGjVLJkSae2Bx988LYca8OGDYqNjVWXLl0UEBBwW45xL5s5c6aKFCmiLl26uLoUAMA/EBoBAHetpk2bqnr16q4u45b88ccf8vb2dnUZLnPp0iUVKFDA1WUAAK6Dy1MBAHnWihUrVKdOHXl7e8vX11fNmzfXnj17nPr8+OOP6tKliyIiIuTl5aXg4GA999xzOnPmjKPPyJEjNWDAAElSyZIlHZfCHj58WIcPH5bNZsv00kqbzaaRI0c6jWOz2fTTTz/pmWeeUcGCBfXII484tr/33nuqVq2a8ufPr0KFCqldu3Y6duxYts69S5cu8vHx0dGjR/X444/Lx8dHxYsX1xtvvCFJ2rVrlxo2bChvb2+FhYXp/fffd9r/2iWv69at07///W8VLlxYfn5+6tSpk86dO5fheDNnztQDDzwgu92uYsWKqVevXhku5a1fv74efPBBbd26VXXr1lWBAgX0yiuvKDw8XHv27NG3337r+Gzr168vSTp79qz69++vChUqyMfHR35+fmratKl27tzpNPbatWtls9m0ePFijR07Vvfdd5+8vLz06KOP6ueff85Q7/fff69mzZqpYMGC8vb2VsWKFTVt2jSnPvv27dO//vUvFSpUSF5eXqpevbo+/fTTrE4FANz1WGkEANy1Lly4oN9//92prUiRIpKkd999V507d1Z0dLReffVVXbp0SXFxcXrkkUe0fft2hYeHS5JWrVqlQ4cOKSYmRsHBwdqzZ4/eeust7dmzR5s2bZLNZtOTTz6p/fv3a+HChZoyZYrjGEWLFtVvv/2W5brbtGmjUqVKady4cbIsS5I0duxYDRs2TG3btlW3bt3022+/6fXXX1fdunW1ffv2bF0Sm5aWpqZNm6pu3bqaOHGiFixYoN69e8vb21tDhgxRhw4d9OSTT+rNN99Up06dFBUVleFy3969eysgIEAjR45UQkKC4uLidOTIEUdIk/4Kw7GxsWrUqJFeeOEFR78tW7Zo/fr1ypcvn2O8M2fOqGnTpmrXrp2effZZBQUFqX79+urTp498fHw0ZMgQSVJQUJAk6dChQ1q2bJnatGmjkiVL6tSpU5o1a5bq1aunn376ScWKFXOqd8KECXJzc1P//v114cIFTZw4UR06dND333/v6LNq1So9/vjjCgkJ0UsvvaTg4GDt3btXn332mV566SVJ0p49e1S7dm0VL15cgwYNkre3txYvXqzWrVvrww8/1BNPPJHl+QCAu5YFAMBdZv78+ZakTF+WZVkXL160AgICrO7duzvtd/LkScvf39+p/dKlSxnGX7hwoSXJWrdunaPttddesyRZiYmJTn0TExMtSdb8+fMzjCPJGjFihOP9iBEjLElW+/btnfodPnzYcnd3t8aOHevUvmvXLsvDwyNDu+nz2LJli6Otc+fOliRr3LhxjrZz585Z+fPnt2w2m7Vo0SJH+759+zLUem3Mat
WqWampqY72iRMnWpKsTz75xLIsyzp9+rTl6elpPfbYY1ZaWpqj34wZMyxJ1rx58xxt9erVsyRZb775ZoZzeOCBB6x69eplaL98+bLTuJb112dut9utUaNGOdrWrFljSbLKlStnpaSkONqnTZtmSbJ27dplWZZlXb161SpZsqQVFhZmnTt3zmnc9PR0x8+PPvqoVaFCBevy5ctO22vVqmWVKlUqQ50AkJdxeSoA4K71xhtvaNWqVU4v6a+VpPPnz6t9+/b6/fffHS93d3fVrFlTa9ascYyRP39+x8+XL1/W77//rocffliStG3btttS9/PPP+/0/qOPPlJ6erratm3rVG9wcLBKlSrlVG9WdevWzfFzQECAypQpI29vb7Vt29bRXqZMGQUEBOjQoUMZ9u/Ro4fTSuELL7wgDw8PffHFF5Kkr7/+Wqmpqerbt6/c3P7fPyu6d+8uPz8/ff75507j2e12xcTE3HT9drvdMW5aWprOnDkjHx8flSlTJtP5iYmJkaenp+N9nTp1JMlxbtu3b1diYqL69u2bYfX22srp2bNn9c0336ht27a6ePGiYz7OnDmj6OhoHThwQMePH7/pcwCAux2XpwIA7lo1atTI9EE4Bw4ckCQ1bNgw0/38/PwcP589e1axsbFatGiRTp8+7dTvwoULOVjt//PPS0APHDggy7JUqlSpTPv/PbRlhZeXl4oWLerU5u/vr/vuu88RkP7entm9iv+sycfHRyEhITp8+LAk6ciRI5L+Cp5/5+npqYiICMf2a4oXL+4U6m4kPT1d06ZN08yZM5WYmKi0tDTHtsKFC2foX6JECaf3BQsWlCTHuR08eFDS9Z+y+/PPP8uyLA0bNkzDhg3LtM/p06dVvHjxmz4PALibERoBAHlOenq6pL/uawwODs6w3cPj//3117ZtW23YsEEDBgxQ5cqV5ePjo/T0dDVp0sQxzvX8M3xd8/dw809/X928Vq/NZtOKFSvk7u6eob+Pj88N68hMZmNdr936/++vvJ3+ee43Mm7cOA0bNkzPPfecRo8erUKFCsnNzU19+/bNdH5y4tyujdu/f39FR0dn2icyMvKmxwOAux2hEQCQ59x///2SpMDAQDVq1MjY79y5c1q9erViY2M1fPhwR/u1lcq/M4XDaytZ/3xS6D9X2G5Ur2VZKlmypEqXLn3T++WGAwcOqEGDBo73ycnJOnHihJo1ayZJCgsLkyQlJCQoIiLC0S81NVWJiYnX/fz/zvT5Ll26VA0aNNDcuXOd2s+fP+94IFFWXPuzsXv3bmNt184jX758N10/AORl3NMIAMhzoqOj5efnp3HjxunKlSsZtl974um1Val/rkJNnTo1wz7Xvkvxn+HQz89PRYoU0bp165zaZ86cedP1Pvnkk3J3d1dsbGyGWizLcvr6j9z21ltvOX2GcXFxunr1qpo2bSpJatSokTw9PTV9+nSn2ufOnasLFy6oefPmN3Ucb2/vDJ+t9Ncc/fMzWbJkSbbvKaxatapKliypqVOnZjjeteMEBgaqfv36mjVrlk6cOJFhjOw8MRcA7masNAIA8hw/Pz/FxcWpY8eOqlq1qtq1a6eiRYvq6NGj+vzzz1W7dm3NmDFDfn5+jq+juHLliooXL66vvvpKiYmJGcasVq2aJGnIkCFq166d8uXLpxYtWsjb21vdunXThAkT1K1bN1WvXl3r1q3T/v37b7re+++/X2PGjNHgwYN1+PBhtW7dWr6+vkpMTNTHH3+sHj16qH///jn2+WRFamqqHn30UbVt21YJCQmaOXOmHnnkEbVs2VLSX187MnjwYMXGxqpJkyZq2bKlo99DDz2kZ5999qaOU61aNcXFxWnMmDGKjIxUYGCgGjZsqMcff1yjRo1STEyMatWqpV27dmnBggVOq5pZ4ebmpri4OLVo0UKVK1dWTEyMQkJCtG/fPu3Zs0dffvmlpL8esvTII4+oQoUK6t69uyIiInTq1Clt3LhRv/zyS4bviQ
SAvIzQCADIk5555hkVK1ZMEyZM0GuvvaaUlBQVL15cderUcXp65/vvv68+ffrojTfekGVZeuyxx7RixYoM3//30EM
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Train Mean Squared Error: 40281623.425488226\n",
|
|||
|
"Train R2 Score: 0.9581963040734582\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA4IAAAIjCAYAAABWPqWeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADArklEQVR4nOzdeXxM9/oH8M+smUkmmYhsEkGssVMq1FK9UlGqV2ntu1pascVeu9pV7RVapYvW0qq2Wkot5dp3ghBLUJEEkZlsk9nO7w+/nGYkSEgyk+Tzfr3mXvM933PmOYeGZ77LIxEEQQARERERERGVGFJ7B0BERERERESFi4kgERERERFRCcNEkIiIiIiIqIRhIkhERERERFTCMBEkIiIiIiIqYZgIEhERERERlTBMBImIiIiIiEoYJoJEREREREQlDBNBIiIiIiKiEoaJIBEROTSJRILp06fbOwy7a9myJVq2bCm+j4mJgUQiwfr16+0W05OejLGgOOK9ExEVNUwEiYhKkM8//xwSiQTBwcEvfI3Y2FhMnz4dZ8+ezb/AHNz+/fshkUjEl0KhQMWKFdG7d2/cuHHD3uHlyeHDhzF9+nQkJSXZLYYKFSrYPE9vb280b94cP//8s91iIiIqaeT2DoCIiArPhg0bUKFCBRw/fhzXrl1D5cqV83yN2NhYzJgxAxUqVEC9evXyP0gHNnz4cLz66qswmUw4ffo01qxZg99//x0XLlyAn59focZSvnx5pKenQ6FQ5Om8w4cPY8aMGejbty/c3d0LJrhcqFevHkaPHg3g8Z+p1atXo2PHjli1ahWGDBnyzHNf9N6JiOhfHBEkIiohbt68icOHD+Ozzz6Dl5cXNmzYYO+QipzmzZujZ8+e6NevH5YvX45PP/0UiYmJ+Prrr596TmpqaoHEIpFIoFKpIJPJCuT6Bc3f3x89e/ZEz549MW7cOBw6dAguLi5YvHjxU88xm80wGo1F/t6JiBwBE0EiohJiw4YNKFWqFNq1a4f33nvvqYlgUlISRo0ahQoVKsDJyQlly5ZF79698eDBA+zfvx+vvvoqAKBfv37i1L7MtVoVKlRA3759s13zybVjRqMRU6dORYMGDaDVauHi4oLmzZtj3759eb6v+Ph4yOVyzJgxI9uxK1euQCKRYMWKFQAAk8mEGTNmoEqVKlCpVChdujSaNWuG3bt35/lzAeA///kPgMdJNgBMnz4dEokEly5dQvfu3VGqVCk0a9ZM7P/dd9+hQYMGUKvV8PDwQNeuXXHnzp1s112zZg0qVaoEtVqNRo0a4eDBg9n6PG2dXFRUFDp37gwvLy+o1WpUq1YNkyZNEuMbO3YsACAwMFD8/YuJiSmQGPPC19cX1atXF59l5v19+umnWLJkCSpVqgQnJydcunTphe490927d9G/f3/4+PjAyckJNWvWxFdfffVSsRMRFUWcGkpEVEJs2LABHTt2hFKpRLdu3bBq1SqcOHFCTOwAICUlBc2bN8fly5fRv39/vPLKK3jw4AF+/fVX/PPPP6hevTpmzpyJqVOnYtCgQWjevDkA4LXXXstTLHq9Hl9++SW6deuGgQMHIjk5GWvXrkVoaCiOHz+epymnPj4+eP3117F582ZMmzbN5timTZsgk8nw/vvvA3icCM2dOxcffPABGjVqBL1ej5MnT+L06dN4880383QPAHD9+nUAQOnSpW3a33//fVSpUgVz5syBIAgAgNmzZ2PKlCno3LkzPvjgA9y/fx/Lly9HixYtcObMGXGa5tq1azF48GC89tprGDlyJG7cuIF33nkHHh4eCAgIeGY858+fR/PmzaFQKDBo0CBUqFAB169fx2+//YbZs2ejY8eOuHr1Kn744QcsXrwYnp6eAAAvL69Ci/FpTCYT7ty5k+1Zrlu3DgaDAYMGDYKTkxM8PDxgtVrzfO/A4y8NGjduDIlEgrCwMHh5eWHHjh0YMGAA9Ho9Ro4c+UKxExEVSQIRER
V7J0+eFAAIu3fvFgRBEKxWq1C2bFlhxIgRNv2mTp0qABC2bt2a7RpWq1UQBEE4ceKEAEBYt25dtj7ly5cX+vTpk6399ddfF15//XXxvdlsFjIyMmz6PHr0SPDx8RH69+9v0w5AmDZt2jPvb/Xq1QIA4cKFCzbtNWrUEP7zn/+I7+vWrSu0a9fumdfKyb59+wQAwldffSXcv39fiI2NFX7//XehQoUKgkQiEU6cOCEIgiBMmzZNACB069bN5vyYmBhBJpMJs2fPtmm/cOGCIJfLxXaj0Sh4e3sL9erVs3k+a9asEQDYPMObN29m+31o0aKF4OrqKty6dcvmczJ/7wRBEBYuXCgAEG7evFngMT5N+fLlhdatWwv3798X7t+/L5w7d07o2rWrAEAYNmyYzf25ubkJCQkJNue/6L0PGDBAKFOmjPDgwQObPl27dhW0Wq2Qlpb23NiJiIoLTg0lIioBNmzYAB8fH7zxxhsAHq8v69KlCzZu3AiLxSL2++mnn1C3bl28++672a4hkUjyLR6ZTAalUgkAsFqtSExMhNlsRsOGDXH69Ok8X69jx46Qy+XYtGmT2BYZGYlLly6hS5cuYpu7uzsuXryI6OjoF4q7f//+8PLygp+fH9q1a4fU1FR8/fXXaNiwoU2/Jzc72bp1K6xWKzp37owHDx6IL19fX1SpUkWcEnvy5EkkJCRgyJAh4vMBgL59+0Kr1T4ztvv37+PAgQPo378/ypUrZ3MsN793hRFjVrt27YKXlxe8vLxQt25dbNmyBb169cL8+fNt+nXq1EkcsXya3Ny7IAj46aef0L59ewiCYHOPoaGh0Ol0L/Rnj4ioqOLUUCKiYs5isWDjxo144403xPVXABAcHIxFixZhz549aN26NYDHUx07depUKHF9/fXXWLRoEaKiomAymcT2wMDAPF/L09MTrVq1wubNm/HJJ58AeDwtVC6Xo2PHjmK/mTNn4r///S+qVq2KWrVqoU2bNujVqxfq1KmTq8+ZOnUqmjdvDplMBk9PT1SvXh1yefa/Sp+8h+joaAiCgCpVquR43czdL2/dugUA2fpllqt4lswyFrVq1crVvTypMGLMKjg4GLNmzYJEIoGzszOqV6+e4y6mufnzkJt7v3//PpKSkrBmzRqsWbMmxz4JCQm5C56IqBhgIkhEVMzt3bsX9+7dw8aNG7Fx48Zsxzds2CAmgi/raSNPFovFZofH7777Dn379kWHDh0wduxYeHt7QyaTYe7cueK6u7zq2rUr+vXrh7Nnz6JevXrYvHkzWrVqJa6DA4AWLVrg+vXr+OWXX7Br1y58+eWXWLx4MSIiIvDBBx889zNq166NkJCQ5/ZTq9U2761WKyQSCXbs2JHjTpcajSYXd1iwCjtGT0/PF3qWLypzXWHPnj3Rp0+fHPvk9gsBIqLigIkgEVExt2HDBnh7e2PlypXZjm3duhU///wzIiIioFarUalSJURGRj7zes+aZliqVKkcC5XfunXLZrToxx9/RMWKFbF161ab6z252UtedOjQAYMHDxanh169ehUTJ07M1s/DwwP9+vVDv379kJKSghYtWmD69Om5SgRfVKVKlSAIAgIDA1G1atWn9itfvjyAx6NzmTuSAo83Url58ybq1q371HMzn++L/v4VRowFJTf37uXlBVdXV1gsllwloERExR3XCBIRFWPp6enYunUr3n77bbz33nvZXmFhYUhOTsavv/4K4PF6rHPnzuHnn3/Odi3h/3e/dHFxAYAcE75KlSrh6NGjMBqNYtv27duzlR/IHHHKvCYAHDt2DEeOHHnhe3V3d0doaCg2b96MjRs3QqlUokOHDjZ9Hj58aPNeo9GgcuXKyMjIeOHPzY2OHTtCJpNhxowZNvcMPH4GmXE1bNgQXl5eiIiIsHmG69evz/F5Z+Xl5YUWLVrgq6++wu3bt7N9Rqan/f4VRowFJTf3LpPJ0KlTJ/z00085Joz3798vlFiJiBwFRwSJiIqxX3
/9FcnJyXjnnXdyPN64cWOxuHyXLl0wduxY/Pjjj3j//ffRv39/NGjQAImJifj1118RERGBunXrolKlSnB3d0dERAR
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Battery в числовой формат\n",
|
|||
|
"df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование столбца Display в числовой формат\n",
|
|||
|
"df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n",
|
|||
|
"df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n",
|
|||
|
"df['Inbuilt_memory'] = pd.to_numeric(df['Inbuilt_memory'], errors='coerce')\n",
|
|||
|
"df['fast_charging'] = pd.to_numeric(df['fast_charging'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"# Удаление запятых из столбца Price и преобразование в числовой формат\n",
|
|||
|
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление столбцов с текстовыми значениями, которые не могут быть преобразованы в числа\n",
|
|||
|
"df = df.drop(columns=['Name', 'company', 'Android_version', 'Processor_name', 'External_Memory', 'No_of_sim', 'Ram', 'Screen_resolution', 'Processor' ])\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='mobile_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='mobile', dataframe=train_df, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков с уменьшенной глубиной\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='mobile', max_depth=1)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строк с NaN\n",
|
|||
|
"feature_matrix = feature_matrix.dropna()\n",
|
|||
|
"val_feature_matrix = val_feature_matrix.dropna()\n",
|
|||
|
"test_feature_matrix = test_feature_matrix.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train = feature_matrix.drop('Price', axis=1)\n",
|
|||
|
"y_train = feature_matrix['Price']\n",
|
|||
|
"X_val = val_feature_matrix.drop('Price', axis=1)\n",
|
|||
|
"y_val = val_feature_matrix['Price']\n",
|
|||
|
"X_test = test_feature_matrix.drop('Price', axis=1)\n",
|
|||
|
"y_test = test_feature_matrix['Price']\n",
|
|||
|
"\n",
|
|||
|
"# Выбор модели\n",
|
|||
|
"model = RandomForestRegressor(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Предсказание и оценка\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"mse = mean_squared_error(y_test, y_pred)\n",
|
|||
|
"r2 = r2_score(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Mean Squared Error: {mse}\")\n",
|
|||
|
"print(f\"R2 Score: {r2}\")\n",
|
|||
|
"\n",
|
|||
|
"# Кросс-валидация\n",
|
|||
|
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
|
|||
|
"mse_cv = -scores.mean()\n",
|
|||
|
"print(f\"Cross-validated Mean Squared Error: {mse_cv}\")\n",
|
|||
|
"\n",
|
|||
|
"# Анализ важности признаков\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train.columns\n",
|
|||
|
"\n",
|
|||
|
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
|
|||
|
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
|
|||
|
"plt.title('Feature Importance')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на переобучение\n",
|
|||
|
"y_train_pred = model.predict(X_train)\n",
|
|||
|
"\n",
|
|||
|
"mse_train = mean_squared_error(y_train, y_train_pred)\n",
|
|||
|
"r2_train = r2_score(y_train, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train Mean Squared Error: {mse_train}\")\n",
|
|||
|
"print(f\"Train R2 Score: {r2_train}\")\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
|
|||
|
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
|
|||
|
"plt.xlabel('Actual Price')\n",
|
|||
|
"plt.ylabel('Predicted Price')\n",
|
|||
|
"plt.title('Actual vs Predicted Price')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|