AIM-PIbd-32-Kurbanova-A-A/lab_2/lab_2.ipynb

1801 lines
481 KiB
Plaintext
Raw Normal View History

2024-11-01 23:46:45 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Начало лабораторной работы №2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Цены на мобильные телефоны"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Unnamed: 0', 'Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram',\n",
" 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n",
" 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n",
" 'Screen_resolution', 'Processor', 'Processor_name'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"data = pd.read_csv(\"../static/csv/mobile phone price prediction.csv\",delimiter=',')\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проблема область: Данные о мобильных телефонах, включая их характеристику.\\\n",
"Объект наблюдения: Мобильные телефоны.\\\n",
"Атрибуты: Имя, рейтинг, оценка, поддержка двух SIM-карт, оперативная память, аккумулятор, дисплей, камера, внешняя память, версия Android телефона, цена, компания производителя, разрешение экрана, харатеристика процессора, название процессора.\\\n",
"Пример бизнес-цели: \n",
"1. Анализ данных: Изучение и очистка данных для выявления закономерностей и корреляций между характеристиками мобильных телефонов и их ценами.\n",
"2. Разработка модели: Создание и обучение модели машинного обучения, которая будет прогнозировать цены на мобильные телефоны на основе их характеристик.\n",
"3. Внедрение: Интеграция модели в систему ценообразования компании для автоматического расчета цен на мобильные телефоны.\n",
"\n",
"\n",
"Актуальность: Данный датасет является актуальным и ценным ресурсом для компаний, занимающихся продажей мобильных телефонов, а также для исследователей и инвесторов, поскольку он предоставляет обширную информацию о ценах и характеристиках мобильных телефонов на вторичном рынке. Эти данные могут быть использованы для разработки моделей прогнозирования цен, анализа рыночных тенденций и принятия обоснованных бизнес-решений."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: xlabel='Spec_score', ylabel='Price'>"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA3AAAAINCAYAAAB/IZ18AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAClRElEQVR4nOzdfXxT9dk/8E/SJE3SNmltpFClWG0FhBZQFLWFTWUiohvYuYluN09u95ygTjcBFUURBJx7AHVucz7s/glObwab+LBbZYogQ8Uqz0iVAa5QaGmT5ql5/P2BiSTn5KTntElPks/79fJ13+ac0+/35KQuV6/v97o04XA4DCIiIiIiIlI9bV9PgIiIiIiIiLqHARwREREREVGGYABHRERERESUIRjAERERERERZQgGcERERERERBmCARwREREREVGGYABHRERERESUIRjAERERERERZQgGcGkUDofhcDjA3ulERERERKQEA7g06uzshNVqRWdnZ19PhYiIiIiIMhADOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMgQDOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMgQDOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMgQDOCIiIiIiogzBAI6IiIiIiChDMIAjIiIiIiLKEAzgiIiIiIiIMoSurydARERERETZw+72odXpg8Prh8Wkh63AAKvZ0NfTyhoM4IiIiIiIqFc0d3gwd812vLe/NfrauGobljbUorzY1Iczyx5cQklERERERD1md/sEwRsAbNzfinlrtsPu9vXRzLILAzgiIiIiIuqxVqdPELxFbNzfilYnA7jewACOiIiIiIh6zOH1Sx7vTHKcuocBHBERERER9ZjFqJc8XpTkOHUPAzgiIiIiIuoxW6EB46ptosfGVdtgK2Qlyt7AAI6IiIiIiHrMajZgaUOtIIgbV23DsoZathLoJZpwOBzu60nkCofDAavVCrvdDovF0tfTISIiIiLqdZE+cJ1eP4qMetgK2QeuN7EPHBERERER9RqrmQFbKnEJJRERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGYIBHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGaJPA7hHHnkEF154IYqKitCvXz9MnjwZ+/btiznH6/Xi1ltvRWlpKQoLC9HQ0ICWlpaYcw4dOoRJkybBbDajX79++MUvfoFAIBBzzjvvvIPzzz8f+fn5qKqqwnPPPSeYzxNPPIGzzjoLRqMRY8aMwQcffCB7LkREREREJI/d7cPnx5xoPNSOz487YXf7+npKqtWnAdy7776LW2+9Ff/617/w5ptvwu/348orr4TL5Yqe87Of/QyvvPIKXn75Zbz77rtobm7GddddFz0eDAYxadIk+Hw+vP/++3j++efx3HPP4f7774+ec+DAAUyaNAmXXXYZPvnkE9xxxx24+eab8Y9//CN6zl/+8hfceeedeOCBB/Dxxx9jxIgRmDBhAo4dO9btuRARERERkTzNHR7MXt2IK371LqY8+T6ueOxdzFndiOYOT19PTZU04XA43NeTiDh+/Dj69euHd999F+PGjYPdbsfpp5+OVatW4bvf/S4AYO/evRg6dCi2bNmCiy++GK+//jquueYaNDc3o6ysDADw1FNPYe7cuTh+/DgMBgPmzp2LV199FTt37oyOdcMNN6CjowNvvPEGAGDMmDG48MIL8fjjjwMAQqEQBg4ciDlz5mDevHndmksyDo
cDVqsVdrsdFoulV987IiIiIqJMY3f7MHt1I97b3yo4Nq7ahpVTR8FqNvTBzNRLVXvg7HY7AOC0004DAGzbtg1+vx/jx4+PnjNkyBBUVFRgy5YtAIAtW7agpqYmGrwBwIQJE+BwOLBr167oOaf+jMg5kZ/h8/mwbdu2mHO0Wi3Gjx8fPac7c4nX1dUFh8MR8w8REREREZ3U6vSJBm8AsHF/K1qdXEoZTzUBXCgUwh133IG6ujoMHz4cAHD06FEYDAYUFxfHnFtWVoajR49Gzzk1eIscjxyTOsfhcMDj8aC1tRXBYFD0nFN/RrK5xHvkkUdgtVqj/wwcOLCb7wYRERERUfZzeP2SxzuTHM9Fqgngbr31VuzcuRMvvvhiX0+l18yfPx92uz36z+HDh/t6SkREREREqmEx6iWPFyU5notUEcDNnj0b69evxz//+U+ceeaZ0df79+8Pn8+Hjo6OmPNbWlrQv3//6DnxlSAj/57sHIvFApPJBJvNhry8PNFzTv0ZyeYSLz8/HxaLJeYfIiIiIiI6yVZowLhqm+ixcdU22Aq5/y1enwZw4XAYs2fPxtq1a7FhwwZUVlbGHL/gggug1+vx9ttvR1/bt28fDh06hEsuuQQAcMkll2DHjh0x1SLffPNNWCwWnHfeedFzTv0ZkXMiP8NgMOCCCy6IOScUCuHtt9+OntOduRARERERUfdZzQYsbagVBHHjqm1Y1lDLAiYi+rQK5U9/+lOsWrUKf/vb3zB48ODo61arFSaTCQBwyy234LXXXsNzzz0Hi8WCOXPmAADef/99ACfbCIwcORLl5eVYvnw5jh49ih/+8Ie4+eabsWTJEgAn2wgMHz4ct956K2bOnIkNGzbgtttuw6uvvooJEyYAONlGYNq0afj973+Piy66CL/5zW/w0ksvYe/evdG9ccnmkgyrUBIRERERCdndPrQ6fej0+lFk1MNWaGDwlkCfBnAajUb09WeffRbTp08HcLJ59l133YXVq1ejq6sLEyZMwJNPPhmzbPHgwYO45ZZb8M4776CgoADTpk3D0qVLodPpoue88847+NnPfobdu3fjzDPPxIIFC6JjRDz++ON49NFHcfToUYwcORIrVqzAmDFjose7MxcpDOCIiIiIiKgnVNUHLtsxgCMiIiIiop5QRRETIiIiIiIiSo4BHBERERERUYZgAEdERERERJQhGMARERERERFlCAZwREREREREGUKX/BQiIiIiIspFkf5sDq8fFpMetgL2Z+trDOCIiIiIiEigucODuWu24739rdHXxlXbsLShFuXFpj6cWW7jEkoiIiIiIophd/sEwRsAbNzfinlrtsPu9vXRzIgBHBERERERxWh1+gTBW8TG/a1odTKA6ysM4IiIiIiIKIbD65c83pnkOKUOAzgiIiIiIophMeoljxclOU6pwwCOiIiIiIhi2AoNGFdtEz02rtoGWyErUfYVBnBERERERBTDajZgaUOtIIgbV23DsoZathLoQ5pwOBzu60nkCofDAavVCrvdDovF0tfTISIiIqIcI7evW+T8Tq8fRUY9bIXsA9fX2AeOiIiIiCgHKOnrZjUzYFMbLqEkIiIiIspy7OuWPRjAERERERFlOfZ1yx4M4IiIiIiIshz7umUPBnBERERERFmOfd2yBwM4IiIiIqIsx75u2YMBHBERERFRlrOaDXh48nDUV5XGvF5fVYqHJw9npckMwj5wacQ+cERERETUF+xuH+56+VMMGWDBqIHF6AqEkK/TovFwB/YdceCX149gEJch2AeOiIiIiCjLtTp9eGvPMby151jC4wzgMgOXUBIRERERZTlWocweDOCIiIiIiLIcq1BmDwZwRERERERZjlUoswcDOCIiIiKiLGc1G7C0oVYQxI2rtmFZQy33v2UQVqFMI1ahJCIiIqK+ZHf70Or0odPrR5FRD1uhgcFbhmEVSiIiIiKiHGE1M2DLdAzgiIiIiIgyUCSb5vD6YTHpYStgcJYLGM
AREREREWWY5g4P5q7Zjvf2t0ZfG1dtw9KGWpQXm/pwZpRqLGJCRERERJRB7G6fIHgDgI37WzFvzXbY3b4+mhmlAwM
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Загрузка данных\n",
"data = pd.read_csv(\"../static/csv/mobile phone price prediction.csv\",delimiter=',')\n",
"data.drop(['Unnamed: 0'], axis=1, inplace=True)\n",
"data['Price'] = data['Price'].str.replace(',', '').astype(float)\n",
"data.describe(include='all')\n",
"f, ax = plt.subplots(figsize=(10,6))\n",
"sns.despine(f)\n",
"sns.scatterplot(data=data, x='Spec_score', y='Price')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"При проверке на шум можно заметить выброс в 75 оценке. Цена там запредельная.\n",
"\n",
"Для удаления выбросов из датасета можно использовать метод межквартильного размаха. Зашумленность не очень высокая. Покрытие данных высокое и подошло бы для поставленной задачи по актуальности."
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXwU9f0/8NfM7M7e2dwH4QiEI1wtBREBKVaRSD3qfVYBr9YC1qtVf16oX0WlilIVqlVAaivaqq22CAiiFtCieAUJBCQm5IQcu5u9Zmfm8/sj7pLNQXaTSXaSvJ+PRx6a3eGz77k2+97P5/P+cIwxBkIIIYQQQgghvY5PdACEEEIIIYQQMlBRQkYIIYQQQgghCUIJGSGEEEIIIYQkCCVkhBBCCCGEEJIglJARQgghhBBCSIJQQkYIIYQQQgghCUIJGSGEEEIIIYQkCCVkhBBCCCGEEJIglJARQgghhBBCSIJQQkYI0aXS0lJwHIe1a9cmOpQo7733HiZNmgSz2QyO49DY2JjokEg/wHEcli5d2uuvq6oqJkyYgEceeaTLbaxfvx4FBQUwGo1ITk7WLjjSxmmnnYbTTjst0WH0qry8PCxYsCDy+/bt28FxHLZv397rsZxyyin4/e9/3+uvS/o/SsgI6WXffPMNLr74YgwbNgxmsxm5ubk488wz8cc//rHHXvOvf/0rnn766TaPV1ZWYunSpfjyyy977LVbC/8xDf8YjUaMGDEC11xzDb777jtNXmPnzp1YunSp5slSXV0dLr30UlgsFjz33HNYv349bDZbh9sn4lyT3nfaaadFXdOpqamYOnUqXn75ZaiqmujwTuhvf/sbysvLsXjx4naff/7558FxHKZNm9bu88XFxViwYAHy8/Px4osv4oUXXoDP58PSpUt79QNz+Auc8A/P80hNTcW8efOwa9euXoujvwuFQli5ciWmTp0Kh8MBu92OqVOnYuXKlQiFQl1ut6fes7V255134rnnnkN1dXWiQyH9DSOE9JodO3YwURTZyJEj2cMPP8xefPFFdv/997O5c+ey/Pz8Hnvds88+mw0bNqzN47t372YA2Jo1a3rstVv74IMPGAB28803s/Xr17OXX36ZLV68mImiyFJTU1lFRQVjjLHDhw93Obbly5czAOzw4cOaxr5x40YGgG3ZsqXTbRN1rknvmz17Nhs8eDBbv349W79+PXvqqafYpEmTGAB25513xtSG3+9noVCohyNt68c//jG78cYbO3x+xowZLC8vjwFgJSUlbZ5ftWpVm+eOHj3KALAHHnigJ0JuV/j94oorrmDr169na9euZf/v//0/lpyczEwmE/v66697LZaeFAwGWTAYTMhrNzU1sdmzZzMA7JxzzmHPPvsse/7559l5553HALDZs2ezpqamLrV9ovfsYcOGsfnz50d+VxSF+f1+pihKF/ek6xRFYdnZ2ey+++7r9dcm/ZshIVkgIQPUI488AqfTid27d7cZ2lNbW5uYoHqA1+s9Yc8RAMyaNQsXX3wxAGDhwoUYPXo0br75Zqxbtw533313b4QZt/A5imVY1kA51z3N5/PBarUmOoxOOZ1O/PKXv4z8/qtf/QpjxozBs88+i4cffhhGo7HNv1FVFZIkwWw2w2w292a4AIAvvvgCX331FZ588sl2nz98+DB27tyJN998E7/61a/w6quv4oEHHojaJp57ortieV+ZPHly1HmYNWsW5s2bh1WrVuH555/v6RB7nCiKCXvt2267DR9++CH++Mc/RvWo3nTTTXjuueewePFi3HHHHVi1alWPxsHzvKb3SyzXVcvXvvjii/HKK6/gwQcfBMdxmsVBBrhEZ4SEDCRjxoxhp512Wszbr1+/nk2dOpVZLBaWnJzMZs2axTZt2hR5/u2332Y///nPWU5ODhNFkY0YMYI99NBDTJblyDbhbzRb/gwbNizSU9X6p2WP1CeffMIKCwtZUlISs1gs7Kc//S
n773//GxXjAw88wACwvXv3siuuuIIlJyezSZMmdbhP4dd94403oh4vKipiANgNN9zAGOu4h2zr1q3s1FNPZVarlTmdTnbeeeexb7/9tk08rX866y17/fXX2eTJk5nZbGZpaWnsqquuYkeOHDnhcWz5rW1r8ZxrAGzRokXsL3/5Cxs9ejQzmUxs8uTJ7MMPP2yz7ZEjR9jChQtZZmYmE0WRjRs3jr300ktttvP7/eyBBx5go0aNYiaTiWVnZ7MLLriAHTx4MKaYGGPswIED7MILL2RZWVnMZDKx3Nxcdtlll7HGxsao7Tq7Thlj7LnnnmPjxo1joiiynJwc9pvf/IY1NDREbTN79mw2fvx49tlnn7FZs2Yxi8XCfvvb3zLGGAsEAuz+++9n+fn5TBRFNnjwYPa73/2OBQKBE+7DokWLmM1mY16vt81zl19+OcvKyorcL7t372Zz585laWlpzGw2s7y8PLZw4cJOj1M47tYuvvhiBiDS69vyPI8bN44ZDAb21ltvRZ5r3aN05MgRdu2110bu77y8PPbrX/86qoekoaGB/fa3v2WDBw9moiiy/Px89thjj8XUe3D//fczURSZJEntPv/www+zlJQUFgwG2U033cRGjRoV9fywYcPavSfau/9a7tu+ffvYRRddxFJSUpjJZGJTpkxh//znP6PaXrNmDQPAtm/fzm666SaWkZHBkpOTO9yX8PvF8uXLox5vampiANjcuXOjHo/luLVs89lnn2XDhw9nFouFnXnmmaysrIypqsoeeughlpuby8xmMzvvvPNYXV1dm9g6u/bjuUZnz57NZs+eHXk+/H66YcMG9n//938sNzeXmUwmdvrpp7fboxneD7PZzKZOnco++uijNm22p7y8nAmCwE4//fQOt/nZz37GDAYDKy8vjzp+7Y1yaHlNdPae3bqHLLzPH3zwQVSb3f17VVVVxRYsWMByc3OZKIosOzubnXfeeW3+dvzzn/9kANiePXtOeMwIiQf1kBHSi4YNG4Zdu3ahqKgIEyZMOOG2Dz74IJYuXYoZM2bgoYcegiiK+PTTT7Ft2zbMnTsXALB27VrY7XbcdtttsNvt2LZtG+6//3643W4sX74cAHDPPffA5XLhyJEjWLFiBQDAbrdj7NixeOihh3D//ffjxhtvxKxZswAAM2bMAABs27YN8+bNw5QpU/DAAw+A53msWbMGp59+Oj7++GOcfPLJUfFecsklGDVqFB599FEwxuI+NocOHQIApKWldbjN+++/j3nz5mHEiBFYunQp/H4//vjHP2LmzJnYs2cP8vLycOGFF+LAgQP429/+hhUrViA9PR0AkJGR0WG7a9euxcKFCzF16lQsW7YMNTU1eOaZZ7Bjxw588cUXSE5Oxj333IMxY8bghRdewEMPPYThw4cjPz+/wzbjOdcA8OGHH2LDhg24+eabYTKZ8Pzzz+Oss87C//73v8i/r6mpwSmnnAKO47B48WJkZGRg48aNuO666+B2u3HLLbcAABRFwTnnnIOtW7fi8ssvx29/+1t4PB5s2bIFRUVFJ4w7TJIkFBYWIhgMYsmSJcjOzkZFRQXeffddNDY2wul0AojtOl26dCkefPBBzJkzBzfddBP279+PVatWYffu3dixY0dU71FdXR3mzZuHyy+/HL/85S+RlZUFVVVx3nnn4b///S9uvPFGjB07Ft988w1WrFiBAwcO4O233+5wPy677DI899xz+Pe//41LLrkk8rjP58M777yDBQsWQBAE1NbWYu7cucjIyMBdd92F5ORklJaW4s033+z0WHXku+++gyAIUb1H27Ztw+uvv47FixcjPT0deXl57f7byspKnHzyyWhsbMSNN96IgoICVFRU4O9//zt8Ph9EUYTP58Ps2bNRUVGBX/3qVxg6dCh27tyJu+++G1VVVe3OG21p586dmDBhQru9dwDw6quv4sILL4Qoirjiiisi52zq1KkAgKeffhqvvPIK3nrrLaxatQp2ux
0TJ07EKaecgptuugkXXHABLrzwQgDAj370IwDA3r17MXPmTOTm5uKuu+6CzWbD66+/jvPPPx//+Mc/cMEFF0TF8Jv
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество строк до удаления выбросов: 1370\n",
"Количество строк после удаления выбросов: 1256\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"df['Spec_score'] = df['Spec_score'].astype(int)\n",
"df['Price'] = df['Price'].str.replace(',', '').astype(float)\n",
"# Выбор столбцов для анализа\n",
"column1 = 'Spec_score'\n",
"column2 = 'Price'\n",
"\n",
"\n",
"# Функция для удаления выбросов\n",
"def remove_outliers(df, column):\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n",
"\n",
"# Удаление выбросов для каждого столбца\n",
"df_cleaned = df.copy()\n",
"for column in [column1, column2]:\n",
" df_cleaned = remove_outliers(df_cleaned, column)\n",
"\n",
"# Построение точечной диаграммы после удаления выбросов\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df_cleaned[column1], df_cleaned[column2], alpha=0.5)\n",
"plt.xlabel(column1)\n",
"plt.ylabel(column2)\n",
"plt.title(f'Scatter Plot of {column1} vs {column2} (After Removing Outliers)')\n",
"plt.show()\n",
"\n",
"# Вывод количества строк до и после удаления выбросов\n",
"print(f\"Количество строк до удаления выбросов: {len(df)}\")\n",
"print(f\"Количество строк после удаления выбросов: {len(df_cleaned)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь очистим датасет от пустых строк"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Общая информация о датасете:\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1370 entries, 0 to 1369\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Unnamed: 0 1370 non-null int64 \n",
" 1 Name 1370 non-null object \n",
" 2 Rating 1370 non-null float64\n",
" 3 Spec_score 1370 non-null int64 \n",
" 4 No_of_sim 1370 non-null object \n",
" 5 Ram 1370 non-null object \n",
" 6 Battery 1370 non-null object \n",
" 7 Display 1370 non-null object \n",
" 8 Camera 1370 non-null object \n",
" 9 External_Memory 1370 non-null object \n",
" 10 Android_version 927 non-null object \n",
" 11 Price 1370 non-null object \n",
" 12 company 1370 non-null object \n",
" 13 Inbuilt_memory 1351 non-null object \n",
" 14 fast_charging 1281 non-null object \n",
" 15 Screen_resolution 1368 non-null object \n",
" 16 Processor 1342 non-null object \n",
" 17 Processor_name 1370 non-null object \n",
"dtypes: float64(1), int64(2), object(15)\n",
"memory usage: 192.8+ KB\n",
"None\n",
"Общая информация о датасете:\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1370 entries, 0 to 1369\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Unnamed: 0 1370 non-null int64 \n",
" 1 Name 1370 non-null object \n",
" 2 Rating 1370 non-null float64\n",
" 3 Spec_score 1370 non-null int64 \n",
" 4 No_of_sim 1370 non-null object \n",
" 5 Ram 1370 non-null object \n",
" 6 Battery 1370 non-null object \n",
" 7 Display 1370 non-null object \n",
" 8 Camera 1370 non-null object \n",
" 9 External_Memory 1370 non-null object \n",
" 10 Android_version 927 non-null object \n",
" 11 Price 1370 non-null object \n",
" 12 company 1370 non-null object \n",
" 13 Inbuilt_memory 1351 non-null object \n",
" 14 fast_charging 1281 non-null object \n",
" 15 Screen_resolution 1368 non-null object \n",
" 16 Processor 1342 non-null object \n",
" 17 Processor_name 1370 non-null object \n",
"dtypes: float64(1), int64(2), object(15)\n",
"memory usage: 192.8+ KB\n",
"None\n",
"\n",
"Таблица анализа пропущенных значений:\n",
" Количество пропущенных значений \\\n",
"Unnamed: 0 0 \n",
"Name 0 \n",
"Rating 0 \n",
"Spec_score 0 \n",
"No_of_sim 0 \n",
"Ram 0 \n",
"Battery 0 \n",
"Display 0 \n",
"Camera 0 \n",
"External_Memory 0 \n",
"Android_version 443 \n",
"Price 0 \n",
"company 0 \n",
"Inbuilt_memory 19 \n",
"fast_charging 89 \n",
"Screen_resolution 2 \n",
"Processor 28 \n",
"Processor_name 0 \n",
"\n",
" Процент пропущенных значений \n",
"Unnamed: 0 0.000000 \n",
"Name 0.000000 \n",
"Rating 0.000000 \n",
"Spec_score 0.000000 \n",
"No_of_sim 0.000000 \n",
"Ram 0.000000 \n",
"Battery 0.000000 \n",
"Display 0.000000 \n",
"Camera 0.000000 \n",
"External_Memory 0.000000 \n",
"Android_version 32.335766 \n",
"Price 0.000000 \n",
"company 0.000000 \n",
"Inbuilt_memory 1.386861 \n",
"fast_charging 6.496350 \n",
"Screen_resolution 0.145985 \n",
"Processor 2.043796 \n",
"Processor_name 0.000000 \n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
"\n",
"# Вывод общей информации о датасете\n",
"print(\"Общая информация о датасете:\")\n",
"print(df.info())\n",
"\n",
"# Вывод общей информации о датасете\n",
"print(\"Общая информация о датасете:\")\n",
"print(df.info())\n",
"\n",
"# Вывод таблицы анализа пропущенных значений\n",
"missing_values = df.isnull().sum()\n",
"missing_values_percentage = (missing_values / len(df)) * 100\n",
"missing_data = pd.concat([missing_values, missing_values_percentage], axis=1, keys=['Количество пропущенных значений', 'Процент пропущенных значений'])\n",
"\n",
"print(\"\\nТаблица анализа пропущенных значений:\")\n",
"print(missing_data)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(817, 18)\n",
"Unnamed: 0 False\n",
"Name False\n",
"Rating False\n",
"Spec_score False\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version False\n",
"Price False\n",
"company False\n",
"Inbuilt_memory False\n",
"fast_charging False\n",
"Screen_resolution False\n",
"Processor False\n",
"Processor_name False\n",
"dtype: bool\n"
]
}
],
"source": [
"df.dropna(inplace=True)\n",
"\n",
"print(df.shape)\n",
"\n",
"print(df.isnull().any())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создадим выборки."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 822\n",
"Размер контрольной выборки: 274\n",
"Размер тестовой выборки: 274\n"
]
}
],
"source": [
"# Разделение на обучающую и тестовую выборки\n",
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проанализируем сбалансированность выборки"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение company в обучающей выборке:\n",
"company\n",
"Vivo 118\n",
"Realme 111\n",
"Samsung 98\n",
"Motorola 77\n",
"Xiaomi 56\n",
"Honor 54\n",
"Poco 45\n",
"Huawei 43\n",
"OnePlus 43\n",
"iQOO 29\n",
"OPPO 24\n",
"Oppo 19\n",
"Lava 15\n",
"TCL 13\n",
"Google 13\n",
"POCO 11\n",
"Lenovo 10\n",
"itel 9\n",
"Asus 9\n",
"Tecno 7\n",
"LG 5\n",
"Nothing 5\n",
"Gionee 5\n",
"Itel 2\n",
"Coolpad 1\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение company в контрольной выборке:\n",
"company\n",
"Samsung 44\n",
"Vivo 36\n",
"Realme 35\n",
"Motorola 26\n",
"Xiaomi 20\n",
"OnePlus 17\n",
"iQOO 17\n",
"Honor 16\n",
"Poco 13\n",
"Huawei 9\n",
"Google 6\n",
"OPPO 5\n",
"Nothing 5\n",
"POCO 4\n",
"Asus 4\n",
"TCL 4\n",
"itel 3\n",
"Oppo 3\n",
"Lenovo 2\n",
"Lava 2\n",
"Tecno 2\n",
"IQOO 1\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение company в тестовой выборке:\n",
"company\n",
"Realme 40\n",
"Samsung 39\n",
"Vivo 32\n",
"Motorola 24\n",
"Honor 18\n",
"Poco 17\n",
"OnePlus 15\n",
"Xiaomi 14\n",
"iQOO 11\n",
"Huawei 10\n",
"OPPO 9\n",
"TCL 9\n",
"Asus 8\n",
"Oppo 5\n",
"Nothing 5\n",
"POCO 4\n",
"Google 4\n",
"Tecno 4\n",
"Lenovo 2\n",
"Lava 2\n",
"LG 1\n",
"Itel 1\n",
"Name: count, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name):\n",
" counts = df['company'].value_counts()\n",
" print(f\"Распределение company в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборки не сбалансированы, и для улучшения качества модели рекомендуется провести аугментацию данных."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение company в обучающей выборке после oversampling:\n",
"company\n",
"TCL 118\n",
"Vivo 118\n",
"Realme 118\n",
"Samsung 118\n",
"Huawei 118\n",
"LG 118\n",
"POCO 118\n",
"Xiaomi 118\n",
"Motorola 118\n",
"Honor 118\n",
"Poco 118\n",
"Lava 118\n",
"OPPO 118\n",
"Tecno 118\n",
"OnePlus 118\n",
"Oppo 118\n",
"Asus 118\n",
"iQOO 118\n",
"Google 118\n",
"itel 118\n",
"Lenovo 118\n",
"Nothing 118\n",
"Gionee 118\n",
"Itel 118\n",
"Coolpad 118\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение company в контрольной выборке после oversampling:\n",
"company\n",
"Xiaomi 44\n",
"Motorola 44\n",
"Honor 44\n",
"Samsung 44\n",
"OnePlus 44\n",
"Vivo 44\n",
"iQOO 44\n",
"Nothing 44\n",
"Lenovo 44\n",
"Realme 44\n",
"Poco 44\n",
"Oppo 44\n",
"OPPO 44\n",
"Huawei 44\n",
"Google 44\n",
"POCO 44\n",
"Lava 44\n",
"itel 44\n",
"TCL 44\n",
"Tecno 44\n",
"Asus 44\n",
"IQOO 44\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение company в тестовой выборке после oversampling:\n",
"company\n",
"iQOO 40\n",
"OnePlus 40\n",
"Asus 40\n",
"Honor 40\n",
"Vivo 40\n",
"Samsung 40\n",
"Xiaomi 40\n",
"Motorola 40\n",
"Realme 40\n",
"Poco 40\n",
"Lenovo 40\n",
"TCL 40\n",
"OPPO 40\n",
"Oppo 40\n",
"Huawei 40\n",
"Lava 40\n",
"Tecno 40\n",
"Google 40\n",
"POCO 40\n",
"LG 40\n",
"Itel 40\n",
"Nothing 40\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение company в обучающей выборке после undersampling:\n",
"company\n",
"Asus 1\n",
"Coolpad 1\n",
"Gionee 1\n",
"Google 1\n",
"Honor 1\n",
"Huawei 1\n",
"Itel 1\n",
"LG 1\n",
"Lava 1\n",
"Lenovo 1\n",
"Motorola 1\n",
"Nothing 1\n",
"OPPO 1\n",
"OnePlus 1\n",
"Oppo 1\n",
"POCO 1\n",
"Poco 1\n",
"Realme 1\n",
"Samsung 1\n",
"TCL 1\n",
"Tecno 1\n",
"Vivo 1\n",
"Xiaomi 1\n",
"iQOO 1\n",
"itel 1\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение company в контрольной выборке после undersampling:\n",
"company\n",
"Asus 1\n",
"Google 1\n",
"Honor 1\n",
"Huawei 1\n",
"IQOO 1\n",
"Lava 1\n",
"Lenovo 1\n",
"Motorola 1\n",
"Nothing 1\n",
"OPPO 1\n",
"OnePlus 1\n",
"Oppo 1\n",
"POCO 1\n",
"Poco 1\n",
"Realme 1\n",
"Samsung 1\n",
"TCL 1\n",
"Tecno 1\n",
"Vivo 1\n",
"Xiaomi 1\n",
"iQOO 1\n",
"itel 1\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение company в тестовой выборке после undersampling:\n",
"company\n",
"Asus 1\n",
"Google 1\n",
"Honor 1\n",
"Huawei 1\n",
"Itel 1\n",
"LG 1\n",
"Lava 1\n",
"Lenovo 1\n",
"Motorola 1\n",
"Nothing 1\n",
"OPPO 1\n",
"OnePlus 1\n",
"Oppo 1\n",
"POCO 1\n",
"Poco 1\n",
"Realme 1\n",
"Samsung 1\n",
"TCL 1\n",
"Tecno 1\n",
"Vivo 1\n",
"Xiaomi 1\n",
"iQOO 1\n",
"Name: count, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"# Разделение на обучающую и тестовую выборки\n",
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"def check_balance(df, name):\n",
" counts = df['company'].value_counts()\n",
" print(f\"Распределение company в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"def oversample(df):\n",
" X = df.drop('company', axis=1)\n",
" y = df['company']\n",
" \n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
" \n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")\n",
"\n",
"def undersample(df):\n",
" X = df.drop('company', axis=1)\n",
" y = df['company']\n",
" \n",
" undersampler = RandomUnderSampler(random_state=42) # type: ignore\n",
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
" \n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_undersampled = undersample(train_df)\n",
"val_df_undersampled = undersample(val_df)\n",
"test_df_undersampled = undersample(test_df)\n",
"\n",
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Цены на автомобили\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n",
" 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n",
" 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
" 'Airbags'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проблемная область: Данные о ценах на автомобили, включая их характеристики\n",
"\n",
"Объект наблюдения: автомобиль\n",
"\n",
"Атрибуты: идентификатор, цена, налог, производитель, модель, год производства, категория, наличие кожаного салона, тип топлива, объем двигателя, пробег автомобиля, количество цилиндров в двигателе, тип коробки передач, тип привода, количество дверей, расположение руля, цвет, количество подушек безопасностей.\n",
"\n",
"Пример бизнес-цели: \n",
"1. Анализ данных: Изучение и очистка данных для выявления закономерностей и корреляций между характеристиками автомобилей и их ценами.\n",
"2. Разработка модели: Создание и обучение модели машинного обучения, которая будет прогнозировать цены на автомобили на основе их характеристик.\n",
"3. Внедрение: Интеграция модели в систему ценообразования компании для автоматического расчета цен на автомобили.\n",
"\n",
"\n",
"Актуальность: Данный датасет является актуальным и ценным ресурсом для компаний, занимающихся продажей автомобилей, а также для исследователей и инвесторов, поскольку он предоставляет обширную информацию о ценах и характеристиках автомобилей на вторичном рынке. Эти данные могут быть использованы для разработки моделей прогнозирования цен, анализа рыночных тенденций и принятия обоснованных бизнес-решений."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABWlUlEQVR4nO3dd3gU5f7+8XsT0kNCS6EECCAlgDQBg0JAEIgcBNGDIl0F9ICKYOOIUjwaFQuoKHr8AgrHI6ICCgqG3qIUAWkiYChKQieBAAlJnt8f/LKHJWU2Ickm8H5d1166M8/OfObZyTL3zsyzNmOMEQAAAAAgV26uLgAAAAAASjqCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwCUEgcOHJDNZtPMmTNdXYqDxYsXq2nTpvL29pbNZtOZM2eKbF2DBg1SzZo1i2z5NyJX7lczZ86UzWbTgQMHin3dAJBfBCcALrd9+3bdd999qlGjhry9vVW1alXdeeedeu+994psnZ9//rkmT56cbfqRI0c0fvx4bd26tcjWfbWVK1fKZrPZHx4eHqpVq5YGDBigP/74o1DWsX79eo0fP77QQ83JkyfVu3dv+fj4aOrUqZo1a5b8/PxybJt1kJz18Pb2Vt26dTVixAgdPXq0UOsqTcaPH+/QL76+voqIiNDYsWOVnJzs6vIKxauvvqr58+e7ugxJUlxcnNzc3DRmzJgc57/++uuy2WxatGhRMVcGoKQr4+oCANzY1q9frw4dOqh69eoaMmSIQkNDdfjwYf3000+aMmWKHn/88SJZ7+eff64dO3Zo5MiRDtOPHDmiCRMmqGbNmmratGmRrDs3TzzxhFq2bKlLly7pl19+0ccff6xFixZp+/btqlKlyjUte/369ZowYYIGDRqkcuXKFU7BkjZu3KizZ8/q5ZdfVqdOnZx6zcSJExUeHq6LFy9q7dq1+vDDD/X9999rx44d8vX1zfO1//73v5WZmVkYpZc4H374ofz9/XXu3Dn9+OOPeuWVV7R8+XKtW7dONpvN1eVdk1dffVX33Xefevbs6TC9f//+euCBB+Tl5VVstURGRmrYsGF666231K9fPzVs2NA+7+DBg5o4caL+/ve/q1u3bsVWE4DSgeAEwKVeeeUVBQYGauPGjdkO6I8dO+aaoopASkpKrmdisrRt21b33XefJGnw4MGqW7eunnjiCX366ae5fjvualnvUX7CWHR0tG655RZJ0iOPPKKKFSvq7bff1oIFC9SnT58cX5PVfx4eHtdcc0l13333qVKlSpKkRx99VPfee6+++eYb/fTTT4qMjMzxNefPn7cMmyWZu7u73N3di329r732mhYsWKBhw4ZpzZo19mD6+OOPy8PDQ1OmTCmWOkr7+wfcaLhUD4BL7d+/Xw0bNszxwDs4ODjbtNmzZ6tVq1by9fVV+fLl1a5dO/3444/2+QsWLFC3bt1UpUoVeXl5qXbt2nr55ZeVkZFhb9O+fXstWrRIBw8etF8eVbNmTa1cuVItW7aUdDm4ZM278t6Pn3/+WV27dlVgYKB8fX0VFRWldevWOdSYdenVrl279OCDD6p8+fK6/fbb8903d9xxhyQpPj4+z3bLly9X27Zt5efnp3LlyqlHjx7avXu3Qz3PPPOMJCk8PNy+XVb3lcydO1ctWrSQj4+PKlWqpH79+umvv/6yz2/fvr0GDhwoSWrZsqVsNpsGDRp0zds5aNAg+fv7a//+/brrrrtUtmxZ9e3b1z7v6nucMjMzNWXKFDVu3Fje3t4KCgpS165dtWnTJod2s2fPtm9PhQoV9MADD+jw4cN51vbVV1/JZrNp1apV2eZ99NFHstls2rFjhyQpMTFRgwcPVrVq1eTl5aXKlSurR48eBb5/5+p+ad++vRo1aqTNmzerXbt28vX11T//+U9JlwPsww8/rJCQEH
l7e6tJkyb69NNPsy3zzJkzGjRokAIDA1WuXDkNHDgwx8s327dvr/bt22ebXpD+t9lsSklJ0aeffmrf97L2k9zucfrggw/UsGFDeXl5qUqVKho+fHi2OrP6Y9euXerQoYN8fX1VtWpVvfHGGxY9KwUGBmrKlClat26dPvnkE0nSvHnz9N133+m1115T5cqVlZmZqcmTJ6thw4by9vZWSEiIhg0bptOnTzssy5nPnCvrzen9A1A6cMYJgEvVqFFDcXFx2rFjhxo1apRn2wkTJmj8+PFq06aNJk6cKE9PT/38889avny5OnfuLOnygZi/v79GjRolf39/LV++XC+99JKSk5M1adIkSdILL7ygpKQk/fnnn3rnnXckSf7+/mrQoIEmTpyol156SUOHDlXbtm0lSW3atJF0OaBER0erRYsWGjdunNzc3DRjxgzdcccdWrNmjVq1auVQ79///nfddNNNevXVV2WMyXff7N+/X5JUsWLFXNssXbpU0dHRqlWrlsaPH68LFy7ovffe02233aZffvlFNWvWVK9evfT777/rv//9r9555x37WY2goKBclztz5kwNHjxYLVu2VExMjI4ePWo/0NyyZYvKlSunF154QfXq1dPHH39sv/yudu3ahbKd6enp6tKli26//Xa9+eabeX4r//DDD2vmzJmKjo7WI488ovT0dK1Zs0Y//fST/czWK6+8ohdffFG9e/fWI488ouPHj+u9995Tu3bt7NuTk27dusnf319ffvmloqKiHObNmTNHDRs2tO+39957r3bu3KnHH39cNWvW1LFjxxQbG6tDhw4VaECLnPrl5MmTio6O1gMPPKB+/fopJCREFy5cUPv27bVv3z6NGDFC4eHhmjt3rgYNGqQzZ87oySeflCQZY9SjRw+tXbtWjz76qBo0aKB58+bZw29BWfX/rFmz9Mgjj6hVq1YaOnSoJOW5n4wfP14TJkxQp06d9Nhjj2nPnj368MMPtXHjRq1bt87hrOPp06fVtWtX9erVS71799ZXX32l5557To0bN1Z0dHSedWddjvfcc8+pY8eOevLJJ9WmTRsNGzZMkjRs2DD738ETTzyh+Ph4vf/++9qyZYtDHc585mTJ6f0DUIoYAHChH3/80bi7uxt3d3cTGRlpnn32WbNkyRKTlpbm0G7v3r3Gzc3N3HPPPSYjI8NhXmZmpv3/z58/n20dw4YNM76+vubixYv2ad26dTM1atTI1nbjxo1GkpkxY0a2ddx0002mS5cu2dYXHh5u7rzzTvu0cePGGUmmT58+TvXBihUrjCQzffp0c/z4cXPkyBGzaNEiU7NmTWOz2czGjRuNMcbEx8dnq61p06YmODjYnDx50j5t27Ztxs3NzQwYMMA+bdKkSUaSiY+Pt6wnLS3NBAcHm0aNGpkLFy7Ypy9cuNBIMi+99JJ92owZM4wke415yWq7dOlSc/z4cXP48GHzxRdfmIoVKxofHx/z559/GmOMGThwoJFknn/++WzLGDhwoMP7tnz5ciPJPPHEE9naZr1PBw4cMO7u7uaVV15xmL99+3ZTpkyZbNOv1qdPHxMcHGzS09Pt0xISEoybm5uZOHGiMcaY06dPG0lm0qRJlv1wtaz9Zc+ePeb48eMmPj7efPTRR8bLy8uEhISYlJQUY4wxUVFRRpKZNm2aw+snT55sJJnZs2fbp6WlpZnIyEjj7+9vkpOTjTHGzJ8/30gyb7zxhr1denq6adu2bbb9KioqykRFRWWrtSD9b4wxfn5+ZuDAgdnaZO0TWfvlsWPHjKenp+ncubPD3/n7779v/xu5skZJ5rPPPrNPS01NNaGhoebee+/Ntq6cHDhwwPj5+ZkKFSoYDw8Ps337dmOMMWvWrDGSzH/+8x+H9osXL8423dnPnNzePwClB5fqAXCpO++8U3Fxcbr77ru1bds2vfHGG+rSpYuqVq2qb7/91t5u/vz5yszM1EsvvSQ3N8ePritvnPfx8bH//9mzZ3XixA
m1bdtW58+f12+//VbgOrdu3aq9e/fqwQcf1MmTJ3XixAmdOHFCKSkp6tixo1avXp1t0IJHH300X+t46KGHFBQUpCp
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Преобразуем год производства в целочисленный тип\n",
"df['Prod. year'] = df['Prod. year'].astype(int)\n",
"\n",
"# Визуализация данных\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df['Prod. year'], df['Price'])\n",
"plt.xlabel('Production Year')\n",
"plt.ylabel('Price')\n",
"plt.title('Scatter Plot of Price vs Production Year')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Зашумленность не очень высокая. Покрытие данных высокое и подошло бы для поставленной задачи по актуальности."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы:\n",
" ID Price Levy Manufacturer Model Prod. year Category \\\n",
"41 45797488 45734 1091 HYUNDAI H1 2016 Universal \n",
"72 45797480 43952 1249 HYUNDAI H1 2017 Universal \n",
"75 45624039 42337 - FORD Mustang 2016 Cabriolet \n",
"112 45731735 44752 1091 HYUNDAI H1 2016 Universal \n",
"172 45802937 43880 891 HYUNDAI Santa FE 2016 Jeep \n",
"... ... ... ... ... ... ... ... \n",
"19000 45646433 43278 1514 LEXUS GX 460 2011 Jeep \n",
"19056 45802290 44843 1091 HYUNDAI H1 2016 Universal \n",
"19089 45810098 44611 891 HONDA Civic 2016 Sedan \n",
"19136 45731793 41811 1249 HYUNDAI H1 2017 Universal \n",
"19175 45804283 42883 900 JEEP Compass 2015 Jeep \n",
"\n",
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"41 Yes Diesel 2.5 61057 km 4.0 \n",
"72 Yes Diesel 2.5 111643 km 4.0 \n",
"75 Yes Petrol 2.3 Turbo 75000 km 4.0 \n",
"112 Yes Diesel 2.5 86000 km 4.0 \n",
"172 Yes Diesel 2 113700 km 4.0 \n",
"... ... ... ... ... ... \n",
"19000 Yes Petrol 4.6 160138 km 8.0 \n",
"19056 Yes Diesel 2.5 133687 km 4.0 \n",
"19089 Yes Petrol 2 44914 km 4.0 \n",
"19136 Yes Diesel 2.5 146644 km 4.0 \n",
"19175 Yes Petrol 2.4 62200 km 4.0 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
"41 Automatic Front 04-May Left wheel Black 4 \n",
"72 Automatic Front 04-May Left wheel Grey 4 \n",
"75 Tiptronic Rear 02-Mar Left wheel Silver 6 \n",
"112 Automatic Front 04-May Left wheel Grey 4 \n",
"172 Automatic Front 04-May Left wheel Silver 4 \n",
"... ... ... ... ... ... ... \n",
"19000 Automatic 4x4 04-May Left wheel White 0 \n",
"19056 Automatic Front 04-May Left wheel Grey 4 \n",
"19089 Automatic Front 04-May Left wheel Silver 4 \n",
"19136 Automatic Front 04-May Left wheel Black 4 \n",
"19175 Automatic Front 04-May Left wheel White 4 \n",
"\n",
"[627 rows x 18 columns]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADhNklEQVR4nOzdeVxU5f4H8M/MwLA6o6AIboBY6YRKmhopagqhkm2WWbnmnlrmvWW2uLWQVldb1exXdjVvZmVmKoaaO6VXRCXUzEBTQRQUlMWBmfP7gzsTw8ycOTADs/B5v16+7mXOM895zuFA8+X5Pt9HJgiCACIiIiIiImpwcmcPgIiIiIiIqLFiQEZEREREROQkDMiIiIiIiIichAEZERERERGRkzAgIyIiIiIichIGZERERERERE7CgIyIiIiIiMhJGJARERERERE5CQMyIiIiIiIiJ2FARkT1LicnBzKZDKtWrXL2UEykpKQgJiYGvr6+kMlkuHbtWr2da+zYsYiIiKi3/hsjZz5Xq1atgkwmQ05OToOf2108/fTTSEhIqPP7Dx06hLvvvhsBAQGQyWTIyMhw3OA8REREBMaOHWv8eteuXZDJZNi1a5fxtcb4u6d///7o37+/8Wtn/q4YMWIEhg8f3uDnJffCgIzIDsePH8cjjzyC8PBw+Pr6onXr1khISMAHH3xQb+dcu3Ytli5davb6xYsXMX/+/Ab90GL4j7/hn7e3N9q3b4/Ro0fjzz//dMg5Dhw4gPnz5zs8WCooKMDw4cPh5+eHjz76CKtXr0ZAQIDFtoYP34Z/vr6+uPXWWzF9+nRcunTJoeNyJ/Pnzze5L/7+/tBoNHjllVdQXFzs7OE5xJtvvonvv//e2cMAAKSlpUEul2POnDkWjy9atAgymQybN29u4JGZy87OxqeffoqXXnrJ4vETJ04Yf5Ys/WxXVFTg0UcfRWFhIZYsWYLVq1cjPDwcH3/8cYN/qI6IiMB9991n8Zjhd+A333zToGNyJ4IgYPXq1ejbty+aNm0Kf39/dO7cGQsXLkRJSUmd+83KysL8+fNd/o8is2fPxrfffoujR486eyjkygQiqpP9+/cLSqVS6NChg/Daa68JK1euFObOnSvce++9QlRUVL2dNykpSQgPDzd7/dChQwIA4fPPP6+3c9f0888/CwCEZ555Rli9erXw2WefCdOnTxeUSqUQFBQkXLhwQRAEQcjOzq7z2N5++20BgJCdne3QsW/dulUAIKSmptps+/nnnwsAhIULFwqrV68WVq5cKYwZM0aQy+VCZGSkUFJSYrMPrVYrlJeXO2LoLmPevHkCAGHZsmXC6tWrhWXLlgkPPfSQAECIjY0V9Hp9vZ7fnudKqoCAAGHMmDFmr1dWVgplZWX1fo01TZkyRfD29hYyMzNNXs/JyRH8/f2FRx99tEHHY82zzz4r3HrrrVaPv/TSS0JoaKjg4+MjrFy50uz4iRMnBABmx26//XahX79+jh6uqPDwcCEpKcniMcPvwPXr1zfomAzCw8NNnk/DeH7++Wfja8783VNZWSkMHz5cACDExcUJS5YsEVasWCGMHDlSkMvlQnR0tJCXl1envtevX292rQb9+vUzeU70er1QVlYmVFZW1vFK7NOzZ09h1KhRTjk3uQcvZwSBRJ7gjTfegFqtxqFDh9C0aVOTY/n5+c4ZVD0oKSmxOnNkEBcXh0ceeQQAMG7cONx666145pln8MUXX1j9a76zGb5HNb93YgYPHow777wTADBhwgQEBwfjX//6FzZu3IjHH3/c4nsM98/b29vuMbuqRx55BM2bNwcATJkyBcOGDcN3332HX375BbGxsRbfU1paCn9//4YcpkMpFAooFIoGP+9bb72FjRs3YvLkydi7dy9kMhkAYMaMGfD29sZ7773XIOMQ+/5VVFTgyy+/xJQpUyweFwQBa9euxRNPPIHs7Gx8+eWXmDBhgkmbuvx81lVlZSX0ej2USm
W9n8sZnPm7Z/Hixfj666/xz3/+E2+//bbx9UmTJmH48OF48MEHMXbsWGzdurVex2GYjXUUKf9drG748OGYN28ePv74YwQGBjpsHOQ5mLJIVEdnzpzB7bffbvEDQ0hIiNlra9asQc+ePeHv749mzZqhb9+++Omnn4zHN27ciKSkJLRq1Qo+Pj6IiorCa6+9Bp1OZ2zTv39/bN68GWfPnjWmiUVERGDXrl3o0aMHgKqAyHCsemrPr7/+ikGDBkGtVsPf3x/9+vXD/v37TcZoSEHLysrCE088gWbNmqFPnz61vjcDBgwAUJW2JGbnzp2Ii4tDQEAAmjZtigceeAAnTpwwGc/zzz8PAIiMjDRel60UlfXr16N79+7w8/ND8+bNMXLkSFy4cMF4vH///hgzZgwAoEePHpDJZCbrMOp6nWPHjkVgYCDOnDmDIUOGoEmTJnjyySeNx2qu49Dr9XjvvffQuXNn+Pr6okWLFhg0aBD++9//mrRbs2aN8XqCgoIwYsQI/PXXX6Jj++abbyCTybB7926zYytWrIBMJkNmZiYAIC8vD+PGjUObNm3g4+ODsLAwPPDAA3VOBap5X/r374/o6GgcPnwYffv2hb+/vzGVLT8/H+PHj0fLli3h6+uLrl274osvvjDr89q1axg7dizUajWaNm2KMWPGWEx1q7l2xKAu918mk6GkpARffPGF8dkzPCfW1pB9/PHHuP322+Hj44NWrVph2rRpZuM03I+srCzcc8898Pf3R+vWrbF48WIbdxZQq9V47733sH//fnz66acAgA0bNmDTpk146623EBYWBr1ej6VLl+L222+Hr68vWrZsicmTJ+Pq1asmfUn5nVN9vJa+f5bs27cPV65cQXx8vMXj+/fvR05ODkaMGIERI0Zgz549OH/+vPH42LFj0a9fPwDAo48+CplMhv79+yMiIgK//fYbdu/ebfx+VP9eX7t2DTNnzkTbtm3h4+ODDh06YNGiRdDr9cY2hrVE77zzDpYuXYqoqCj4+PggKyvL5r2X6uzZs3j66adx2223wc/PD8HBwXj00UfNnhXDM7R//37MmjULLVq0QEBAAB566CFcvnzZpK0gCHj99dfRpk0b+Pv745577sFvv/0maTw1n/3q9+CTTz4x3oMePXrg0KFDZu9fv349NBoNfH19ER0djQ0bNkhal1ZWVoa3334bt956K5KTk82ODx06FGPGjEFKSgp++eUX4+symQzz5883a199vdyqVavw6KOPAgDuuece4/NQfe1cddbWkJ08eRKPPPIIgoKC4OvrizvvvBM//PCDSRvD92n37t14+umnERISgjZt2gAArl+/jpkzZyIiIgI+Pj4ICQlBQkIC0tPTTfpISEhASUkJUlNTxW4ZNWKcISOqo/DwcKSlpSEzMxPR0dGibRcsWID58+fj7rvvxsKFC6FUKvHrr79i586duPfeewFU/dIPDAzErFmzEBgYiJ07d2Lu3LkoLi42/mXx5ZdfRlFREc6fP48lS5YAAAIDA9GpUycsXLgQc+fOxaRJkxAXFwcAuPvuuwFUBT6DBw9G9+7dMW/ePMjlcnz++ecYMGAA9u7di549e5qM99FHH8Utt9yCN998E4Ig1PrenDlzBgAQHBxstc327dsxePBgtG/fHvPnz0dZWRk++OAD9O7dG+np6YiIiMDDDz+M33//Hf/5z3+wZMkS4yxMixYtrPa7atUqjBs3Dj169EBycjIuXbpk/AB75MgRNG3aFC+//DJuu+02fPLJJ1i4cCEiIyMRFRXlkOusrKxEYmIi+vTpg3feeUd0Fmj8+PFYtWoVBg8ejAkTJqCyshJ79+7FL7/8YpyJe+ONN/Dqq69i+PDhmDBhAi5fvowPPvgAffv2NV6PJUlJSQgMDMTXX39t/HBrsG7dOtx+++3G53bYsGH47bffMGPGDERERCA/Px+pqak4d+5cnYoBWLovBQUFGDx4MEaMGIGRI0eiZcuWKCsrQ//+/f
HHH39g+vTpiIyMxPr16zF27Fhcu3YNzz77LICqD6MPPPAA9u3bhylTpqBTp07YsGGDMaiuK1v3f/Xq1ZgwYQJ69uy
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Преобразуем год производства в целочисленный тип\n",
"df['Prod. year'] = df['Prod. year'].astype(int)\n",
"\n",
"# Статистический анализ для определения выбросов\n",
"Q1 = df['Price'].quantile(0.25)\n",
"Q3 = df['Price'].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"# Определение порога для выбросов\n",
"threshold = 1.5 * IQR\n",
"outliers = (df['Price'] < (Q1 - threshold)) | (df['Price'] > (Q3 + threshold))\n",
"\n",
"# Вывод выбросов\n",
"print(\"Выбросы:\")\n",
"print(df[outliers])\n",
"\n",
"# Обработка выбросов\n",
"# В данном случае мы заменим выбросы на медианное значение\n",
"median_price = df['Price'].median()\n",
"df.loc[outliers, 'Price'] = median_price\n",
"\n",
"# Визуализация данных после обработки\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df['Prod. year'], df['Price'])\n",
"plt.xlabel('Production Year')\n",
"plt.ylabel('Price')\n",
"plt.title('Scatter Plot of Price vs Production Year (After Handling Outliers)')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Смотрим, есть ли пропущенные значения. Пропущенных данных не обнаружено."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID 0\n",
"Price 0\n",
"Levy 0\n",
"Manufacturer 0\n",
"Model 0\n",
"Prod. year 0\n",
"Category 0\n",
"Leather interior 0\n",
"Fuel type 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear box type 0\n",
"Drive wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"dtype: int64\n",
"\n"
]
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь создадим выборки.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 11542\n",
"Размер контрольной выборки: 3847\n",
"Размер тестовой выборки: 3848\n"
]
}
],
"source": [
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
"\n",
"# Разделение данных на обучающую и временную выборки\n",
"train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n",
"\n",
"# Разделение остатка на контрольную и тестовую выборки\n",
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
"\n",
"# Проверка размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Сохранение выборок в файлы\n",
"train_df.to_csv(\"..//static//csv//train_data.csv\", index=False)\n",
"val_df.to_csv(\"..//static//csv//val_data.csv\", index=False)\n",
"test_df.to_csv(\"..//static//csv//test_data.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проанализируем сбалансированность выборки."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Category в обучающей выборке:\n",
"Category\n",
"Sedan 5289\n",
"Jeep 3246\n",
"Hatchback 1684\n",
"Minivan 396\n",
"Coupe 318\n",
"Universal 216\n",
"Microbus 184\n",
"Goods wagon 151\n",
"Pickup 31\n",
"Cabriolet 20\n",
"Limousine 7\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 45.82%\n",
"Процент автомобилей категории 'Джип': 28.12%\n",
"\n",
"Распределение Category в контрольной выборке:\n",
"Category\n",
"Sedan 1697\n",
"Jeep 1109\n",
"Hatchback 608\n",
"Minivan 129\n",
"Coupe 105\n",
"Universal 73\n",
"Microbus 57\n",
"Goods wagon 42\n",
"Pickup 17\n",
"Cabriolet 9\n",
"Limousine 1\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 44.11%\n",
"Процент автомобилей категории 'Джип': 28.83%\n",
"\n",
"Распределение Category в тестовой выборке:\n",
"Category\n",
"Sedan 1750\n",
"Jeep 1118\n",
"Hatchback 555\n",
"Minivan 122\n",
"Coupe 109\n",
"Universal 75\n",
"Microbus 65\n",
"Goods wagon 40\n",
"Cabriolet 7\n",
"Pickup 4\n",
"Limousine 3\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 45.48%\n",
"Процент автомобилей категории 'Джип': 29.05%\n",
"\n",
"Необходима аугментация данных для балансировки классов.\n",
"Необходима аугментация данных для балансировки классов.\n",
"Необходима аугментация данных для балансировки классов.\n"
]
}
],
"source": [
"train_df = pd.read_csv(\"..//static//csv//train_data.csv\")\n",
"val_df = pd.read_csv(\"..//static//csv//val_data.csv\")\n",
"test_df = pd.read_csv(\"..//static//csv//test_data.csv\")\n",
"\n",
"# Оценка сбалансированности\n",
"def check_balance(df, name):\n",
"    \"\"\"Print the class distribution of the 'Category' column for one split.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'Category' column.\n",
"        name: Split name interpolated into the printed header.\n",
"\n",
"    A category that is absent from the split is reported as 0.00% instead of\n",
"    raising KeyError, and an empty frame does not divide by zero.\n",
"    \"\"\"\n",
"    counts = df['Category'].value_counts()\n",
"    total = len(df)\n",
"    print(f\"Распределение Category в {name}:\")\n",
"    print(counts)\n",
"    # Pairs of (label shown in the report, value stored in 'Category').\n",
"    for label, category in (('Седан', 'Sedan'), ('Джип', 'Jeep')):\n",
"        share = counts.get(category, 0) / total * 100 if total else 0.0\n",
"        print(f\"Процент автомобилей категории '{label}': {share:.2f}%\")\n",
"    print()\n",
"\n",
"# Определение необходимости аугментации данных\n",
"def need_augmentation(df, max_ratio=1.5):\n",
"    \"\"\"Report whether the 'Category' classes are imbalanced enough to resample.\n",
"\n",
"    The largest class is compared with the smallest one, so every category is\n",
"    taken into account. Comparing only 'Sedan' with 'Jeep' ignores the rare\n",
"    classes and raises KeyError when either of the two is absent.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'Category' column.\n",
"        max_ratio: Largest tolerated ratio between the biggest and the\n",
"            smallest class size.\n",
"    \"\"\"\n",
"    counts = df['Category'].value_counts()\n",
"    # With fewer than two observed classes there is nothing to balance against.\n",
"    if len(counts) >= 2 and counts.max() / counts.min() > max_ratio:\n",
"        print(\"Необходима аугментация данных для балансировки классов.\")\n",
"    else:\n",
"        print(\"Аугментация данных не требуется.\")\n",
" \n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")\n",
"\n",
"need_augmentation(train_df)\n",
"need_augmentation(val_df)\n",
"need_augmentation(test_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"По результатам анализа требуется приращение данных (аугментация) для балансировки классов."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оверсэмплинг:\n",
"Распределение Category в обучающей выборке:\n",
"Category\n",
"Jeep 5289\n",
"Hatchback 5289\n",
"Sedan 5289\n",
"Goods wagon 5289\n",
"Cabriolet 5289\n",
"Universal 5289\n",
"Minivan 5289\n",
"Microbus 5289\n",
"Coupe 5289\n",
"Pickup 5289\n",
"Limousine 5289\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 9.09%\n",
"Процент автомобилей категории 'Джип': 9.09%\n",
"\n",
"Распределение Category в контрольной выборке:\n",
"Category\n",
"Jeep 1697\n",
"Sedan 1697\n",
"Minivan 1697\n",
"Coupe 1697\n",
"Hatchback 1697\n",
"Goods wagon 1697\n",
"Universal 1697\n",
"Microbus 1697\n",
"Pickup 1697\n",
"Cabriolet 1697\n",
"Limousine 1697\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 9.09%\n",
"Процент автомобилей категории 'Джип': 9.09%\n",
"\n",
"Распределение Category в тестовой выборке:\n",
"Category\n",
"Jeep 1750\n",
"Hatchback 1750\n",
"Sedan 1750\n",
"Coupe 1750\n",
"Minivan 1750\n",
"Goods wagon 1750\n",
"Microbus 1750\n",
"Universal 1750\n",
"Cabriolet 1750\n",
"Pickup 1750\n",
"Limousine 1750\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 9.09%\n",
"Процент автомобилей категории 'Джип': 9.09%\n",
"\n",
"Андерсэмплинг:\n",
"Распределение Category в обучающей выборке:\n",
"Category\n",
"Cabriolet 7\n",
"Coupe 7\n",
"Goods wagon 7\n",
"Hatchback 7\n",
"Jeep 7\n",
"Limousine 7\n",
"Microbus 7\n",
"Minivan 7\n",
"Pickup 7\n",
"Sedan 7\n",
"Universal 7\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 9.09%\n",
"Процент автомобилей категории 'Джип': 9.09%\n",
"\n",
"Распределение Category в контрольной выборке:\n",
"Category\n",
"Cabriolet 1\n",
"Coupe 1\n",
"Goods wagon 1\n",
"Hatchback 1\n",
"Jeep 1\n",
"Limousine 1\n",
"Microbus 1\n",
"Minivan 1\n",
"Pickup 1\n",
"Sedan 1\n",
"Universal 1\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 9.09%\n",
"Процент автомобилей категории 'Джип': 9.09%\n",
"\n",
"Распределение Category в тестовой выборке:\n",
"Category\n",
"Cabriolet 3\n",
"Coupe 3\n",
"Goods wagon 3\n",
"Hatchback 3\n",
"Jeep 3\n",
"Limousine 3\n",
"Microbus 3\n",
"Minivan 3\n",
"Pickup 3\n",
"Sedan 3\n",
"Universal 3\n",
"Name: count, dtype: int64\n",
"Процент автомобилей категории 'Седан': 9.09%\n",
"Процент автомобилей категории 'Джип': 9.09%\n",
"\n"
]
}
],
"source": [
"\n",
"# Загрузка данных\n",
"train_df = pd.read_csv(\"..//static//csv//train_data.csv\")\n",
"val_df = pd.read_csv(\"..//static//csv//val_data.csv\")\n",
"test_df = pd.read_csv(\"..//static//csv//test_data.csv\")\n",
"\n",
"# Encode categorical features as integers\n",
"def encode(df, label_encoders=None):\n",
"    \"\"\"Label-encode the object columns of ``df`` (except 'Category') in place.\n",
"\n",
"    Args:\n",
"        df: DataFrame to encode; its columns are overwritten.\n",
"        label_encoders: Optional ``{column: fitted LabelEncoder}`` mapping.\n",
"            When given, exactly these columns are transformed with the\n",
"            supplied encoders, so several frames share one code table.\n",
"            When omitted, a new encoder is fitted per object column of ``df``.\n",
"\n",
"    Returns:\n",
"        The ``{column: LabelEncoder}`` mapping that was applied.\n",
"    \"\"\"\n",
"    if label_encoders is None:\n",
"        # astype(str) keeps the values uniformly typed even if pandas inferred\n",
"        # a numeric dtype for the same column in one of the splits.\n",
"        label_encoders = {\n",
"            column: LabelEncoder().fit(df[column].astype(str))\n",
"            for column in df.select_dtypes(include=['object']).columns\n",
"            if column != 'Category'  # the target is encoded separately\n",
"        }\n",
"    for column, le in label_encoders.items():\n",
"        df[column] = le.transform(df[column].astype(str))\n",
"    return label_encoders\n",
"\n",
"# Encode the target variable as integers\n",
"def encode_target(df, le=None):\n",
"    \"\"\"Label-encode the 'Category' column of ``df`` in place.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'Category' column; the column is overwritten.\n",
"        le: Optional fitted LabelEncoder to reuse; a new one is fitted on\n",
"            ``df['Category']`` when omitted.\n",
"\n",
"    Returns:\n",
"        The LabelEncoder that was applied.\n",
"    \"\"\"\n",
"    if le is None:\n",
"        le = LabelEncoder().fit(df['Category'])\n",
"    df['Category'] = le.transform(df['Category'])\n",
"    return le\n",
"\n",
"# Fit every encoder once, on the union of the three splits. Encoders fitted on\n",
"# each split separately can assign different integers to the same label, so the\n",
"# encoded frames would not be comparable and `le_target.inverse_transform`\n",
"# could map validation/test codes back to the wrong category names.\n",
"all_splits = pd.concat([train_df, val_df, test_df], ignore_index=True)\n",
"label_encoders = encode(all_splits)\n",
"le_target = encode_target(all_splits)\n",
"\n",
"# Apply the shared encoders to each split\n",
"for split_df in (train_df, val_df, test_df):\n",
"    encode(split_df, label_encoders)\n",
"    encode_target(split_df, le_target)\n",
"\n",
"# Проверка типов данных\n",
"def check_data_types(df):\n",
" for column in df.columns:\n",
" if df[column].dtype == 'object':\n",
" print(f\"Столбец '{column}' содержит строковые данные.\")\n",
"\n",
"check_data_types(train_df)\n",
"check_data_types(val_df)\n",
"check_data_types(test_df)\n",
"\n",
"# Shared implementation of both resampling strategies\n",
"def _resample(df, sampler):\n",
"    \"\"\"Balance the 'Category' classes of ``df`` with the given sampler.\n",
"\n",
"    Args:\n",
"        df: DataFrame holding the features and a 'Category' target column.\n",
"        sampler: Object exposing ``fit_resample(X, y)``.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the resampled features followed by 'Category',\n",
"        or ``df`` itself when the 'Category' column is missing.\n",
"    \"\"\"\n",
"    if 'Category' not in df.columns:\n",
"        print(\"Столбец 'Category' отсутствует.\")\n",
"        return df\n",
"\n",
"    features = df.drop('Category', axis=1)\n",
"    target = df['Category']\n",
"    features_resampled, target_resampled = sampler.fit_resample(features, target)\n",
"    # The target is re-attached as the last column.\n",
"    return pd.concat([features_resampled, target_resampled], axis=1)\n",
"\n",
"# Oversampling\n",
"def oversample(df):\n",
"    \"\"\"Return ``df`` resampled with ``RandomOverSampler(random_state=42)``.\n",
"\n",
"    NOTE(review): resampling is normally applied to the training split only;\n",
"    confirm that balancing the validation/test splits is intended.\n",
"    \"\"\"\n",
"    return _resample(df, RandomOverSampler(random_state=42))\n",
"\n",
"# Undersampling\n",
"def undersample(df):\n",
"    \"\"\"Return ``df`` resampled with ``RandomUnderSampler(random_state=42)``.\"\"\"\n",
"    return _resample(df, RandomUnderSampler(random_state=42))\n",
"\n",
"# Применение oversampling и undersampling к каждой выборке\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"train_df_undersampled = undersample(train_df)\n",
"val_df_undersampled = undersample(val_df)\n",
"test_df_undersampled = undersample(test_df)\n",
"\n",
"# Обратное преобразование целевой переменной в строковые метки\n",
"def decode_target(df, le_target):\n",
" df['Category'] = le_target.inverse_transform(df['Category'])\n",
"\n",
"decode_target(train_df_oversampled, le_target)\n",
"decode_target(val_df_oversampled, le_target)\n",
"decode_target(test_df_oversampled, le_target)\n",
"\n",
"decode_target(train_df_undersampled, le_target)\n",
"decode_target(val_df_undersampled, le_target)\n",
"decode_target(test_df_undersampled, le_target)\n",
"\n",
"# Проверка результатов\n",
"def check_balance(df, name):\n",
" if 'Category' not in df.columns:\n",
" print(f\"Столбец 'Category' отсутствует в {name}.\")\n",
" return\n",
" \n",
" counts = df['Category'].value_counts()\n",
" print(f\"Распределение Category в {name}:\")\n",
" print(counts)\n",
" \n",
" if 'Sedan' in counts and 'Jeep' in counts:\n",
" print(f\"Процент автомобилей категории 'Седан': {counts['Sedan'] / len(df) * 100:.2f}%\")\n",
" print(f\"Процент автомобилей категории 'Джип': {counts['Jeep'] / len(df) * 100:.2f}%\")\n",
" else:\n",
" print(\"Отсутствуют одна или обе категории (Седан/Внедорожник).\")\n",
" print()\n",
"\n",
"# Проверка сбалансированности после oversampling\n",
"print(\"Оверсэмплинг:\")\n",
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
"\n",
"# Проверка сбалансированности после undersampling\n",
"print(\"Андерсэмплинг:\")\n",
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
"check_balance(test_df_undersampled, \"тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Цены на кофе"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\"..//static//csv//Starbucks Dataset.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проблемная область: данные о ценах на акции компании Starbucks Corporation.\n",
"\n",
"Объект наблюдения: цены на акции Starbucks.\n",
"\n",
"Атрибуты: дата, цена открытия, самая высокая цена дня, самая низкая цена дня, цена закрытия, скорректированная цена закрытия и объем торгов.\n",
"\n",
"Пример бизнес-цели: \n",
"1. Анализ данных: Изучение и очистка данных для выявления закономерностей и корреляций между объёмом торгов и ценой акций Starbucks Corporation.\n",
"2. Разработка модели: Создание и обучение модели машинного обучения, которая будет прогнозировать цены на акции Starbucks Corporation.\n",
"3. Внедрение: Интеграция модели в систему ценообразования компании для автоматического расчета цен на акции.\n",
"\n",
"\n",
"Актуальность: эти данные бесценны для проведения исторического анализа, прогнозирования будущей динамики акций и понимания рыночных тенденций, связанных с акциями Starbucks."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0EAAAIjCAYAAADFthA8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABniUlEQVR4nO3deXhTVf7H8U/SnS4pZWvZC4JYAUE2kU0FRhBR3ECEn4g4jgwu6Liho4A4IjqOu7iDCghuiBsoO8OOFEQEEbAsQstOWwpdaO7vj04iadM2adOk7X2/nqfPAzc3uSe5bXI/Oed8j8UwDEMAAAAAYBLWQDcAAAAAAPyJEAQAAADAVAhBAAAAAEyFEAQAAADAVAhBAAAAAEyFEAQAAADAVAhBAAAAAEyFEAQAAADAVAhBAAAAAEyFEAQAVcyePXtksVg0ffr0QDfFxYIFC9SuXTuFh4fLYrHo5MmTFXas6dOny2KxaM+ePc5tl112mS677LIKPw7KZtmyZbJYLFq2bFmgmwIAhCAAlcfPP/+sG2+8UU2aNFF4eLgaNGigvn376tVXX62wY86aNUsvvfRSke0HDx7UhAkTtHnz5go7dmGOi0THT0hIiJo1a6Zbb71Vv//+u0+OsXr1ak2YMMHnAeXYsWMaPHiwIiIi9Prrr+ujjz5SZGRkqfd74403ZLFY1KVLF5+2pzT5+fmaNm2aLrvsMsXFxSksLExNmzbVyJEj9eOPP/q1LZVR27Zt1bhxYxmGUew+3bp1U7169XT27Fk/tgwAfIMQBKBSWL16tTp27KiffvpJf/3rX/Xaa6/pjjvukNVq1csvv1xhxy0pBE2cONGvIcjh3nvv1UcffaS3335bAwYM0Jw5c9SpUycdPHiw3I+9evVqTZw40echaMOGDcrMzNSkSZM0atQoDR8+XCEhIaXeb+bMmWratKnWr1+vXbt2lasNP/zwg3744YdS9ztz5oyuvvpq3X777TIMQ4899pimTp2qW2+9VWvWrFHnzp31xx9/lKstVd2wYcO0f/9+/fe//3V7+549e7RmzRoNGTJEwcHBfm4dAJQf71wAKoV//etfstls2rBhg2JjY11uO3z4cGAaVQGysrJK7SHp0aOHbrzxRknSyJEj1bJlS91777364IMPNG7cOH8002uOc1T43JUkJSVFq1ev1hdffKG//e1vmjlzpsaPH1/mNoSGhnq030MPPaQFCxboxRdf1NixY11uGz9+vF588cUyt6G6uOWWWzRu3DjNmjVLPXv2LHL7xx9/LMMwNGzYsAC0DgDKj54gAJXC7t27deGFF7q9iK5bt26RbTNmzFDnzp1Vo0YN1axZUz179nTpBZg3b54GDBig+vXrKywsTM2bN9ekSZOUn5/v3Oeyyy7Tt99+q7179zqHoDVt2lTLli1Tp06dJBWEEMdt587BWbdunfr16yebzaYaNWqoV69eWrVqlUsbJ0yYIIvFom3btumWW25RzZo11b17d69fmyuuuEJSQWgoyZIlS9SjRw9FRkYqNjZW1157rbZv3+7SnoceekiSlJiY6Hxepc13+fTTT9WhQwdFRESodu3aGj58uA4cOOC8/bLLLtOIESMkSZ06dZLFYtFtt91W6vOaOXOmatasqQEDBujGG2/UzJkz3e73yy+/6IorrlBERIQaNmyop59+Wna7vch+nswJ+uOPP/TWW2+pb9++RQKQJAUFBenBBx9Uw4YNS3ycN954QxdeeKHCwsJUv359jRkzpkjv2s6dO3XDDTcoPj5e4eHhatiwoW6++Walp6e77Ddjxgzn6xsXF6ebb75Z+/fvL/H4n332mSwWi5YvX17ktrfeeksWi0Vbt26VJKWlpWnkyJFq2LChwsLClJCQoGuvvbbE896oUSP17NlTn332mfLy8orcPmvWLDVv3tw5jHHTpk3q37+/YmJiFBUVpd69e2vt2rUlPgdJatq0qdvflcLn0jFU9JNPPtHEiR
PVoEEDRUdH68Ybb1R6erpycnI0duxY1a1bV1FRURo5cqRycnKKPG5ZXmsA1RM9QQAqhSZNmmjNmjXaunWrWrduXeK+EydO1IQJE3TppZfqqaeeUmhoqNatW6clS5boL3/5i6SCCe1RUVF64IEHFBUVpSVLlujJJ59URkaGnn/+eUnS448/rvT0dP3xxx/Ob/+joqJ0wQUX6KmnntKTTz6pO++8Uz169JAkXXrppZIKwkb//v3VoUMHjR8/XlarVdOmTdMVV1yh//73v+rcubNLe2+66Sa1aNFCzzzzTIlzLIqze/duSVKtWrWK3WfRokXq37+/mjVrpgkTJujMmTN69dVX1a1bNyUnJ6tp06a6/vrr9dtvv+njjz/Wiy++qNq1a0uS6tSpU+zjTp8+XSNHjlSnTp00efJkHTp0SC+//LJWrVqlTZs2KTY2Vo8//rjOP/98vf3223rqqaeUmJio5s2bl/q8Zs6cqeuvv16hoaEaOnSopk6dqg0bNjgDqFRwAX/55Zfr7NmzevTRRxUZGam3335bERERnr58LubPn6+zZ8/q//7v/8p0f6kgTE6cOFF9+vTR6NGjtWPHDmfbV61apZCQEOXm5urKK69UTk6O7rnnHsXHx+vAgQP65ptvdPLkSdlsNkkFPaBPPPGEBg8erDvuuENHjhzRq6++qp49ezpfX3cGDBigqKgoffLJJ+rVq5fLbXPmzNGFF17o/Du64YYb9Msvv+iee+5R06ZNdfjwYS1cuFD79u1T06ZNi32ew4YN05133qnvv/9eV199tXP7zz//rK1bt+rJJ5+UVBBSe/TooZiYGD388MMKCQnRW2+9pcsuu0zLly/36XyvyZMnKyIiQo8++qh27dqlV199VSEhIbJarTpx4oQmTJigtWvXavr06UpMTHS2USr7aw2gmjIAoBL44YcfjKCgICMoKMjo2rWr8fDDDxvff/+9kZub67Lfzp07DavValx33XVGfn6+y212u93579OnTxc5xt/+9jejRo0aRnZ2tnPbgAEDjCZNmhTZd8OGDYYkY9q0aUWO0aJFC+PKK68scrzExESjb9++zm3jx483JBlDhw716DVYunSpIcl4//33jSNHjhgHDx40vv32W6Np06aGxWIxNmzYYBiGYaSkpBRpW7t27Yy6desax44dc2776aefDKvVatx6663Obc8//7whyUhJSSm1Pbm5uUbdunWN1q1bG2fOnHFu/+abbwxJxpNPPuncNm3aNEOSs42l+fHHHw1JxsKFCw3DKHhdGzZsaNx3330u+40dO9aQZKxbt8657fDhw4bNZivyPHr16mX06tWrxOPef//9hiRj06ZNHrXT8bwcxzl8+LARGhpq/OUvf3H5/Xvttdec584wDGPTpk2GJOPTTz8t9rH37NljBAUFGf/6179ctv/8889GcHBwke2FDR061Khbt65x9uxZ57bU1FTDarUaTz31lGEYhnHixAlDkvH888979HzPdfz4cSMsLKzI7++jjz5qSDJ27NhhGIZhDBo0yAgNDTV2797t3OfgwYNGdHS00bNnT+c2x+/30qVLnduaNGlijBgxosixC59Lx31bt27t8p4wdOhQw2KxGP3793e5f9euXV3+rsv7WgOofhgOB6BS6Nu3r9asWaNrrrlGP/30k5577jldeeWVatCggb766ivnfl9++aXsdruefPJJWa2ub2EWi8X573N7CjIzM3X06FH16NFDp0+f1q+//lrmdm7evFk7d+7ULbfcomPHjuno0aM6evSosrKy1Lt3b61YsaLIUK277rrLq2PcfvvtqlOnjurXr68BAwYoKytLH3zwgTp27Oh2/9TUVG3evFm33Xab4uLinNvbtm2rvn376rvvvvP+iUr68ccfdfjwYf39739XeHi4c/uAAQPUqlUrffvtt2V6XKmgF6hevXq6/PLLJRWcuyFDhmj27NkuQxa/++47XXLJJS69a3Xq1CnzXJSMjAxJUnR0dJ
nuv2jRIuXm5mrs2LEuv39//etfFRMT43xNHD0933//vU6fPu32sb744gvZ7XYNHjzY+Xt09OhRxcfHq0WLFlq6dGm
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//Starbucks Dataset.csv\")\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df['Adj Close'], df['Volume'])\n",
"plt.xlabel('Adj Close')\n",
"plt.ylabel('Volume')\n",
"plt.title('Scatter Plot of Adj Close vs Volume')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выброс присутствует. Сделаем очистку данных.\n",
"\n",
"Для удаления выбросов из датасета можно использовать метод межквартильного размаха. Зашумленность не очень высокая. Покрытие данных высокое и подошло бы для поставленной задачи по актуальности."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd3gU5fbHv7thU0klQkIRIkVK6IJgkCYoRRHLVVGvil5s4FXsqHhBVOR6/aFXVBQFCwKKiiglSgyKQBAUIoRwFWICCgklCQmkk53fH2GWbTPzTp/dPZ/n8XlkMzvz7pR3znnPOd9j4ziOA0EQBEEQBEEQBCGI3ewBEARBEARBEARBWB1ynAiCIAiCIAiCICQgx4kgCIIgCIIgCEICcpwIgiAIgiAIgiAkIMeJIAiCIAiCIAhCAnKcCIIgCIIgCIIgJCDHiSAIgiAIgiAIQgJynAiCIAiCIAiCICQgx4kgCIIgCIIgCEICcpwIgiAUUlRUBJvNhvfff9/soXiQmZmJPn36IDIyEjabDSdPntTtWO+//z5sNhuKiopcnw0fPhzDhw/X/TiEMr7//nvYbDZ8//33Zg9FMf/+97/RtWtXOJ1ORd+fNWsWbDabxqMyjw4dOuCOO+5w/TszMxPNmzfH8ePHzRsUQQQh5DgRBOHDnj17cP3116N9+/aIjIxEmzZtMHr0aLz++uu6HXPZsmV49dVXfT4/cuQIZs2ahdzcXN2O7Q1vWPL/ORwOXHDBBbjtttvwxx9/aHKMrVu3YtasWZo7NaWlpbjhhhsQFRWFN954Ax999BFiYmIkv/fmm2/CZrPh4osv1nQ8UjQ2NmLJkiUYPnw4kpKSEBERgQ4dOmDy5Mn4+eefDR2LFenVqxfOP/98cBwnuE1GRgZatWqFM2fOGDgy86isrMS8efPwxBNPwG73NGNqa2sxf/58XHzxxYiPj0dkZCS6dOmCadOm4ffffzdpxMYzZswYdOrUCXPnzjV7KAQRVJDjRBCEB1u3bsVFF12EX3/9FVOmTMGCBQvwj3/8A3a7Ha+99ppuxxVznGbPnm2o48Tzz3/+Ex999BHeeecdjB8/Hp988gkGDBiAI0eOqN731q1bMXv2bM0dpx07duDUqVOYM2cO7rrrLtx6661wOByS3/v444/RoUMHbN++HQcOHFA1hm+//Rbffvut5HY1NTW48sorceedd4LjODz11FN46623cNtttyEnJwcDBw7EX3/9pWosgc4tt9yCP//8Ez/++KPfvxcVFSEnJwc33ngjmjVrZvDozGHx4sU4c+YMJk2a5PH5iRMnMGTIEDz88MNo2bIlnnvuObzxxhuYOHEivvrqK6Snp5s0YnO455578Pbbb+PUqVNmD4UggobQmGUJgmDmhRdeQHx8PHbs2IGEhASPvx07dsycQelAVVWVZCTm0ksvxfXXXw8AmDx5Mrp06YJ//vOf+OCDDzBjxgwjhikb/hp5XzsxCgsLsXXrVnzxxRe455578PHHH+Nf//qX4jGEh4czbffYY48hMzMT8+fPx0MPPeTxt3/961+YP3++4jEECzfffDNmzJiBZcuWYejQoT5/X758OTiOwy233GLC6MxhyZIlmDBhAiIjIz0+v+OOO7Br1y589tlnuO666zz+NmfOHDz99NNGDtN0rrvuOjzwwANYuXIl7rzzTrOHQxBBAUWcCILwoKCgAD169PBreLds2dLns6VLl2LgwIGIjo5GYmIihg4d6hFtWL16NcaPH4/WrVsjIiICHTt2xJw5c9DY2OjaZvjw4Vi7di0OHjzoSo/r0KEDvv/+ewwYMABAk+PC/829puinn37CmDFjEB8fj+joaAwbNgxbtmzxGCNfz5Cfn4+bb74ZiYmJGDJkiOxzM3LkSABNjoYY2dnZuPTSSxETE4OEhARcffXV2Ldvn8d4HnvsMQBAWlqa63dJ1e+sXLkS/fv3R1RUFJKTk3Hrrbfi8OHDrr8PHz
4ct99+OwBgwIABsNlsHnUPQnz88cdITEzE+PHjcf311+Pjjz/2u93evXsxcuRIREVFoW3btnj++ef91piw1Dj99ddfePvttzF69GgfpwkAwsLC8Oijj6Jt27ai+3nzzTfRo0cPREREoHXr1pg6dapPFG///v247rrrkJKSgsjISLRt2xY33XQTKioqPLZbunSp6/wmJSXhpptuwp9//il6/M8++ww2mw0//PCDz9/efvtt2Gw25OXlAQBKSkowefJktG3bFhEREUhNTcXVV18tet3btWuHoUOH4rPPPkNDQ4PP35ctW4aOHTu6Uix37dqFsWPHIi4uDs2bN8dll12Gbdu2if4GwLdGhsf7WvJprJ9++ilmz56NNm3aIDY2Ftdffz0qKipQV1eHhx56CC1btkTz5s0xefJk1NXV+exXybkGmp693bt3Y9SoUR6f//TTT1i7di3uuusuH6cJACIiIvCf//xHcv8s4/rxxx/xt7/9Deeffz4iIiLQrl07TJ8+HTU1NR7b3XHHHWjevDkOHz6MiRMnonnz5jjvvPPw6KOPesx/AOB0OvHqq6+iR48eiIyMRKtWrXDPPfegvLzcYzuO4/D888+jbdu2iI6OxogRI7B3716/v6Vly5bo1asXVq9eLfm7CYJggyJOBEF40L59e+Tk5CAvL08ytWX27NmYNWsWLrnkEjz33HMIDw/HTz/9hOzsbFx++eUAmor6mzdvjocffhjNmzdHdnY2nn32WVRWVuLll18GADz99NOoqKjAX3/95YoyNG/eHN26dcNzzz2HZ599FnfffTcuvfRSAMAll1wCoMlBGTt2LPr3749//etfsNvtWLJkCUaOHIkff/wRAwcO9Bjv3/72N3Tu3BkvvviiaM2IEAUFBQCAFi1aCG6TlZWFsWPH4oILLsCsWbNQU1OD119/HRkZGdi5cyc6dOiAa6+9Fr///juWL1+O+fPnIzk5GQBw3nnnCe73/fffx+TJkzFgwADMnTsXR48exWuvvYYtW7Zg165dSEhIwNNPP40LL7wQ77zzDp577jmkpaWhY8eOkr/r448/xrXXXovw8HBMmjQJb731Fnbs2OFyWoEmo3/EiBE4c+YMnnzyScTExOCdd95BVFQU6+nzYP369Thz5gz+/ve/K/o+0OSAzp49G6NGjcJ9992H3377zTX2LVu2wOFwoL6+HldccQXq6urwwAMPICUlBYcPH8aaNWtw8uRJxMfHA2iKtM6cORM33HAD/vGPf+D48eN4/fXXMXToUNf59cf48ePRvHlzfPrppxg2bJjH3z755BP06NHD9Rxdd9112Lt3Lx544AF06NABx44dw4YNG3Do0CF06NBB8HfecsstuPvuu/HNN9/gyiuvdH2+Z88e5OXl4dlnnwXQ5NheeumliIuLw+OPPw6Hw4G3334bw4cPxw8//KBp/drcuXMRFRWFJ598EgcOHMDrr78Oh8MBu92O8vJyzJo1C9u2bcP777+PtLQ01xgB5ecaaEpxBYB+/fp5fP7VV18BgKr7iXVcK1euRHV1Ne677z60aNEC27dvx+uvv46//voLK1eu9NhnY2MjrrjiClx88cX4z3/+g6ysLLzyyivo2LEj7rvvPtd299xzj+sZ/+c//4nCwkIsWLAAu3btct3LAPDss8/i+eefx7hx4zBu3Djs3LkTl19+Oerr6/3+pv79++PLL79UfE4IgvCCIwiCcOPbb7/lwsLCuLCwMG7w4MHc448/zn3zzTdcfX29x3b79+/n7HY7d80113CNjY0ef3M6na7/r66u9jnGPffcw0VHR3O1tbWuz8aPH8+1b9/eZ9sdO3ZwALglS5b4HKNz587cFVdc4XO8tLQ0bvTo0a7P/vWvf3EAuEmTJjGdg40bN3IAuMWLF3PHjx/njhw5wq1du5br0KEDZ7PZuB07dnAcx3GFhYU+Y+vTpw/XsmVLrrS01PXZr7/+ytntdu62225zffbyyy9zALjCwkLJ8d
TX13MtW7bk0tPTuZqaGtfna9as4QBwzz77rOuzJUuWcABcY5Ti559/5gBwGzZs4Diu6by2bduWe/DBBz22e+ihhzg
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//Starbucks Dataset.csv\")\n",
"\n",
"# Remove outliers with the interquartile-range (IQR) rule\n",
"def remove_outliers_iqr(df, column, factor=1.5):\n",
"    \"\"\"Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.\n",
"\n",
"    Args:\n",
"        df: Source DataFrame; it is not modified.\n",
"        column: Name of the numeric column to filter on.\n",
"        factor: Fence width as a multiple of the IQR (1.5 by default).\n",
"\n",
"    Returns:\n",
"        The rows with ``Q1 - factor * IQR <= value <= Q3 + factor * IQR``,\n",
"        keeping their original index.\n",
"    \"\"\"\n",
"    q1, q3 = df[column].quantile(0.25), df[column].quantile(0.75)\n",
"    fence = factor * (q3 - q1)\n",
"    # `between` is inclusive on both ends, matching the >= / <= comparison.\n",
"    return df[df[column].between(q1 - fence, q3 + fence)]\n",
"\n",
"# Удаление выбросов для столбцов 'Adj Close' и 'Volume'\n",
"df_cleaned = remove_outliers_iqr(df, 'Adj Close')\n",
"df_cleaned = remove_outliers_iqr(df_cleaned, 'Volume')\n",
"\n",
"# Построение графика для очищенных данных\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df_cleaned['Adj Close'], df_cleaned['Volume'])\n",
"plt.xlabel('Adj Close')\n",
"plt.ylabel('Volume')\n",
"plt.title('Scatter Plot of Adj Close vs Volume (Cleaned)')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь посмотрим, есть ли пустые значения. Пустых значений не оказалось."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Date 0\n",
"Open 0\n",
"High 0\n",
"Low 0\n",
"Close 0\n",
"Adj Close 0\n",
"Volume 0\n",
"dtype: int64\n",
"\n",
"Date False\n",
"Open False\n",
"High False\n",
"Low False\n",
"Close False\n",
"Adj Close False\n",
"Volume False\n",
"dtype: bool\n",
"\n"
]
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь создадим выборки."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 4821\n",
"Размер контрольной выборки: 1607\n",
"Размер тестовой выборки: 1608\n"
]
}
],
"source": [
"\n",
"# Target is 'Volume'; the features are every other column\n",
"# NOTE(review): the split is taken from the raw `df`, not from the outlier-free `df_cleaned` - confirm this is intended\n",
"y = df['Volume']\n",
"X = df.drop(columns='Volume')\n",
"\n",
"# First cut: 60% for training, 40% held back for validation + test\n",
"X_train, X_temp, y_train, y_temp = train_test_split(\n",
"    X, y, test_size=0.4, random_state=42\n",
")\n",
"\n",
"# Second cut: the held-back part is halved into validation and test\n",
"X_val, X_test, y_val, y_test = train_test_split(\n",
"    X_temp, y_temp, test_size=0.5, random_state=42\n",
")\n",
"\n",
"# Report the number of rows in each subset\n",
"print(f\"Размер обучающей выборки: {X_train.shape[0]}\")\n",
"print(f\"Размер контрольной выборки: {X_val.shape[0]}\")\n",
"print(f\"Размер тестовой выборки: {X_test.shape[0]}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проанализируем сбалансированность выборки."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Price в обучающей выборке:\n",
"Volume\n",
"1847800 1\n",
"1875200 1\n",
"1910400 1\n",
"1949200 1\n",
"2019200 1\n",
" ..\n",
"143464800 1\n",
"152007800 1\n",
"230883200 1\n",
"295411200 1\n",
"585508800 1\n",
"Name: count, Length: 4697, dtype: int64\n",
"Процент положительных значений: 100.00%\n",
"Процент отрицательных значений: 0.00%\n",
"\n",
"Необходима аугментация данных для балансировки классов.\n",
"\n",
"Распределение Price в контрольной выборке:\n",
"Volume\n",
"2380800 1\n",
"2407200 1\n",
"2412800 1\n",
"2547200 1\n",
"2659200 1\n",
" ..\n",
"85356800 1\n",
"87072000 1\n",
"111773600 1\n",
"114960000 1\n",
"155107200 1\n",
"Name: count, Length: 1593, dtype: int64\n",
"Процент положительных значений: 100.00%\n",
"Процент отрицательных значений: 0.00%\n",
"\n",
"Необходима аугментация данных для балансировки классов.\n",
"\n",
"Распределение Price в тестовой выборке:\n",
"Volume\n",
"1504000 1\n",
"2011200 1\n",
"2073600 1\n",
"2169700 1\n",
"2432000 1\n",
" ..\n",
"67067400 1\n",
"75863200 1\n",
"81587200 1\n",
"131420600 1\n",
"224358400 1\n",
"Name: count, Length: 1593, dtype: int64\n",
"Процент положительных значений: 100.00%\n",
"Процент отрицательных значений: 0.00%\n",
"\n",
"Необходима аугментация данных для балансировки классов.\n",
"\n"
]
}
],
"source": [
"\n",
"# Print the value distribution of a target sample together with its sign shares\n",
"def analyze_distribution(data, title):\n",
"    \"\"\"Print value counts and the share of positive/negative values of `data`.\n",
"\n",
"    Args:\n",
"        data: Series-like target sample (must support `value_counts` and comparisons).\n",
"        title: sample name inserted into the report header.\n",
"\n",
"    Returns:\n",
"        None; the report is written to stdout.\n",
"    \"\"\"\n",
"    # The samples passed in this notebook hold the 'Volume' target, so the header names it\n",
"    print(f\"Распределение Volume в {title}:\")\n",
"    print(data.value_counts().sort_index())\n",
"    total = len(data)\n",
"    positive_count = (data > 0).sum()\n",
"    negative_count = (data < 0).sum()\n",
"    # An empty sample has no shares to report; avoid dividing by zero\n",
"    positive_percent = (positive_count / total) * 100 if total else 0.0\n",
"    negative_percent = (negative_count / total) * 100 if total else 0.0\n",
"    print(f\"Процент положительных значений: {positive_percent:.2f}%\")\n",
"    print(f\"Процент отрицательных значений: {negative_percent:.2f}%\")\n",
"    print(\"\\nНеобходима аугментация данных для балансировки классов.\\n\")\n",
"\n",
"# Run the report for the train, validation and test targets in turn\n",
"for sample, sample_title in (\n",
"    (y_train, \"обучающей выборке\"),\n",
"    (y_val, \"контрольной выборке\"),\n",
"    (y_test, \"тестовой выборке\"),\n",
"):\n",
"    analyze_distribution(sample, sample_title)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборка недостаточно сбалансирована. Выполним аугментацию данных."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Volume в обучающей выборке после oversampling:\n",
"Volume\n",
"1847800 3\n",
"1875200 3\n",
"1910400 3\n",
"1949200 3\n",
"2019200 3\n",
" ..\n",
"143464800 3\n",
"152007800 3\n",
"230883200 3\n",
"295411200 3\n",
"585508800 3\n",
"Name: count, Length: 4697, dtype: int64\n",
"Процент положительных значений: 100.00%\n",
"Процент отрицательных значений: 0.00%\n",
"Распределение Volume в контрольной выборке:\n",
"Volume\n",
"2380800 1\n",
"2407200 1\n",
"2412800 1\n",
"2547200 1\n",
"2659200 1\n",
" ..\n",
"85356800 1\n",
"87072000 1\n",
"111773600 1\n",
"114960000 1\n",
"155107200 1\n",
"Name: count, Length: 1593, dtype: int64\n",
"Процент положительных значений: 100.00%\n",
"Процент отрицательных значений: 0.00%\n",
"Распределение Volume в тестовой выборке:\n",
"Volume\n",
"1504000 1\n",
"2011200 1\n",
"2073600 1\n",
"2169700 1\n",
"2432000 1\n",
" ..\n",
"67067400 1\n",
"75863200 1\n",
"81587200 1\n",
"131420600 1\n",
"224358400 1\n",
"Name: count, Length: 1593, dtype: int64\n",
"Процент положительных значений: 100.00%\n",
"Процент отрицательных значений: 0.00%\n"
]
}
],
"source": [
"# RandomOverSampler equalises class counts, but 'Volume' is continuous: almost every\n",
"# value would be its own class and resampling would merely duplicate rows.\n",
"# Oversample on coarse volume classes instead and carry the real target with the features.\n",
"# NOTE(review): class borders (median and 90th percentile of the training target) are a judgement call - confirm\n",
"volume_edges = y_train.quantile([0.5, 0.9]).tolist()\n",
"volume_class = pd.cut(\n",
"    y_train,\n",
"    bins=[float('-inf'), *volume_edges, float('inf')],\n",
"    labels=['low', 'medium', 'high'],\n",
").astype(str)\n",
"\n",
"oversampler = RandomOverSampler(random_state=42)\n",
"train_resampled, volume_class_resampled = oversampler.fit_resample(\n",
"    X_train.assign(Volume=y_train), volume_class\n",
")\n",
"# Split the resampled rows back into features and the continuous target\n",
"X_train_resampled = train_resampled.drop(columns='Volume')\n",
"y_train_resampled = train_resampled['Volume']\n",
"\n",
"# Report helper: value distribution plus positive/negative shares of a target sample\n",
"def analyze_distribution(data, title):\n",
"    \"\"\"Print the sorted value counts of `data` and its positive/negative percentages.\n",
"\n",
"    `title` names the sample in the header line. Nothing is returned.\n",
"    NOTE(review): this re-declares a helper of the same name defined in an earlier cell.\n",
"    \"\"\"\n",
"    print(f\"Распределение Volume в {title}:\")\n",
"    print(data.value_counts().sort_index())\n",
"\n",
"    n_rows = len(data)\n",
"    above_zero = (data > 0).sum()\n",
"    below_zero = (data < 0).sum()\n",
"    positive_percent = (above_zero / n_rows) * 100\n",
"    negative_percent = (below_zero / n_rows) * 100\n",
"\n",
"    print(f\"Процент положительных значений: {positive_percent:.2f}%\")\n",
"    print(f\"Процент отрицательных значений: {negative_percent:.2f}%\")\n",
"\n",
"# Run the report for the oversampled train target and the untouched validation/test targets\n",
"for sample, sample_title in (\n",
"    (y_train_resampled, \"обучающей выборке после oversampling\"),\n",
"    (y_val, \"контрольной выборке\"),\n",
"    (y_test, \"тестовой выборке\"),\n",
"):\n",
"    analyze_distribution(sample, sample_title)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}