1304 lines
206 KiB
Plaintext
1304 lines
206 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Лабораторная работа №2"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd \n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### **Цены на автомобили**\n",
|
|||
|
"https://www.kaggle.com/datasets/deepcontractor/car-price-prediction-challenge\n",
|
|||
|
"\n",
|
|||
|
"Этот набор данных предоставляет подробную информацию о продаже автомобилей, включая их уникальные идентификаторы, цены, сборы и налоги, а также характеристики производителя и модели. В данных представлены год производства, категория автомобиля, наличие кожаного салона, тип топлива, объем двигателя, пробег, количество цилиндров, тип коробки передач, привод, количество дверей, расположение руля, цвет и количество подушек безопасности. Эти данные могут быть использованы для анализа рынка автомобилей, прогнозирования цен на основе различных факторов, а также для изучения влияния технических и визуальных характеристик на стоимость автомобилей."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выгрузка данных из csv файла \"Цены на автомобили\" в датафрейм"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n",
|
|||
|
" 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n",
|
|||
|
" 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
|
|||
|
" 'Airbags'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df1 = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
|
|||
|
"print(df1.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 15,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABWlUlEQVR4nO3dd3gU5f7+8XsT0kNCS6EECCAlgDQBg0JAEIgcBNGDIl0F9ICKYOOIUjwaFQuoKHr8AgrHI6ICCgqG3qIUAWkiYChKQieBAAlJnt8f/LKHJWU2Ickm8H5d1166M8/OfObZyTL3zsyzNmOMEQAAAAAgV26uLgAAAAAASjqCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwAAAABYIDgBAAAAgAWCEwCUEgcOHJDNZtPMmTNdXYqDxYsXq2nTpvL29pbNZtOZM2eKbF2DBg1SzZo1i2z5NyJX7lczZ86UzWbTgQMHin3dAJBfBCcALrd9+3bdd999qlGjhry9vVW1alXdeeedeu+994psnZ9//rkmT56cbfqRI0c0fvx4bd26tcjWfbWVK1fKZrPZHx4eHqpVq5YGDBigP/74o1DWsX79eo0fP77QQ83JkyfVu3dv+fj4aOrUqZo1a5b8/PxybJt1kJz18Pb2Vt26dTVixAgdPXq0UOsqTcaPH+/QL76+voqIiNDYsWOVnJzs6vIKxauvvqr58+e7ugxJUlxcnNzc3DRmzJgc57/++uuy2WxatGhRMVcGoKQr4+oCANzY1q9frw4dOqh69eoaMmSIQkNDdfjwYf3000+aMmWKHn/88SJZ7+eff64dO3Zo5MiRDtOPHDmiCRMmqGbNmmratGmRrDs3TzzxhFq2bKlLly7pl19+0ccff6xFixZp+/btqlKlyjUte/369ZowYYIGDRqkcuXKFU7BkjZu3KizZ8/q5ZdfVqdOnZx6zcSJExUeHq6LFy9q7dq1+vDDD/X9999rx44d8vX1zfO1//73v5WZmVkYpZc4H374ofz9/XXu3Dn9+OOPeuWVV7R8+XKtW7dONpvN1eVdk1dffVX33Xefevbs6TC9f//+euCBB+Tl5VVstURGRmrYsGF666231K9fPzVs2NA+7+DBg5o4caL+/ve/q1u3bsVWE4DSgeAEwKVeeeUVBQYGauPGjdkO6I8dO+aaoopASkpKrmdisrRt21b33XefJGnw4MGqW7eunnjiCX366ae5fjvualnvUX7CWHR0tG655RZJ0iOPPKKKFSvq7bff1oIFC9SnT58cX5PVfx4eHtdcc0l13333qVKlSpKkRx99VPfee6+++eYb/fTTT4qMjMzxNefPn7cMmyWZu7u73N3di329r732mhYsWKBhw4ZpzZo19mD6+OOPy8PDQ1OmTCmWOkr7+wfcaLhUD4BL7d+/Xw0bNszxwDs4ODjbtNmzZ6tVq1by9fVV+fLl1a5dO/3444/2+QsWLFC3bt1UpUoVeXl5qXbt2nr55ZeVkZFhb9O+fXstWrRIBw8etF8eVbNmTa1cuVItW7aUdDm4ZM278t6Pn3/+WV27dlVgYKB8fX0VFRWldevWOdSYdenVrl279OCDD6p8+fK6/fbb8903d9xxhyQpPj4+z3bLly9X27Zt5efnp3LlyqlHjx7avXu3Qz3PPPOMJCk8PNy+XVb3lcydO1ctWrSQj4+PKlWqpH79+umvv/6yz2/fvr0GDhwoSWrZsqVsNpsGDRp0zds5aNAg+fv7a//+/brrrrtUtmxZ9e3b1z7v6nucMjMzNWXKFDVu3Fje3t4KCgpS165dtWnTJod2s2fPtm9PhQoV9MADD+jw4cN51vbVV1/JZrNp1apV2eZ99NFHstls2rFjhyQpMTFRgwcPVrVq1eTl5aXKlSurR48eBb5/5+p+ad++vRo1aqTNmzerXbt28vX11T//+U9JlwPsww8/rJCQEH
l7e6tJkyb69NNPsy3zzJkzGjRokAIDA1WuXDkNHDgwx8s327dvr/bt22ebXpD+t9lsSklJ0aeffmrf97L2k9zucfrggw/UsGFDeXl5qUqVKho+fHi2OrP6Y9euXerQoYN8fX1VtWpVvfHGGxY9KwUGBmrKlClat26dPvnkE0nSvHnz9N133+m1115T5cqVlZmZqcmTJ6thw4by9vZWSEiIhg0bptOnTzssy5nPnCvrzen9A1A6cMYJgEvVqFFDcXFx2rFjhxo1apRn2wkTJmj8+PFq06aNJk6cKE9PT/38889avny5OnfuLOnygZi/v79GjRolf39/LV++XC+99JKSk5M1adIkSdILL7ygpKQk/fnnn3rnnXckSf7+/mrQoIEmTpyol156SUOHDlXbtm0lSW3atJF0OaBER0erRYsWGjdunNzc3DRjxgzdcccdWrNmjVq1auVQ79///nfddNNNevXVV2WMyXff7N+/X5JUsWLFXNssXbpU0dHRqlWrlsaPH68LFy7ovffe02233aZffvlFNWvWVK9evfT777/rv//9r9555x37WY2goKBclztz5kwNHjxYLVu2VExMjI4ePWo/0NyyZYvKlSunF154QfXq1dPHH39sv/yudu3ahbKd6enp6tKli26//Xa9+eabeX4r//DDD2vmzJmKjo7WI488ovT0dK1Zs0Y//fST/czWK6+8ohdffFG9e/fWI488ouPHj+u9995Tu3bt7NuTk27dusnf319ffvmloqKiHObNmTNHDRs2tO+39957r3bu3KnHH39cNWvW1LFjxxQbG6tDhw4VaECLnPrl5MmTio6O1gMPPKB+/fopJCREFy5cUPv27bVv3z6NGDFC4eHhmjt3rgYNGqQzZ87oySeflCQZY9SjRw+tXbtWjz76qBo0aKB58+bZw29BWfX/rFmz9Mgjj6hVq1YaOnSoJOW5n4wfP14TJkxQp06d9Nhjj2nPnj368MMPtXHjRq1bt87hrOPp06fVtWtX9erVS71799ZXX32l5557To0bN1Z0dHSedWddjvfcc8+pY8eOevLJJ9WmTRsNGzZMkjRs2DD738ETTzyh+Ph4vf/++9qyZYtDHc585mTJ6f0DUIoYAHChH3/80bi7uxt3d3cTGRlpnn32WbNkyRKTlpbm0G7v3r3Gzc3N3HPPPSYjI8NhXmZmpv3/z58/n20dw4YNM76+vubixYv2ad26dTM1atTI1nbjxo1GkpkxY0a2ddx0002mS5cu2dYXHh5u7rzzTvu0cePGGUmmT58+TvXBihUrjCQzffp0c/z4cXPkyBGzaNEiU7NmTWOz2czGjRuNMcbEx8dnq61p06YmODjYnDx50j5t27Ztxs3NzQwYMMA+bdKkSUaSiY+Pt6wnLS3NBAcHm0aNGpkLFy7Ypy9cuNBIMi+99JJ92owZM4wke415yWq7dOlSc/z4cXP48GHzxRdfmIoVKxofHx/z559/GmOMGThwoJFknn/++WzLGDhwoMP7tnz5ciPJPPHEE9naZr1PBw4cMO7u7uaVV15xmL99+3ZTpkyZbNOv1qdPHxMcHGzS09Pt0xISEoybm5uZOHGiMcaY06dPG0lm0qRJlv1wtaz9Zc+ePeb48eMmPj7efPTRR8bLy8uEhISYlJQUY4wxUVFRRpKZNm2aw+snT55sJJnZs2fbp6WlpZnIyEjj7+9vkpOTjTHGzJ8/30gyb7zxhr1denq6adu2bbb9KioqykRFRWWrtSD9b4wxfn5+ZuDAgdnaZO0TWfvlsWPHjKenp+ncubPD3/n7779v/xu5skZJ5rPPPrNPS01NNaGhoebee+/Ntq6cHDhwwPj5+ZkKFSoYDw8Ps337dmOMMWvWrDGSzH/+8x+H9osXL8423dnPnNzePwClB5fqAXCpO++8U3Fxcbr77ru1bds2vfHGG+rSpYuqVq2qb7/91t5u/vz5yszM1EsvvSQ3N8ePritvnPfx8bH//9mzZ3XixA
m1bdtW58+f12+//VbgOrdu3aq9e/fqwQcf1MmTJ3XixAmdOHFCKSkp6tixo1avXp1t0IJHH300X+t46KGHFBQUpCp
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Преобразуем год производства в целочисленный тип\n",
|
|||
|
"df1['Prod. year'] = df1['Prod. year'].astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df1['Prod. year'], df1['Price'])\n",
|
|||
|
"plt.xlabel('Production Year')\n",
|
|||
|
"plt.ylabel('Price')\n",
|
|||
|
"plt.title('Scatter Plot of Price vs Production Year')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Зашумленность не очень высокая. Покрытие данных высокое и подошло бы для поставленной задачи по актуальности."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 16,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" ID Price Levy Manufacturer Model Prod. year \\\n",
|
|||
|
"14 45732604 59464 891 HYUNDAI Santa FE 2016 \n",
|
|||
|
"36 45369569 51746 1077 TOYOTA CHR Limited 2019 \n",
|
|||
|
"47 45732544 55390 1017 HYUNDAI Santa FE 2017 \n",
|
|||
|
"56 44316016 87112 - MERCEDES-BENZ GLA 250 2019 \n",
|
|||
|
"73 45732043 53154 891 HYUNDAI Santa FE 2016 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"19144 45733642 56814 1017 HYUNDAI Sonata 2017 \n",
|
|||
|
"19161 45677230 64290 - LEXUS RX 450 F SPORT 2012 \n",
|
|||
|
"19180 45803164 63886 1076 HYUNDAI Sonata 2020 \n",
|
|||
|
"19188 45571892 61154 579 TOYOTA RAV 4 2017 \n",
|
|||
|
"19211 45802856 50037 891 HYUNDAI Santa FE 2016 \n",
|
|||
|
"\n",
|
|||
|
" Category Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
|
|||
|
"14 Jeep Yes Diesel 2 76000 km 4.0 \n",
|
|||
|
"36 Jeep No Petrol 2 10200 km 4.0 \n",
|
|||
|
"47 Jeep Yes Diesel 2 100734 km 4.0 \n",
|
|||
|
"56 Jeep Yes Petrol 2.0 Turbo 5323 km 4.0 \n",
|
|||
|
"73 Jeep Yes Diesel 2 84506 km 4.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"19144 Sedan Yes Petrol 2 67365 km 4.0 \n",
|
|||
|
"19161 Jeep Yes Hybrid 3.5 97000 km 6.0 \n",
|
|||
|
"19180 Sedan Yes LPG 2 5305 km 4.0 \n",
|
|||
|
"19188 Jeep No Hybrid 2.5 71234 km 4.0 \n",
|
|||
|
"19211 Jeep Yes Diesel 2 121902 km 4.0 \n",
|
|||
|
"\n",
|
|||
|
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
|
|||
|
"14 Automatic Front 04-May Left wheel White 4 \n",
|
|||
|
"36 Tiptronic Front 04-May Left wheel Red 12 \n",
|
|||
|
"47 Automatic Front 04-May Left wheel Black 4 \n",
|
|||
|
"56 Tiptronic 4x4 04-May Left wheel Grey 0 \n",
|
|||
|
"73 Automatic Front 04-May Left wheel Silver 4 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"19144 Automatic Front 04-May Left wheel Black 4 \n",
|
|||
|
"19161 Variator 4x4 04-May Left wheel Black 12 \n",
|
|||
|
"19180 Automatic Front 04-May Left wheel Silver 4 \n",
|
|||
|
"19188 Tiptronic 4x4 04-May Left wheel White 12 \n",
|
|||
|
"19211 Automatic Front 04-May Left wheel Black 4 \n",
|
|||
|
"\n",
|
|||
|
"[1073 rows x 18 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADQtUlEQVR4nOzdeXhTVfoH8G+SNk3XQAulZWtDQSGURZClQotiK0sFHFFEh0VFREQU0RFxYREV0Zmfu6h1FAd0VBwRECiCqCwWQWqRUkDEFgS6QAtt6ULa5v7+qDc2TXJz06TN0u/neXhmmnty77k3NzFvzjnvqxAEQQARERERERG1OKW7O0BERERERNRaMSAjIiIiIiJyEwZkREREREREbsKAjIiIiIiIyE0YkBEREREREbkJAzIiIiIiIiI3YUBGRERERETkJgzIiIiIiIiI3IQBGRERERERkZswICOiZpeXlweFQoFVq1a5uytm0tPT0b9/f2g0GigUCly8eLHZjnXnnXciNja22fbfGrnzvlq1ahUUCgXy8vJa/Nje4v7770dKSkqTn79//35cc801CA4OhkKhQFZWlus65yNiY2Nx5513mv7+7rvvoFAo8N1335kea42fPddeey2uvfZa09/u/KyYPHkyJk2a1OLHJe/CgIzICYcOHcItt9yCmJgYaDQadOrUCSkpKXj99deb7Zgff/wxXnnlFYvHz549iyVLlrTolxbxP/7iP39/f3Tr1g3Tpk3D77//7pJj/PDDD1iyZInLg6Xi4mJMmjQJgYGBePPNN7F69WoEBwdbbSt++Rb/aTQaXHHFFXjggQdQWFjo0n55kyVLlphdl6CgIOj1ejz11FMoKytzd/dc4vnnn8eXX37p7m4AADIyMqBUKrFw4UKr21esWAGFQoFNmza1cM8s5ebm4r333sMTTzxhdfuRI0dM7yVr7+2amhrceuutKCkpwcsvv4zVq1cjJiYGb731Vot/qY6NjcWNN95odZv4Gfj555+3aJ+8iSAIWL16NZKSktCmTRsEBQWhT58+eOaZZ1BRUdHk/ebk5GDJkiUe/6PIggUL8L///Q8HDx50d1fIkwlE1CR79uwR1Gq10L17d2HZsmVCWlqasGjRIuGGG24Q4uLimu24qampQkxMjMXj+/fvFwAIH3zwQbMdu7Fvv/1WACA8+OCDwurVq4X3339feOCBBwS1Wi2Eh4cLZ86cEQRBEHJzc5vct5deekkAIOTm5rq071u2bBEACNu2bbPb9oMPPhAACM8884ywevVqIS0tTZg+fbqgVCoFnU4nVFRU2N2HwWAQqqurXdF1j7F48WIBgLBy5Uph9erVwsqVK4W//e1vAgAhISFBMBqNzXp8Z+4ruYKDg4Xp06dbPF5bWytUVVU1+zk2dt999wn+/v5Cdna22eN5eXlCUFCQcOutt7Zof2x56KGHhCuuuMLm9ieeeEKIiooSAgIChLS0NIvtR44cEQBYbOvdu7cwYsQIV3dXUkxMjJCammp1m/gZuHbt2hbtkygmJsbs/hT78+2335oec+dnT21trTBp0iQBgJCYmCi8/PLLwjvvvCNMmTJFUCqVQnx8vFBQUNCkfa9du9biXEUjRowwu0+MRqNQVVUl1NbWNvFMnDN48GBh6tSpbjk2eQc/dwSBRL7gueeeg1arxf79+9GmTRuzbUVFRe7pVDOoqKiwOXIkSkxMxC233AIAuOuuu3DFFVfgwQcfxIcffmjz13x3E1+jxq+dlDFjxuDqq68GANxzzz2IiIjA//3f/2H9+vW4/fbbrT5HvH7+/v5O99lT3XLLLWjXrh0A4L777sPEiRPxxRdfYO/evUhISLD6nMrKSgQFBbVkN11KpVJBpVK1+HFfeOEFrF+/HrNmzcKuXbugUCgAAHPnzoW/vz9effXVFumH1OtXU1ODjz76CPfdd5/V7YIg4OOPP8Ydd9yB3NxcfPTRR7jnnnvM2jTl/dlUtb
W1MBqNUKvVzX4sd3DnZ8+LL76Izz77DI8++iheeukl0+P33nsvJk2ahJtuugl33nkntmzZ0qz9EEdjXUXOfxcbmjRpEhYvXoy33noLISEhLusH+Q5OWSRqohMnTqB3795WvzBERkZaPLZmzRoMHjwYQUFBaNu2LZKSkvD111+btq9fvx6pqano2LEjAgICEBcXh2XLlqGurs7U5tprr8WmTZtw8uRJ0zSx2NhYfPfddxg0aBCA+oBI3NZwas+PP/6I0aNHQ6vVIigoCCNGjMCePXvM+ihOQcvJycEdd9yBtm3bYvjw4Q5fm5EjRwKon7YkZceOHUhMTERwcDDatGmDCRMm4MiRI2b9+cc//gEA0Ol0pvOyN0Vl7dq1GDhwIAIDA9GuXTtMmTIFZ86cMW2/9tprMX36dADAoEGDoFAozNZhNPU877zzToSEhODEiRMYO3YsQkND8fe//920rfE6DqPRiFdffRV9+vSBRqNB+/btMXr0aPz0009m7dasWWM6n/DwcEyePBl//PGHZN8+//xzKBQKfP/99xbb3nnnHSgUCmRnZwMACgoKcNddd6Fz584ICAhAdHQ0JkyY0OSpQI2vy7XXXov4+HgcOHAASUlJCAoKMk1lKyoqwowZM9ChQwdoNBr069cPH374ocU+L168iDvvvBNarRZt2rTB9OnTrU51a7x2RNSU669QKFBRUYEPP/zQdO+J94mtNWRvvfUWevfujYCAAHTs2BFz5syx6Kd4PXJycnDdddchKCgInTp1wosvvmjnygJarRavvvoq9uzZg/feew8AsG7dOmzcuBEvvPACoqOjYTQa8corr6B3797QaDTo0KEDZs2ahQsXLpjtS85nTsP+Wnv9rNm9ezfOnz+P5ORkq9v37NmDvLw8TJ48GZMnT8bOnTtx+vRp0/Y777wTI0aMAADceuutUCgUuPbaaxEbG4vDhw/j+++/N70eDV/rixcvYt68eejSpQsCAgLQvXt3rFixAkaj0dRGXEv0z3/+E6+88gri4uIQEBCAnJwcu9derpMnT+L+++/HlVdeicDAQERERODWW2+1uFfEe2jPnj2YP38+2rdvj+DgYPztb3/DuXPnzNoKgoBnn30WnTt3RlBQEK677jocPnxYVn8a3/sNr8G7775rugaDBg3C/v37LZ6/du1a6PV6aDQaxMfHY926dbLWpVVVVeGll17CFVdcgeXLl1tsHzduHKZPn4709HTs3bvX9LhCocCSJUss2jdcL7dq1SrceuutAIDrrrvOdD80XDvXkK01ZEePHsUtt9yC8PBwaDQaXH311diwYYNZG/F1+v7773H//fcjMjISnTt3BgCUl5dj3rx5iI2NRUBAACIjI5GSkoLMzEyzfaSkpKCiogLbtm2TumTUinGEjKiJYmJikJGRgezsbMTHx0u2Xbp0KZYsWYJrrrkGzzzzDNRqNX788Ufs2LEDN9xwA4D6D/2QkBDMnz8fISEh2LFjBxYtWoSysjLTL4tPPvkkSktLcfr0abz88ssAgJCQEPTq1QvPPPMMFi1ahHvvvReJiYkAgGuuuQZAfeAzZswYDBw4EIsXL4ZSqcQHH3yAkSNHYteuXRg8eLBZf2+99Vb06NEDzz//PARBcPjanDhxAgAQERFhs8327dsxZswYdOvWDUuWLEFVVRVef/11DBs2DJmZmYiNjcXNN9+MX3/9Ff/973/x8ssvm0Zh2rdvb3O/q1atwl133YVBgwZh+fLlKCwsNH2B/fnnn9GmTRs8+eSTuPLKK/Huu+/imWeegU6nQ1xcnEvOs7a2FqNGjcLw4cPxz3/+U3IUaMaMGVi1ahXGjBmDe+65B7W1tdi1axf27t1rGol77rnn8PTTT2PSpEm45557cO7cObz++utISkoynY81qampCAkJwWeffWb6civ69NNP0bt3b9N9O3HiRBw+fBhz585FbGwsioqKsG3bNpw6dapJyQCsXZfi4mKMGTMGkydPxpQpU9
ChQwdUVVXh2muvxW+//YYHHngAOp0Oa9euxZ133omLFy/ioYceAlD/ZXTChAnYvXs37rvvPvTq1Qvr1q0zBdVNZe/
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Преобразуем год производства в целочисленный тип\n",
|
|||
|
"df1['Prod. year'] = df1['Prod. year'].astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df1['Price'].quantile(0.25)\n",
|
|||
|
"Q3 = df1['Price'].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df1['Price'] < (Q1 - threshold)) | (df1['Price'] > (Q3 + threshold))\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df1[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы заменим выбросы на медианное значение\n",
|
|||
|
"median_price = df1['Price'].median()\n",
|
|||
|
"df1.loc[outliers, 'Price'] = median_price\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df1['Prod. year'], df1['Price'])\n",
|
|||
|
"plt.xlabel('Production Year')\n",
|
|||
|
"plt.ylabel('Price')\n",
|
|||
|
"plt.title('Scatter Plot of Price vs Production Year (After Handling Outliers)')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Очистим от строк с пустыми значениями наш датасет"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Количество удаленных строк: 0\n",
|
|||
|
"\n",
|
|||
|
"DataFrame после удаления строк с пропущенными значениями:\n",
|
|||
|
" ID Price Levy Manufacturer Model Prod. year Category \\\n",
|
|||
|
"0 45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
|
|||
|
"1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
|
|||
|
"2 45774419 8467 - HONDA FIT 2006 Hatchback \n",
|
|||
|
"3 45769185 3607 862 FORD Escape 2011 Jeep \n",
|
|||
|
"4 45809263 11726 446 HONDA FIT 2014 Hatchback \n",
|
|||
|
"... ... ... ... ... ... ... ... \n",
|
|||
|
"19232 45798355 8467 - MERCEDES-BENZ CLK 200 1999 Coupe \n",
|
|||
|
"19233 45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
|
|||
|
"19234 45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
|
|||
|
"19235 45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
|
|||
|
"19236 45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
|
|||
|
"\n",
|
|||
|
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
|
|||
|
"0 Yes Hybrid 3.5 186005 km 6.0 \n",
|
|||
|
"1 No Petrol 3 192000 km 6.0 \n",
|
|||
|
"2 No Petrol 1.3 200000 km 4.0 \n",
|
|||
|
"3 Yes Hybrid 2.5 168966 km 4.0 \n",
|
|||
|
"4 Yes Petrol 1.3 91901 km 4.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"19232 Yes CNG 2.0 Turbo 300000 km 4.0 \n",
|
|||
|
"19233 Yes Petrol 2.4 161600 km 4.0 \n",
|
|||
|
"19234 Yes Diesel 2 116365 km 4.0 \n",
|
|||
|
"19235 Yes Diesel 2 51258 km 4.0 \n",
|
|||
|
"19236 Yes Hybrid 2.4 186923 km 4.0 \n",
|
|||
|
"\n",
|
|||
|
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
|
|||
|
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
|
|||
|
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
|
|||
|
"2 Variator Front 04-May Right-hand drive Black 2 \n",
|
|||
|
"3 Automatic 4x4 04-May Left wheel White 0 \n",
|
|||
|
"4 Automatic Front 04-May Left wheel Silver 4 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"19232 Manual Rear 02-Mar Left wheel Silver 5 \n",
|
|||
|
"19233 Tiptronic Front 04-May Left wheel Red 8 \n",
|
|||
|
"19234 Automatic Front 04-May Left wheel Grey 4 \n",
|
|||
|
"19235 Automatic Front 04-May Left wheel Black 4 \n",
|
|||
|
"19236 Automatic Front 04-May Left wheel White 12 \n",
|
|||
|
"\n",
|
|||
|
"[19237 rows x 18 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Удаление строк с пропущенными значениями\n",
|
|||
|
"df_dropna = df1.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Вывод количества удаленных строк\n",
|
|||
|
"num_deleted_rows = len(df1) - len(df_dropna)\n",
|
|||
|
"print(f\"\\nКоличество удаленных строк: {num_deleted_rows}\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nDataFrame после удаления строк с пропущенными значениями:\")\n",
|
|||
|
"print(df_dropna)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Теперь создадим выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 11542\n",
|
|||
|
"Размер контрольной выборки: 3847\n",
|
|||
|
"Размер тестовой выборки: 3848\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и временную выборки\n",
|
|||
|
"train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение выборок в файлы\n",
|
|||
|
"train_df.to_csv(\"..//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\"..//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\"..//static//csv//test_data.csv\", index=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проанализируем сбалансированность выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Category в обучающей выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Sedan 5289\n",
|
|||
|
"Jeep 3246\n",
|
|||
|
"Hatchback 1684\n",
|
|||
|
"Minivan 396\n",
|
|||
|
"Coupe 318\n",
|
|||
|
"Universal 216\n",
|
|||
|
"Microbus 184\n",
|
|||
|
"Goods wagon 151\n",
|
|||
|
"Pickup 31\n",
|
|||
|
"Cabriolet 20\n",
|
|||
|
"Limousine 7\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 45.82%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 28.12%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Category в контрольной выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Sedan 1697\n",
|
|||
|
"Jeep 1109\n",
|
|||
|
"Hatchback 608\n",
|
|||
|
"Minivan 129\n",
|
|||
|
"Coupe 105\n",
|
|||
|
"Universal 73\n",
|
|||
|
"Microbus 57\n",
|
|||
|
"Goods wagon 42\n",
|
|||
|
"Pickup 17\n",
|
|||
|
"Cabriolet 9\n",
|
|||
|
"Limousine 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 44.11%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 28.83%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Category в тестовой выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Sedan 1750\n",
|
|||
|
"Jeep 1118\n",
|
|||
|
"Hatchback 555\n",
|
|||
|
"Minivan 122\n",
|
|||
|
"Coupe 109\n",
|
|||
|
"Universal 75\n",
|
|||
|
"Microbus 65\n",
|
|||
|
"Goods wagon 40\n",
|
|||
|
"Cabriolet 7\n",
|
|||
|
"Pickup 4\n",
|
|||
|
"Limousine 3\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 45.48%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 29.05%\n",
|
|||
|
"\n",
|
|||
|
"Необходима аугментация данных для балансировки классов.\n",
|
|||
|
"Необходима аугментация данных для балансировки классов.\n",
|
|||
|
"Необходима аугментация данных для балансировки классов.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df = pd.read_csv(\"..//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\"..//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\"..//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Оценка сбалансированности\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Category'].value_counts()\n",
|
|||
|
" print(f\"Распределение Category в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print(f\"Процент автомобилей категории 'Седан': {counts['Sedan'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент автомобилей категории 'Джип': {counts['Jeep'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Определение необходимости аугментации данных\n",
|
|||
|
"def need_augmentation(df):\n",
|
|||
|
" counts = df['Category'].value_counts()\n",
|
|||
|
" ratio = counts['Sedan'] / counts['Jeep']\n",
|
|||
|
" if ratio > 1.5 or ratio < 0.67:\n",
|
|||
|
" print(\"Необходима аугментация данных для балансировки классов.\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Аугментация данных не требуется.\")\n",
|
|||
|
" \n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"need_augmentation(train_df)\n",
|
|||
|
"need_augmentation(val_df)\n",
|
|||
|
"need_augmentation(test_df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"По результатам анализа требуется приращение, соотношения отзывов вне допустимого диапазона"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение Category в обучающей выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Jeep 5289\n",
|
|||
|
"Hatchback 5289\n",
|
|||
|
"Sedan 5289\n",
|
|||
|
"Goods wagon 5289\n",
|
|||
|
"Cabriolet 5289\n",
|
|||
|
"Universal 5289\n",
|
|||
|
"Minivan 5289\n",
|
|||
|
"Microbus 5289\n",
|
|||
|
"Coupe 5289\n",
|
|||
|
"Pickup 5289\n",
|
|||
|
"Limousine 5289\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 9.09%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 9.09%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Category в контрольной выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Jeep 1697\n",
|
|||
|
"Sedan 1697\n",
|
|||
|
"Minivan 1697\n",
|
|||
|
"Coupe 1697\n",
|
|||
|
"Hatchback 1697\n",
|
|||
|
"Goods wagon 1697\n",
|
|||
|
"Universal 1697\n",
|
|||
|
"Microbus 1697\n",
|
|||
|
"Pickup 1697\n",
|
|||
|
"Cabriolet 1697\n",
|
|||
|
"Limousine 1697\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 9.09%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 9.09%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Category в тестовой выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Jeep 1750\n",
|
|||
|
"Hatchback 1750\n",
|
|||
|
"Sedan 1750\n",
|
|||
|
"Coupe 1750\n",
|
|||
|
"Minivan 1750\n",
|
|||
|
"Goods wagon 1750\n",
|
|||
|
"Microbus 1750\n",
|
|||
|
"Universal 1750\n",
|
|||
|
"Cabriolet 1750\n",
|
|||
|
"Pickup 1750\n",
|
|||
|
"Limousine 1750\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 9.09%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 9.09%\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение Category в обучающей выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Cabriolet 7\n",
|
|||
|
"Coupe 7\n",
|
|||
|
"Goods wagon 7\n",
|
|||
|
"Hatchback 7\n",
|
|||
|
"Jeep 7\n",
|
|||
|
"Limousine 7\n",
|
|||
|
"Microbus 7\n",
|
|||
|
"Minivan 7\n",
|
|||
|
"Pickup 7\n",
|
|||
|
"Sedan 7\n",
|
|||
|
"Universal 7\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 9.09%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 9.09%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Category в контрольной выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Cabriolet 1\n",
|
|||
|
"Coupe 1\n",
|
|||
|
"Goods wagon 1\n",
|
|||
|
"Hatchback 1\n",
|
|||
|
"Jeep 1\n",
|
|||
|
"Limousine 1\n",
|
|||
|
"Microbus 1\n",
|
|||
|
"Minivan 1\n",
|
|||
|
"Pickup 1\n",
|
|||
|
"Sedan 1\n",
|
|||
|
"Universal 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 9.09%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 9.09%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Category в тестовой выборке:\n",
|
|||
|
"Category\n",
|
|||
|
"Cabriolet 3\n",
|
|||
|
"Coupe 3\n",
|
|||
|
"Goods wagon 3\n",
|
|||
|
"Hatchback 3\n",
|
|||
|
"Jeep 3\n",
|
|||
|
"Limousine 3\n",
|
|||
|
"Microbus 3\n",
|
|||
|
"Minivan 3\n",
|
|||
|
"Pickup 3\n",
|
|||
|
"Sedan 3\n",
|
|||
|
"Universal 3\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент автомобилей категории 'Седан': 9.09%\n",
|
|||
|
"Процент автомобилей категории 'Джип': 9.09%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Загрузка данных\n",
|
|||
|
"train_df = pd.read_csv(\"..//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\"..//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\"..//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование категориальных признаков в числовые\n",
|
|||
|
"def encode(df):\n",
|
|||
|
" label_encoders = {}\n",
|
|||
|
" for column in df.select_dtypes(include=['object']).columns:\n",
|
|||
|
" if column != 'Category': # Пропускаем целевую переменную\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df[column] = le.fit_transform(df[column])\n",
|
|||
|
" label_encoders[column] = le\n",
|
|||
|
" return label_encoders\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование целевой переменной в числовые значения\n",
|
|||
|
"def encode_target(df):\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df['Category'] = le.fit_transform(df['Category'])\n",
|
|||
|
" return le\n",
|
|||
|
"\n",
|
|||
|
"# Применение кодирования\n",
|
|||
|
"label_encoders = encode(train_df)\n",
|
|||
|
"encode(val_df)\n",
|
|||
|
"encode(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование целевой переменной\n",
|
|||
|
"le_target = encode_target(train_df)\n",
|
|||
|
"encode_target(val_df)\n",
|
|||
|
"encode_target(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка типов данных\n",
|
|||
|
"def check_data_types(df):\n",
|
|||
|
" for column in df.columns:\n",
|
|||
|
" if df[column].dtype == 'object':\n",
|
|||
|
" print(f\"Столбец '{column}' содержит строковые данные.\")\n",
|
|||
|
"\n",
|
|||
|
"check_data_types(train_df)\n",
|
|||
|
"check_data_types(val_df)\n",
|
|||
|
"check_data_types(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения oversampling\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" if 'Category' not in df.columns:\n",
|
|||
|
" print(\"Столбец 'Category' отсутствует.\")\n",
|
|||
|
" return df\n",
|
|||
|
" \n",
|
|||
|
" X = df.drop('Category', axis=1)\n",
|
|||
|
" y = df['Category']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения undersampling\n",
|
|||
|
"def undersample(df):\n",
|
|||
|
" if 'Category' not in df.columns:\n",
|
|||
|
" print(\"Столбец 'Category' отсутствует.\")\n",
|
|||
|
" return df\n",
|
|||
|
" \n",
|
|||
|
" X = df.drop('Category', axis=1)\n",
|
|||
|
" y = df['Category']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"# Применение oversampling и undersampling к каждой выборке\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Обратное преобразование целевой переменной в строковые метки\n",
|
|||
|
"def decode_target(df, le_target):\n",
|
|||
|
" df['Category'] = le_target.inverse_transform(df['Category'])\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df_oversampled, le_target)\n",
|
|||
|
"decode_target(val_df_oversampled, le_target)\n",
|
|||
|
"decode_target(test_df_oversampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df_undersampled, le_target)\n",
|
|||
|
"decode_target(val_df_undersampled, le_target)\n",
|
|||
|
"decode_target(test_df_undersampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка результатов\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" if 'Category' not in df.columns:\n",
|
|||
|
" print(f\"Столбец 'Category' отсутствует в {name}.\")\n",
|
|||
|
" return\n",
|
|||
|
" \n",
|
|||
|
" counts = df['Category'].value_counts()\n",
|
|||
|
" print(f\"Распределение Category в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" \n",
|
|||
|
" if 'Sedan' in counts and 'Jeep' in counts:\n",
|
|||
|
" print(f\"Процент автомобилей категории 'Седан': {counts['Sedan'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент автомобилей категории 'Джип': {counts['Jeep'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Отсутствуют одна или обе категории (Седан/Внедорожник).\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после oversampling\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после undersampling\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### **Классические рок-треки (по данным Spotify)**\n",
|
|||
|
"https://www.kaggle.com/datasets/thebumpkin/14400-classic-rock-tracks-with-spotify-data\n",
|
|||
|
"\n",
|
|||
|
" Этот набор данных, содержащий 1200 уникальных альбомов и 14 400 треков, представляет собой не просто коллекцию — это хроника эволюции классического рока. Каждый трек тщательно каталогизирован с 18 столбцами данных, включая ключевые метаданные, такие как название трека, исполнитель, альбом и год выпуска, наряду с функциями Spotify audio, которые позволяют получить представление о звуковом ландшафте этих неподвластных времени мелодий. Бизнес-цель может заключаться в улучшении стратегии маркетинга и продвижения музыкальных треков. Предположим как этот набор может быть полезен для бизнеса: Персонализированные рекомендации: Создание алгоритмов, которые будут рекомендовать пользователям музыку на основе их предпочтений. Цель технического проекта: Разработать и внедрить систему рекомендаций, которая будет предсказывать и рекомендовать пользователям музыкальные треки на основе их предпочтений и поведения. Входные данные: Данные о пользователях: Идентификатор пользователя, история прослушиваний, оценки треков, время прослушивания, частота прослушивания. Данные о треках: Атрибуты треков (название, исполнитель, альбом, год, длительность, танцевальность, энергичность, акустичность и т.д.). Данные о взаимодействии: Время и частота взаимодействия пользователя с определенными треками. Целевой признак: Рекомендации: Булева переменная, указывающая, должен ли конкретный трек быть рекомендован пользователю (1 - рекомендуется, 0 - не рекомендуется)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выгрузка данных из csv файла \"Классические рок-треки\" в датафрейм"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 43,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature',\n",
|
|||
|
" 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness',\n",
|
|||
|
" 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',\n",
|
|||
|
" 'Popularity'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv(\"..//static//csv//UltimateClassicRock.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzVUlEQVR4nO3dd5RV5dnw4XsGptBRUYoCgg3ELqhAFFQs2DXqp6KIvUAUjTEaC9iisb72khcBA/ZEUINRFEskRikLSxTUCKJB5FXpAgPM/v5wzQnDDFV8BsbrWosVdjvnmT07zvmxy+RlWZYFAADATyy/qgcAAAD8PIgPAAAgCfEBAAAkIT4AAIAkxAcAAJCE+AAAAJIQHwAAQBLiAwAASEJ8AAAASYgPgCo0ZcqUyMvLi0GDBlX1UFiBXr16xZZbbrnOXzcvLy/69++/zl8XYH0mPoBqYdCgQZGXl1fuz2abbRb77rtvvPDCC8nH89prr5UbS0FBQbRu3Tp69uwZn3322Tp5j3/84x/Rv3//mDVr1jp5vaqQYj9tKKrD9xNgVWpW9QAA1qVrr702WrVqFVmWxddffx2DBg2KQw45JJ577rk47LDDko/nggsuiA4dOsTixYtj/Pjx8dBDD8Vf//rXeP/996NZs2Y/6rX/8Y9/xDXXXBO9evWKhg0brpsBV5Gfcj+trxYsWBA1a/73x3B1+n4CrIj4AKqV7t27R/v27XPTZ5xxRjRu3Dgee+yxKomPvffeO4499tiIiDjttNNi2223jQsuuCAGDx4cl19+efLxrK9+LvuptLQ0SkpKori4OIqLi6t6OADJuewKqNYaNmwYtWrVKvcvzBER8+fPj1//+tfRvHnzKCoqiu222y5uvfXWyLIsIn74V+k2bdpEmzZtYsGCBbntvvvuu2jatGl06tQpli5dusbj2W+//SIiYvLkyStdb9SoUbH33ntHnTp1omHDhnHkkUfGRx99lFvev3//+M1vfhMREa1atcpdtjRlypRVjmH5y9PK/rz22msV1u3Vq1el6y5/r8LTTz8d7du3j3r16pVb79Zbb13leCpT2X667777ol27dlFUVBTNmjWL3r17V7hEqWvXrrHDDjvEuHHjolOnTlGrVq1o1apVPPDAA+XWK7tMb/n9VXYZWGX7Ylm33nprdOrUKTbZZJOoVatW7L777vH0009XWC8vLy/69OkTQ4cOzY39b3/7W25Z2X5c2fezS5cusfPOO1c6ju222y4OOuiglY4VYH3izAdQrcyePTu++eabyLIsZsyYEXfffXfMmzcvTj755Nw6WZbFEUccEa+++mqcccYZscsuu8SLL74Yv/nNb+I///lP3HHHHVGrVq0YPHhwdO7cOa644oq4/fbbIyKid+/eMXv27Bg0aFDUqFFjjcf373//OyIiNtlkkxWu8/LLL0f37t2jdevW0b9//1iwYEHcfffd0blz5xg/fnxsueWWccwxx8THH38cjz32WNxxxx3RqFGjiIjYdNNNV2scBxxwQPTs2TMiIsaMGRN33XXXCtdt1KhR3HHHHbnpU045pdzyt956K44//vjYeeed46abbooGDRrEN998ExdddNFqjaUyy++n/v37xzXXXBPdunWL8847LyZNmhT3339/jBkzJkaPHh0FBQW5bWfOnBmHHHJIHH/88XHiiSfGk08+Geedd14UFhbG6aefvtZjWtadd94ZRxxxRPTo0SNKSkri8ccfj+OOOy6ef/75OPTQQ8utO2rUqHjyySejT58+0ahRo0pvXl/Z9/OUU06Js846Kz744IPYYYcdctuMGTMmPv7447jyyivXydcEkEQGUA0MHDgwi4gKf4qKirJBgwaVW3fYsGFZRGTXX399ufnHHntslpeXl3366ae5eZdffnmWn5+fvfHGG9lTTz2VRUT2P//zP6scz6uvvppFRPbwww9n//d//5dNmzYt++tf/5ptueWWWV5eXjZmzJgsy7
Js8uTJWURkAwcOzG27yy67ZJtttln27bff5ua9++67WX5+ftazZ8/cvFtuuSWLiGzy5MmrvZ9KSkqyiMj69OmTm1f2db366qsV1u/Ro0fWqlWrcvMiIuvXr19u+vLLL88iIvvqq69y88q+rltuuWWl41md/TRjxoyssLAwO/DAA7OlS5fmtr3nnnty25bp0qVLFhHZbbfdlpu3aNGi3D4tKSnJsuy/x8vy+65sPMvui1NPPTVr2bJlufW+//77ctMlJSXZDjvskO23334V9lV+fn72r3/9q8LXvvx+XNH3c9asWVlxcXH229/+ttz8Cy64IKtTp042b968Cq8NsL5y2RVQrdx7770xcuTIGDlyZAwZMiT23XffOPPMM+Mvf/lLbp0RI0ZEjRo14oILLii37a9//evIsqzc07H69+8f7dq1i1NPPTXOP//86NKlS4XtVub000+PTTfdNJo1axaHHnpozJ8/PwYPHlzuvpRlffXVVzFhwoTo1atXbLzxxrn5O+20UxxwwAExYsSI1X7vyixcuDAiYrXvNygpKYmioqKVrjN37tzIz8//UTdJr2w/vfzyy1FSUhJ9+/aN/Pz//tg666yzon79+vHXv/613GvVrFkzzjnnnNx0YWFhnHPOOTFjxowYN27cWo9xWbVq1cr9febMmTF79uzYe++9Y/z48RXW7dKlS2y//fZr/V4NGjSII488Mh577LHcZYFLly6NJ554Io466qioU6fOWr82QGouuwKqlT322KPcB/sTTzwxdt111+jTp08cdthhUVhYGJ9//nk0a9Ys6tWrV27btm3bRkTE559/nptXWFgYDz/8cHTo0CGKi4tj4MCBkZeXt9rjufrqq2PvvfeOGjVqRKNGjaJt27YV7j9ZVtl7b7fddhWWtW3bNl588cWYP3/+Wn/g/OabbyLihw+0q2PWrFlRt27dla7TsWPHuOeee+LCCy+MSy+9NBo0aBAzZ85co3GtbD+taJ8UFhZG69aty32/IiKaNWtWYf9su+22EfHD71XZa6+91mhslXn++efj+uuvjwkTJsSiRYty8ys7Nlq1avWj369nz57xxBNPxN///vfYZ5994uWXX46vv/66wiVwAOs78QFUa/n5+bHvvvvGnXfeGZ988km0a9dujV/jxRdfjIgfzhp88skna/Rhcscdd4xu3bqt8Xv+VMpusF7dX5o3ffr0aNmy5UrXOeGEE2L8+PFx9913x0MPPbRW40q9n1YUkKvzEIG///3vccQRR8Q+++wT9913XzRt2jQKCgpi4MCB8eijj1ZYf9mzJGvroIMOisaNG8eQIUNin332iSFDhkSTJk3Wq2MLYHW47Aqo9pYsWRIREfPmzYuIiJYtW8a0adNi7ty55dabOHFibnmZ9957L6699to47bTTYtddd40zzzwzZs+e/ZONtey9J02aVGHZxIkTo1GjRrl/1V+TMzBlxo4dGxGxwsu+lrV48eL49NNPc2eEViQ/Pz9uvfXW6Nq1a2yzzTa5S97WlRXtk5KSkpg8eXKFOJo2bVrMnz+/3LyPP/44Iv4bXRtttFFERIWnZS1/FqUyf/7zn6O4uDhefPHFOP3006N79+7rJAJW9v2sUaNGnHTSSfH000/HzJkzY9iwYXHiiSeu1UMPAKqS+ACqtcWLF8dLL70UhYWFuQ/RhxxySCxdujTuueeecuvecccdkZeXF927d89t26tXr2jWrFnceeedMWjQoPj6669/1FOcVqVp06axyy67xODBg8t9MP7ggw/ipZdeikMOOSQ3ryxC1uQ3Yj/99NOx3XbbRZs2bVa57vDhw2PBggW5x96uzN133x2jRo2KoUOHRrdu3aJz586rPaZV6datWxQWFsZdd92Vu+chImLAgAExe/bsCk+XWrJkSTz44IO56ZKSknjwwQdj0003jd133z0iIrbaaquIiHjjjTdy6y1dunS1ztzUqFEj8vLyyp0lmTJlSgwbNmytvr4yq/p+nn
LKKTFz5sw455xzKjzBDWBD4bIroFp54YUXcmcwZsyYEY8++mh88skncdlll0X9+vUjIuLwww+PfffdN6644oqYMmV
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Box plot для столбца 'Popularity'\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Popularity'])\n",
|
|||
|
"plt.title('Box Plot для Popularity')\n",
|
|||
|
"plt.xlabel('Popularity')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Решим проблему пустых значений при помощи удаления таких строк."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df_cleaned = df.dropna()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 8650\n",
|
|||
|
"Размер контрольной выборки: 2884\n",
|
|||
|
"Размер тестовой выборки: 2884\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оценка сбалансированности выборок: по результатам видно, что баланса тут мало"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Popularity в обучающей выборке:\n",
|
|||
|
"Popularity\n",
|
|||
|
"23 258\n",
|
|||
|
"15 250\n",
|
|||
|
"26 246\n",
|
|||
|
"21 245\n",
|
|||
|
"14 245\n",
|
|||
|
" ... \n",
|
|||
|
"84 1\n",
|
|||
|
"87 1\n",
|
|||
|
"91 1\n",
|
|||
|
"79 1\n",
|
|||
|
"86 1\n",
|
|||
|
"Name: count, Length: 88, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в контрольной выборке:\n",
|
|||
|
"Popularity\n",
|
|||
|
"17 90\n",
|
|||
|
"26 86\n",
|
|||
|
"21 83\n",
|
|||
|
"24 83\n",
|
|||
|
"28 80\n",
|
|||
|
" ..\n",
|
|||
|
"85 1\n",
|
|||
|
"83 1\n",
|
|||
|
"84 1\n",
|
|||
|
"80 1\n",
|
|||
|
"77 1\n",
|
|||
|
"Name: count, Length: 85, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в тестовой выборке:\n",
|
|||
|
"Popularity\n",
|
|||
|
"22 86\n",
|
|||
|
"21 85\n",
|
|||
|
"12 84\n",
|
|||
|
"20 82\n",
|
|||
|
"26 81\n",
|
|||
|
" ..\n",
|
|||
|
"76 2\n",
|
|||
|
"71 2\n",
|
|||
|
"79 1\n",
|
|||
|
"82 1\n",
|
|||
|
"80 1\n",
|
|||
|
"Name: count, Length: 80, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Popularity'].value_counts()\n",
|
|||
|
" print(f\"Распределение Popularity в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним оверсэмплинг и андерсэмплинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Popularity в обучающей выборке после oversampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"44 258\n",
|
|||
|
"20 258\n",
|
|||
|
"30 258\n",
|
|||
|
"27 258\n",
|
|||
|
"8 258\n",
|
|||
|
" ... \n",
|
|||
|
"78 258\n",
|
|||
|
"79 258\n",
|
|||
|
"74 258\n",
|
|||
|
"81 258\n",
|
|||
|
"86 258\n",
|
|||
|
"Name: count, Length: 88, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в контрольной выборке после oversampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"21 90\n",
|
|||
|
"11 90\n",
|
|||
|
"28 90\n",
|
|||
|
"23 90\n",
|
|||
|
"37 90\n",
|
|||
|
" ..\n",
|
|||
|
"61 90\n",
|
|||
|
"84 90\n",
|
|||
|
"80 90\n",
|
|||
|
"77 90\n",
|
|||
|
"0 90\n",
|
|||
|
"Name: count, Length: 85, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в тестовой выборке после oversampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"14 86\n",
|
|||
|
"47 86\n",
|
|||
|
"27 86\n",
|
|||
|
"13 86\n",
|
|||
|
"66 86\n",
|
|||
|
" ..\n",
|
|||
|
"63 86\n",
|
|||
|
"79 86\n",
|
|||
|
"71 86\n",
|
|||
|
"82 86\n",
|
|||
|
"80 86\n",
|
|||
|
"Name: count, Length: 80, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop('Popularity', axis=1)\n",
|
|||
|
" y = df['Popularity']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Popularity в обучающей выборке после undersampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"0 1\n",
|
|||
|
"1 1\n",
|
|||
|
"2 1\n",
|
|||
|
"3 1\n",
|
|||
|
"4 1\n",
|
|||
|
" ..\n",
|
|||
|
"84 1\n",
|
|||
|
"85 1\n",
|
|||
|
"86 1\n",
|
|||
|
"87 1\n",
|
|||
|
"91 1\n",
|
|||
|
"Name: count, Length: 88, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в контрольной выборке после undersampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"0 1\n",
|
|||
|
"1 1\n",
|
|||
|
"2 1\n",
|
|||
|
"3 1\n",
|
|||
|
"4 1\n",
|
|||
|
" ..\n",
|
|||
|
"82 1\n",
|
|||
|
"83 1\n",
|
|||
|
"84 1\n",
|
|||
|
"85 1\n",
|
|||
|
"87 1\n",
|
|||
|
"Name: count, Length: 85, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в тестовой выборке после undersampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"0 1\n",
|
|||
|
"1 1\n",
|
|||
|
"2 1\n",
|
|||
|
"3 1\n",
|
|||
|
"4 1\n",
|
|||
|
" ..\n",
|
|||
|
"76 1\n",
|
|||
|
"77 1\n",
|
|||
|
"79 1\n",
|
|||
|
"80 1\n",
|
|||
|
"82 1\n",
|
|||
|
"Name: count, Length: 80, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def undersample(df):\n",
|
|||
|
" X = df.drop('Popularity', axis=1)\n",
|
|||
|
" y = df['Popularity']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### **Онлайн обучение**\n",
|
|||
|
"\n",
|
|||
|
"https://www.kaggle.com/datasets/shariful07/student-flexibility-in-online-learning\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"Этот набор данных предоставляет информацию о студентах и их характеристиках, связанных с обучением и использованием технологий. В данных представлены следующие атрибуты: уровень образования студента (например, бакалавриат, магистратура), тип учебного заведения (государственное или частное), пол, возраст, тип используемого устройства, является ли студент IT-специалистом, местоположение, финансовое состояние, тип интернета, тип сети и уровень гибкости в обучении. Эти данные могут быть использованы для анализа влияния различных факторов на успеваемость студентов, оптимизации образовательных программ и разработки стратегий поддержки студентов в условиях цифровизации образования."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выгрузка данных из csv файла \"Онлайн обучение\" в датафрейм"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 52,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',\n",
|
|||
|
" 'IT Student', 'Location', 'Financial Condition', 'Internet Type',\n",
|
|||
|
" 'Network Type', 'Flexibility Level'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv(\"..//static//csv//students_adaptability_level_online_education.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"При помощи ящика с усами и колонки возраста проверим набор на баланс."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 53,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAodElEQVR4nO3deXSV1b344W/CEKiEcBUEooA4gbXiVMThoqKooHWoWMXrRJ1aBXGqVa9V0erSJd5qxaH1LjT2Sq1iUetVqqBoa2sdQByuFYHGEQQBCYggIPv3R3+kRsjEsAPkedbKanPO++7sczbvOfl4zntSkFJKAQAAsJ4VNvQEAACAxkF8AAAAWYgPAAAgC/EBAABkIT4AAIAsxAcAAJCF+AAAALIQHwAAQBbiAwAAyEJ8AGzi3nvvvSgoKIiysrKGngoAjZz4AKijsrKyKCgoqPK15ZZbRp8+fWLs2LHZ5/Pcc89VmUuzZs1i2223jVNPPTX+8Y9/rJOf8de//jWGDRsW8+fPXyfjNbQnn3wyCgoKorS0NFasWNHQ0wFodJo29AQANjbXXnttdO3aNVJKMWvWrCgrK4vDDz88Hn/88fje976XfT5Dhw6Nnj17xrJly2LSpElx9913xxNPPBFvvvlmlJaWrtXYf/3rX+Oaa66JQYMGRZs2bdbNhBvQqFGjYpttton33nsvnn322ejbt29DTwmgUfHKB0A99e/fP04++eQ45ZRT4ic/+Un8+c9/jmbNmsUDDzzQIPPp3bt3nHzyyfHDH/4wRowYETfffHPMmzcv7rvvvgaZz4Zq0aJF8dhjj8VFF10Uu+++e4waNaqhpwTQ6IgPgLXUpk2baNmyZTRtWvXF5EWLFsXFF18cnTp1iqKioujWrVvcfPPNkVKKiIjFixdH9+7do3v37rF48eLK/ebNmxcdO3aMfffdN7766qt6z+eggw6KiIjy8vIat3v22Wejd+/esdlmm0WbNm3i6KOPjr///e+V1w8bNiwuueSSiIjo2rVr5du73nvvvVrn8M23p638eu6551bZdtCgQavddtiwYVW2e/jhh+O73/1uFBcXV9nu5ptvrnU+ERGPPPJILF68OH7wgx/EwIEDY8yYMbFkyZJVtlu8eHEMHTo02rZtG8XFxXHUUUfFxx9/vNo5ffzxx3H66adH+/bto6ioKHbeeee455576jQfgMbI264A6qmioiLmzJkTKaWYPXt2jBgxIj7//PM4+eSTK7dJKcVRRx0VEyZMiDPOOCN22223eOqpp+KSSy6Jjz/+OG655ZZo2bJl3HfffbHffvvFFVdcEb/4xS8iImLw4MFRUVERZWVl0aRJk3rPb/r06RERscUWW1S7zfjx46N///6x7bbbxrBhw2Lx4sUxYsSI2G+//WLSpEmxzTbbxLHHHhvvvvtuPPDAA3HLLbdE27ZtIyKiXbt2dZrHIYccEqeeempERLzyyitx2223Vbtt27Zt45Zbbqn8/pRTTqly/YsvvhjHH3987LrrrnHjjTdGSUlJzJkzJy688MI6zSXin2+56tOnT3To0CEGDhwYl112WTz++OPxgx/8oMp2gwYNioceeihOOeWU2HvvveP555+PI444YpXxZs2aFXvvvXcUFBTEkCFDol27djF27Ng444wzYsGCBXHBBRfUeW4AjUYCoE7uvffeFBGrfBUVFaWysrIq2z766KMpItJ1111X5fLjjjsuFRQUpGnTplVedvnll6fCwsL0pz/9KY0ePTpFRLr11ltrnc+ECRNSRKR77rknffrpp2nGjBnpiSeeSNtss00qKChIr7zySkoppfLy8hQR6d57763cd7fddktbbrllmjt3buVlr7/+eiosLEynnnpq5WXDhw9PEZHKy8vrfD8tXbo0RUQaMmRI5WUrb9eECRNW2f6kk05KXbt2rXJZRKSrr7668vvLL788RUSaOXNm5WUrb9fw4cNrndOsWbNS06ZN03//93
9XXrbvvvumo48+usp2EydOTBGRLrjggiqXDxo0aJU5nXHGGaljx45pzpw5VbYdOHBgKikpSV988UWt8wJobLztCqCe7rjjjhg3blyMGzcu7r///ujTp0+ceeaZMWbMmMptnnzyyWjSpEkMHTq0yr4XX3xxpJSqfDrWsGHDYuedd47TTjstzj333DjggANW2a8mp59+erRr1y5KS0vjiCOOiEWLFsV9990X3/3ud1e7/cyZM2Py5MkxaNCg2HzzzSsv79GjRxxyyCHx5JNP1vlnr87KtzK1aNGiTtsvXbo0ioqKatxm4cKFUVhYuMYnvf/ud7+LwsLCGDBgQOVlJ554YowdOzY+++yzysv++Mc/RkTEueeeW2X/8847r8r3KaX4/e9/H0ceeWSklGLOnDmVX4cddlhUVFTEpEmT1miuAJsyb7sCqKe99tqryi/2J554Yuy+++4xZMiQ+N73vhfNmzeP999/P0pLS6O4uLjKvjvttFNERLz//vuVlzVv3jzuueee6NmzZ7Ro0SLuvffeKCgoqPN8rrrqqujdu3c0adIk2rZtGzvttNMq55983cqf3a1bt1Wu22mnneKpp56KRYsWxWabbVbnOXzdnDlzIiKipKSkTtvPnz8/WrVqVeM2++yzT9x+++1x/vnnx09/+tMoKSmpEg21uf/++2OvvfaKuXPnxty5cyMiYvfdd4+lS5fG6NGj4+yzz46If943hYWF0bVr1yr7b7/99lW+//TTT2P+/Plx9913x913373anzl79uw6zw+gsRAfAGupsLAw+vTpE7/85S9j6tSpsfPOO9d7jKeeeioi/vmqwdSpU1f55bcmu+yyywb1kbErT0jfZptt6rT9J598El26dKlxm4EDB8akSZNixIgR1f6yX52pU6fGK6+8EhERO+ywwyrXjxo1qjI+6mrl3wg5+eST47TTTlvtNj169KjXmACNgfgAWAeWL18eERGff/55RER06dIlxo8fHwsXLqzy6sc777xTef1Kb7zxRlx77bXxwx/+MCZPnhxnnnlmvPnmm3V+5aC+Vv7sKVOmrHLdO++8E23btq181aM+r8Cs9Oqrr0ZEVPu2r69btmxZTJs2Lfr161fjdoWFhXHzzTfHm2++GeXl5XHnnXfGrFmzqpzkX51Ro0ZFs2bN4n/+539WOYH/hRdeiNtuuy0++OCD6Ny5c3Tp0iVWrFgR5eXlVUJl2rRpVfZr165dFBcXx1dffbVBhR/Ahs45HwBradmyZfH0009H8+bNK99Wdfjhh8dXX30Vt99+e5Vtb7nlligoKIj+/ftX7jto0KAoLS2NX/7yl1FWVhazZs2q16c41VfHjh1jt912i/vuu6/KXy5/66234umnn47DDz+88rKVEVKfv3D+8MMPR7du3aJ79+61bvvYY4/F4sWLKz8euCYjRoyIZ599NkaNGhV9+/aN/fbbr07zGTVqVPTu3TtOOOGEOO6446p8rfwo4ZV/o+Wwww6LiIg777xzlZ/9dU2aNIkBAwbE73//+3jrrbdW+ZmffvppneYG0Nh45QOgnsaOHVv5Csbs2bPjt7/9bUydOjUuu+yyaN26dUREHHnkkdGnT5+44oor4r333otdd901nn766XjsscfiggsuiO222y4iIq677rqYPHlyPPPMM1FcXBw9evSIq666Kn72s5/FcccdVyUE1qXhw4dH//79Y5999okzzjij8qN2S0pKqvwtiz333DMiIq644ooYOHBgNGvWLI488sjVng/yj3/8I2666aZ4+eWX49hjj43777+/8rqVb3saN25cdO7cOTp06BBXX3113HnnnbHvvvvGoYceWuN8/+///i9++tOfxrBhw6Jnz551vp0vvfRSTJs2LYYMGbLa67faaqvYY489YtSoUXHppZfGnnvuGQMGDIhbb7015s6dW/lRu++++25EVH0l6MYbb4wJEyZEr1694qyzzopvf/vbMW/evJg0aVKMHz8+5s2bV+d5Aj
QaDfxpWwAbjdV91G6LFi3Sbrvtlu666660YsWKKtsvXLgwXXjhham0tDQ1a9Ys7bDDDmn48OGV202cODE1bdo0nXf
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Box plot для столбца 'Age'\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Age'])\n",
|
|||
|
"plt.title('Box Plot для Age')\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Теперь проверим на шум"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1YAAAIjCAYAAAAAxIqtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABKEUlEQVR4nO3dd3xUVf7/8fekhzQw1EAKTSIdERCQpiiwLEgRsEJAQBFFFleRdemyKFhYYZFFad8VG2wAFVmluksVKSKCSIko0iSQIiQkIef3B78MDGkz3CSTgdfz8cjjkdx75sxn5t5zk3fuvWdsxhgjAAAAAMB183J3AQAAAADg6QhWAAAAAGARwQoAAAAALCJYAQAAAIBFBCsAAAAAsIhgBQAAAAAWEawAAAAAwCKCFQAAAABYRLACAAAAAIsIVgBQCv3000+y2WxauHChu0tBCYqLi1NMTIy7y3BJ+/bt1b59++t6rM1m04QJE4q0npKS1xidMGGCbDabU49fuHChbDabfvrpp+IpEECJI1gBKFHfffedHnjgAUVHRysgIEBVq1bVvffeq5kzZxbbc77//vuaMWNGruXHjx/XhAkTtHv37mJ77mtt2LBBNpvN/uXr66saNWqof//+OnLkSJE8x+bNmzVhwgQlJSUVSX/u9vnnn8tmsykiIkLZ2dnuLseSmJgYh+1/9Vd6erq7yyvVUlJSNHHiRDVq1EjBwcEKDAxU/fr1NXr0aB0/ftzd5eXrb3/7m5YvX+7uMgCUAB93FwDg5rF582Z16NBBUVFRGjJkiCpXrqxffvlFW7du1d///nc988wzxfK877//vvbu3auRI0c6LD9+/LgmTpyomJgYNW7cuFieOz8jRoxQs2bNlJmZqZ07d2ru3LlauXKlvvvuO0VERFjqe/PmzZo4caLi4uJUtmzZoinYjRYvXqyYmBj99NNPWrdunTp27Ojukixp3LixnnvuuVzL/fz89M4773hcePzyyy+L/TmOHDmijh076ueff1afPn00dOhQ+fn5ac+ePZo3b56WLVumH3/8sdjrKMxf//pXvfjiiw7L/va3v+mBBx5Qjx49HJY/9thjevDBB+Xv71+CFQIoTgQrACVmypQpCgsL0/bt23P9wX/69Gn3FFUMzp8/r6CgoALbtGnTRg888IAkaeDAgbr11ls1YsQILVq0SGPGjCmJMj3C+fPntWLFCk2dOlULFizQ4sWLPT5YVa1aVY8++mie67y8PO9CEj8/v2LtPysrS7169dKpU6e0YcMG3XXXXQ7rp0yZoldffbVYa3CWj4+PfHyc+9PK29tb3t7exVwRgJLkeUdwAB7r8OHDqlevXp5nUSpWrJhr2XvvvafmzZurTJkyKleunNq2bevw3/EVK1aoa9euioiIkL+/v2rWrKnJkyfr0qVL9jbt27fXypUrdfToUfslVzExMdqwYYOaNWsm6XKwyVl39f0S27ZtU+fOnRUWFqYyZcqoXbt22rRpk0ONOfdU7Nu3Tw8//LDKlSuX6w8/Z9x9992SpISEhALbrVu3Tm3atFFQUJDKli2r+++/X/v373eo5/nnn5ckVa9e3f66nLmPI79L1DZs2JCrbVxcXJ5tr71fZunSpbrjjjsUEhLi0O61114rtB5JWrZsmdLS0tSnTx89+OCDio+Pz/OSubS0NI0YMULly5dXSEiIunfvrl9//TXPmn799VcNGjRIlSpVkr+/v+rVq6f58+c7VU9e/U2fPl02m+267zO62rX3WOXcx/Paa69p7ty5qlmzpvz9/dWsWTNt377d4bF79uxRXFycatSooYCAAFWuXFmDBg1SYmKiQ7ucffbQoUP2s5phYWEaOHCgLly4kKumwsbhtfdYZWRkaNy4cWratKnCwsIUFBSkNm3aaP369df1nvz73//Wt99+q5deeinPsRUaGqopU6Y4LFuyZImaNm2qwMBAlS9fXo8++qh+/fVXhzZxcXEKDg7Wr7
/+qh49eig4OFgVKlTQn//8Z4djiCQlJSUpLi5OYWFhKlu2rAYMGJDnpbbX3mNls9l0/vx5LVq0yL7vx8XFScr/HqvZs2erXr168vf3V0REhIYPH57rudq3b6/69etr37596tChg8qUKaOqVatq2rRphbybAIoTZ6wAlJjo6Ght2bJFe/fuVf369QtsO3HiRE2YMEGtWrXSpEmT5Ofnp23btmndunW67777JF3+wyQ4OFijRo1ScHCw1q1bp3HjxiklJUXTp0+XJL300ktKTk7WsWPH9Oabb0qSgoODddttt2nSpEkaN26chg4dqjZt2kiSWrVqJelygOnSpYuaNm2q8ePHy8vLSwsWLNDdd9+t//3vf2revLlDvX369FHt2rX1t7/9TcYYl9+bw4cPS5LCw8PzbbNmzRp16dJFNWrU0IQJE5SWlqaZM2eqdevW2rlzp2JiYtSrVy/9+OOP+uCDD/Tmm2+qfPnykqQKFSo4Vce9996r/v37S5K2b9+ut956K9+25cuXt7+n0uVLm662ZcsW9e3bV40aNdIrr7yisLAwnTlzRn/605+cqkW6fBlghw4dVLlyZT344IN68cUX9emnn6pPnz4O7eLi4vTxxx/rscce05133qmvvvpKXbt2zdXfqVOndOedd8pms+npp59WhQoVtGrVKj3++ONKSUnJdbloYZKSkjR16lSXHpOZmakzZ844LCtTpozKlCmT72Pef/99paam6oknnpDNZtO0adPUq1cvHTlyRL6+vpKk1atX68iRIxo4cKAqV66s77//XnPnztX333+vrVu35ppUoW/fvqpevbqmTp2qnTt36t1331XFihUdzv44Mw6vlZKSonfffVcPPfSQhgwZotTUVM2bN0+dOnXS119/7fJlt5988omk3PtXfhYuXKiBAweqWbNmmjp1qk6dOqW///3v2rRpk3bt2uXwj51Lly6pU6dOatGihV577TWtWbNGr7/+umrWrKlhw4ZJkowxuv/++7Vx40Y9+eSTuu2227Rs2TINGDCg0Fr+9a9/afDgwWrevLmGDh0qSapZs2a+7SdMmKCJEyeqY8eOGjZsmA4cOKC3335b27dv16ZNm+zbWpLOnTunzp07q1evXurbt6+WLl2q0aNHq0GDBurSpYtT7xWAImYAoIR8+eWXxtvb23h7e5uWLVuaF154wXzxxRcmIyPDod3BgweNl5eX6dmzp7l06ZLDuuzsbPv3Fy5cyPUcTzzxhClTpoxJT0+3L+vatauJjo7O1Xb79u1GklmwYEGu56hdu7bp1KlTruerXr26uffee+3Lxo8fbySZhx56yKn3YP369UaSmT9/vvntt9/M8ePHzcqVK01MTIyx2Wxm+/btxhhjEhISctXWuHFjU7FiRZOYmGhf9u233xovLy/Tv39/+7Lp06cbSSYhIcGpmowxJiMjw0gyTz/9tH3ZkiVLjCSzfv36XO0feeQRU716dYdlksz48ePtP48ZM8ZIMidOnLAvy3ld06dPL7SmU6dOGR8fH/POO+/Yl7Vq1crcf//9Du127NhhJJmRI0c6LI+Li8tV0+OPP26qVKlizpw549D2wQcfNGFhYXnuUwW9xhdeeMFUrFjRNG3a1LRr167Q1xQdHW0k5frK6XPAgAEO+2rO+xUeHm7Onj1rX75ixQojyXz66af2ZXnV/sEHHxhJ5r///a99Wc4+O2jQIIe2PXv2NOHh4fafnR2H7dq1c3jtWVlZ5uLFiw7tz507ZypVqpTrOa99P/PSpEkTExYWVmCbHBkZGaZixYqmfv36Ji0tzb78s88+M5LMuHHj7MsGDBhgJJlJkybler6mTZvaf16+fLmRZKZNm+bwGtu0aZNrjOa8t1cLCgoyAwYMyFXrggULHMbp6dOnjZ+fn7nvvvsc3u9Zs2bZjxk52rVrZySZ//u//7Mvu3jxoqlcubLp3bt3Ie8SgOLCpYAASsy9996rLVu2qHv37vr22281bdo0derUSV
WrVrX/V1qSli9fruzsbI0bNy7XPSdX/9c9MDDQ/n1qaqrOnDmjNm3a6MKFC/rhhx+uu87du3fr4MGDevjhh5WYmKg
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Scatter plot для столбцов 'Age' и 'Financial Condition'\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.scatterplot(x='Age', y='Financial Condition', data=df)\n",
|
|||
|
"plt.title('Scatter Plot для Age и Financial Condition')\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.ylabel('Financial Condition')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Удаление строк с пустыми значениями"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 64,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df_cleaned = df.dropna()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 65,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 723\n",
|
|||
|
"Размер контрольной выборки: 241\n",
|
|||
|
"Размер тестовой выборки: 241\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)  # split the NaN-free frame produced by df.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Применение методов приращения данных (аугментации)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 66,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Gender в обучающей выборке после oversampling:\n",
|
|||
|
"Gender\n",
|
|||
|
"Male 397\n",
|
|||
|
"Female 397\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в контрольной выборке после oversampling:\n",
|
|||
|
"Gender\n",
|
|||
|
"Male 140\n",
|
|||
|
"Female 140\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в тестовой выборке после oversampling:\n",
|
|||
|
"Gender\n",
|
|||
|
"Female 126\n",
|
|||
|
"Male 126\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в обучающей выборке после undersampling:\n",
|
|||
|
"Gender\n",
|
|||
|
"Female 326\n",
|
|||
|
"Male 326\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в контрольной выборке после undersampling:\n",
|
|||
|
"Gender\n",
|
|||
|
"Female 101\n",
|
|||
|
"Male 101\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в тестовой выборке после undersampling:\n",
|
|||
|
"Gender\n",
|
|||
|
"Female 115\n",
|
|||
|
"Male 115\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)  # split the NaN-free frame produced by df.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Gender'].value_counts()\n",
|
|||
|
" print(f\"Распределение Gender в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop('Gender', axis=1)\n",
|
|||
|
" y = df['Gender']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")\n",
|
|||
|
"\n",
|
|||
|
"def undersample(df):\n",
|
|||
|
" X = df.drop('Gender', axis=1)\n",
|
|||
|
" y = df['Gender']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|