AIM-PIbd-31-Yaruskin-S-A/lab_2/laba2.ipynb

2113 lines
986 KiB
Plaintext
Raw Permalink Normal View History

2024-10-10 15:58:16 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Продажи домов\n",
"2. Данные о населении \n",
"3. Набор данных для анализа и прогнозирования сердечного приступа"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h4>Продажа домов</h4>"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 262,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',\n",
" 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',\n",
" 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',\n",
" 'sqft_living15', 'sqft_lot15'],\n",
" dtype='object') \n",
"\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"df = pd.read_csv(\"..//static//csv//House.csv\", index_col=\"id\")\n",
"\n",
"print(df.columns, \"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы для русских:<br>\n",
"<br>\n",
"<span style=\"color: #189FFF;\">id:</span> Идентификатор объекта<br>\n",
"<span style=\"color: #189FFF;\">date:</span> Дата продажи<br>\n",
"<span style=\"color: #189FFF;\">price:</span> Цена недвижимости<br>\n",
"<span style=\"color: #189FFF;\">bedrooms:</span> Количество спален<br>\n",
"<span style=\"color: #189FFF;\">bathrooms:</span> Количество ванных комнат<br>\n",
"<span style=\"color: #189FFF;\">sqft_living:</span> Жилая площадь<br>\n",
"<span style=\"color: #189FFF;\">sqft_lot:</span> Площадь участка<br>\n",
"<span style=\"color: #189FFF;\">floors:</span> Количество этажей<br>\n",
"<span style=\"color: #189FFF;\">waterfront:</span> Признак наличия вида на водоем<br>\n",
"<span style=\"color: #189FFF;\">view:</span> Оценка вида<br>\n",
"<span style=\"color: #189FFF;\">condition:</span> Состояние дома<br>\n",
"<span style=\"color: #189FFF;\">grade:</span> Оценка конструкции<br>\n",
"<span style=\"color: #189FFF;\">sqft_above:</span> Площадь надземных помещений<br>\n",
"<span style=\"color: #189FFF;\">sqft_basement:</span> Площадь подвала<br>\n",
"<span style=\"color: #189FFF;\">yr_built:</span> Год постройки<br>\n",
"<span style=\"color: #189FFF;\">yr_renovated:</span> Год последнего ремонта<br>\n",
"<span style=\"color: #189FFF;\">zipcode:</span> Почтовый индекс<br>\n",
"<span style=\"color: #189FFF;\">lat:</span> Широта<br>\n",
"<span style=\"color: #189FFF;\">long:</span> Долгота<br>\n",
"<span style=\"color: #189FFF;\">sqft_living15:</span> Жилая площадь соседних домов<br>\n",
"<span style=\"color: #189FFF;\">sqft_lot15:</span> Площадь участка соседних домов<br>\n",
"<br>\n",
"Проблемная область: Прогнозирование стоимости недвижимости в зависимости от характеристик дома.<br>"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 263,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method DataFrame.info of date price bedrooms bathrooms sqft_living \\\n",
"id \n",
"7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"... ... ... ... ... ... \n",
"263000018 20140521T000000 360000.0 3 2.50 1530 \n",
"6600060120 20150223T000000 400000.0 4 2.50 2310 \n",
"1523300141 20140623T000000 402101.0 2 0.75 1020 \n",
"291310100 20150116T000000 400000.0 3 2.50 1600 \n",
"1523300157 20141015T000000 325000.0 2 0.75 1020 \n",
"\n",
" sqft_lot floors waterfront view condition grade sqft_above \\\n",
"id \n",
"7129300520 5650 1.0 0 0 3 7 1180 \n",
"6414100192 7242 2.0 0 0 3 7 2170 \n",
"5631500400 10000 1.0 0 0 3 6 770 \n",
"2487200875 5000 1.0 0 0 5 7 1050 \n",
"1954400510 8080 1.0 0 0 3 8 1680 \n",
"... ... ... ... ... ... ... ... \n",
"263000018 1131 3.0 0 0 3 8 1530 \n",
"6600060120 5813 2.0 0 0 3 8 2310 \n",
"1523300141 1350 2.0 0 0 3 7 1020 \n",
"291310100 2388 2.0 0 0 3 8 1600 \n",
"1523300157 1076 2.0 0 0 3 7 1020 \n",
"\n",
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
"id \n",
"7129300520 0 1955 0 98178 47.5112 -122.257 \n",
"6414100192 400 1951 1991 98125 47.7210 -122.319 \n",
"5631500400 0 1933 0 98028 47.7379 -122.233 \n",
"2487200875 910 1965 0 98136 47.5208 -122.393 \n",
"1954400510 0 1987 0 98074 47.6168 -122.045 \n",
"... ... ... ... ... ... ... \n",
"263000018 0 2009 0 98103 47.6993 -122.346 \n",
"6600060120 0 2014 0 98146 47.5107 -122.362 \n",
"1523300141 0 2009 0 98144 47.5944 -122.299 \n",
"291310100 0 2004 0 98027 47.5345 -122.069 \n",
"1523300157 0 2008 0 98144 47.5941 -122.299 \n",
"\n",
" sqft_living15 sqft_lot15 \n",
"id \n",
"7129300520 1340 5650 \n",
"6414100192 1690 7639 \n",
"5631500400 2720 8062 \n",
"2487200875 1360 5000 \n",
"1954400510 1800 7503 \n",
"... ... ... \n",
"263000018 1530 1509 \n",
"6600060120 1830 7200 \n",
"1523300141 1020 2007 \n",
"291310100 1410 1287 \n",
"1523300157 1020 1357 \n",
"\n",
"[21613 rows x 20 columns]> \n",
"\n"
]
}
],
"source": [
"print(df.info, \"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Объектом наблюдения является - Недвижимость<br>\n",
"Атрибуты - содержит набор информации о продаже дома, такие как:<br>\n",
"цену продажи, дата продажи, количество спален, ванных комнат, общую площадь дома, площадь участка, местоположение.\n"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 264,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-10-12 02:16:37 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAwoAAAIjCAYAAACj7OxrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxcd33v/9c5sy+a0S5Ltmx5353NWZyFQPYQdgi3TVoClNvS0gKF3l7S+7uQ3Jat5Rba3tuUUpqEC2EnrIEskN1ZnMR27MSbbFmWJVm7NPt2zvn9MbJiRXIs2yOPZL2fPM6DaGbOme98JUvfz/l+v5+P4TiOg4iIiIiIyDHMcjdARERERERmHgUKIiIiIiIygQIFERERERGZQIGCiIiIiIhMoEBBREREREQmUKAgIiIiIiITKFAQEREREZEJFCiIiIiIiMgEChRERERERGQCBQoiIiIiIjKBAgUREeCee+7BMAxeeOGFSZ9/85vfzLp1685wq6bfXXfdxc0338zChQsxDIMPfvCD5W6SiIjMEO5yN0BERMrny1/+MvF4nIsuuoju7u5yN0dERGYQBQoiInPY448/PjabEA6Hy90cERGZQbT0SETkFBUKBf72b/+WpUuX4vP5aGlp4W/+5m/IZrPjXtfS0oJhGHzyk5+ccI3rr78ewzB429veNu7xbDbL5z73OZYtW4bP56O5uZm//uu/nnBtwzD48z//c77zne+wcuVK/H4/F1xwAU888cSUPsOiRYswDOPkPriIiMwJmlEQETnGyMgI/f39Ex7P5/MTHvvIRz7Cvffey/ve9z4+/elP89xzz/HFL36RXbt2cf/99497rd/v5zvf+Q7/8A//gMfjAeDw4cP89re/xe/3j3utbdu84x3v4KmnnuKP//iPWb16NTt27OCrX/0qe/fu5ac//em41z/++ON8//vf5+Mf/zg+n49//dd/5YYbbuD5558/K/dViIjImaFAQUTkGNdcc81xn1u7du3Yf2/fvp17772Xj3zkI3zjG98A4M/+7M+or6/nK1/5Co8++ihvectbxl5/xRVXsHXrVn7+85/z3ve+FyhuoL744ovp7Owc9z733XcfjzzyCI8//jiXX3752OPr1q3jox/9KJs3b+bSSy8de3znzp288MILXHDBBQD83u/9HitXruSzn/0sP/nJT06jN0REZC7T0iMRkWP83//7f3n44YcnHBs2bBj3ugceeACAT33qU+Me//SnPw3Ar371q3GPe71ebr31Vu6+++6xx+655x4+9KEPTWjDD3/4Q1avXs2qVavo7+8fO6666ioAHn300XGv37Rp01iQALBw4ULe+c538uCDD2JZ1sl2gYiICHAWBQpPPPEEb3/722lqasIwjAlT81PhOA5f+cpXWLFiBT6fj/nz5/P5z3++9I0VkRnroosu4pprrplwVFVVjXtde3s7pmmybNmycY/PmzePyspK2tvbJ1z7Qx/6EL/5zW/o7u7m8ccfp7u7m/e///0TXrdv3z5eeeUV6urqxh0rVqwAoLe3d9zrly9fPuEaK1asIJVK0dfXd9J9ICIyF2jseGJnzdKjZDLJOeecw4c//GHe8573nNI1PvGJT/DQQw/xla98hfXr1zM4OMjg4GCJWyoiZ5OT2Qh8zjnncM455/Ctb32LXbt28d73vpdIJDLhdbZts379ev7xH/9x0us0NzefcntFRKRIY8cTO2sChRtvvJEbb7zxuM9ns1n+x//4H3z3u99leHiYdevW8eUvf5k3v/nNAOzatYu77rqLnTt3snLlSgAWL158JpouIrPQokWLsG2bffv2sXr16rHHe3p6GB4eZtGiRZOe9+EPf5ivfvWrHDlyhF/84heTvmbp0qVs376dq6++ekqByL59+yY8tnfvXoLBIHV1dVP8RCIic4vGjid21iw9OpE///M/55lnnuF73/seL7/8MjfffDM33HDD2B/YX/ziFyxZso
Rf/vKXLF68mJaWFj7ykY+cVVGhiJTOW9/6VgC+9rWvjXv86CzATTfdNOl5t9xyC52dndTX14/9sXm997///XR2do5tkj5WOp0mmUyOe+yZZ57hpZdeGvu6o6ODn/3sZ1x33XW4XK6pfiQRETmGxo5n0YzCGzl06BB33303hw4doqmpCYC/+qu/4je/+Q133303X/jCFzhw4ADt7e388Ic/5Fvf+haWZfGXf/mXvO997+N3v/tdmT+BiMw055xzDrfddhv//u//zvDwMFdeeSXPP/889957L+9617vGZTw6VlVVFd3d3bhcruPOFvzhH/4hP/jBD/joRz/Ko48+ymWXXYZlWezevZsf/OAHPPjgg2zcuHHs9evWreP6668flx4V4M477zzh5/jFL37B9u3bgWIK2Jdffpm/+7u/A+Ad73jHhE3cIiJzgcaORXMiUNixYweWZY1tBDwqm81SU1MDFNcEZ7NZvvWtb4297pvf/CYXXHABe/bsGZtSEhE56j/+4z9YsmQJ99xzD/fffz/z5s3j9ttv53Of+9wbnldZWfmGz5umyU9/+lO++tWv8q1vfYv777+fYDDIkiVL+MQnPjHhd9mVV17Jpk2buPPOOzl06BBr1qzhnnvumdIg/8c//jH33nvv2Ndbt25l69atACxYsECBgojMSRo7Fs2JQCGRSOByuXjxxRcnTMOHw2EAGhsbcbvd434gjq47PnTo0FnxzRaR4/vgBz/IBz/4weM+/9hjj014zO1289nPfpbPfvazb3jtgwcPnvTzHo+Hv/7rv+av//qv3/Dco2699VZuvfXWKb32WPfccw/33HPPSZ8nInI209ixaE4ECueddx6WZdHb28sVV1wx6Wsuu+wyCoUC+/fvZ+nSpUBxMyBw3E2JIiIiInL20dix6KwJFBKJBK2trWNft7W1sW3bNqqrq1mxYgW33norH/jAB/jf//t/c95559HX18dvf/tbNmzYwE033cQ111zD+eefz4c//GG+9rWvYds2H/vYx7j22msnTDuJiIiIyOymseMUOGeJRx991AEmHLfddpvjOI6Ty+Wcz372s05LS4vj8XicxsZG593vfrfz8ssvj12js7PTec973uOEw2GnoaHB+eAHP+gMDAyU6ROJiJwY4HzsYx8rdzNERGYdjR1PzHAcxylTjCIiIiIiIjPUnKmjICIiIiIiU6dAQUREREREJpjVm5lt26arq4uKiorjFi4SERERkfJxHId4PE5TUxOmOfPuUWcyGXK53LRc2+v14vf7p+XaZ0JZAwXLsrjjjjv49re/zZEjR2hqauKDH/wg/9//9/9NaeDf1dVFc3PzGWipiIiIiJyOjo4OFixYUO5mjJPJZFjcUseRnsS0XH/evHm0tbXN2mChrIHCl7/8Ze666y7uvfde1q5dywsvvMCHPvQhotEoH//4x094fkVFBVD8wYtEIif9/vl8noceeojrrrsOj8dz0ufLROrT0lOflp76tPTUp6WnPi099en0OFG/xmIxmpubx8ZtM0kul+NIT4L2Vz5OpMJX0mvH4lkWrf1ncrmcAoVTsXnzZt75zndy0003AdDS0sJ3v/tdnn/++Smdf3TWIRKJnHKgEAwGiUQi+oVRIurT0lOflp76tPTUp6WnPi099en0mGq/zuRl4hUVXioi3pJe02H2JxYta6Bw6aWX8u///u/s3buXFStWsH37dp566in+8R//cdLXZ7NZstns2NexWAwo/oDm8/mTfv+j55zKuTI59WnpqU9LT31aeurT0lOflp76dHqcqF9nQ3/bONglHtiX+nrlUNY6CrZt8zd/8zf8/d//PS6XC8uy+PznP8/tt98+6evvuOMO7rzzzgmP33fffQSDweluroiIiIicpFQqxS233MLIyMgprQCZTrFYjGg0Sv+hTxOJlHjpUSxL7cL/PSM/91SVdUbhBz/4Ad/5zne47777WLt2Ldu2beOTn/wkTU1N3HbbbRNef/vtt/
OpT31q7Ouja96uu+66U1569PDDD3PttddqCrJE1Kelpz4tPfVp6alPS099Wnrq0+lxon49ugJkJnNG/1fqa852ZQ0
2024-10-10 15:58:16 +04:00
"text/plain": [
"<Figure size 1000x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"\n",
"plt.scatter(df['sqft_living'], df['price'], c=df['price'], alpha=0.6)\n",
"plt.colorbar(label='Price')\n",
"\n",
"plt.title(\"Номер 1\")\n",
"plt.ylabel(\"Price\")\n",
"plt.xlabel(\"Living Area\")\n",
"plt.grid(visible='true')\n",
"\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 265,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-10-12 02:16:37 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/UAAAIjCAYAAABCqzt/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC7OElEQVR4nOzdd3zbd50/8NdXkiVLsi1vx3HseGTPtmnSpoF0t7RQCmEEytGyypVj9IA7rimUHgc04QelhQN60CvrrqXhIC3QFbp3kzR7NsN2PCInnrIty7ItfX9/fL+fr4a1hyXZr+fj0QdYlpSv41jW+/NekizLMoiIiIiIiIgo5+gyfQFERERERERElBgG9UREREREREQ5ikE9ERERERERUY5iUE9ERERERESUoxjUExEREREREeUoBvVEREREREREOYpBPREREREREVGOYlBPRERERERElKMY1BMRERERERHlKAb1RERERERERDmKQT0REVGW++1vfwtJkvD222+H/Pxll12GZcuWTfFVpVd7ezu+853vYM2aNSgpKUF5eTkuu+wyPPfcc5m+NCIioqzCoJ6IiIiyzl/+8hf84Ac/wLx58/C9730Pd911F4aGhnD11VfjN7/5TaYvj4iIKGsYMn0BRERERMEuv/xytLW1oby8XLvttttuw3nnnYdvf/vb+PSnP53BqyMiIsoezNQTERFNQxMTE/jud7+LpqYmmEwm1NfX484774Tb7Q64X319PSRJwj//8z9Peo5rr70WkiThfe97X8Dtbrcbd999N+bNmweTyYTa2lp84xvfmPTckiThS1/6Eh5++GEsXLgQ+fn5WLVqFV555ZWo17906dKAgB4ATCYTrr/+enR0dGBoaCjGvwkiIqLpjZl6IiKiHOFwONDT0zPp9vHx8Um3fe5zn8Pvfvc7fPjDH8bXv/517NixA5s3b8bRo0fx2GOPBdw3Pz8fDz/8MH74wx8iLy8PANDR0YHnn38e+fn5Aff1er14//vfj9deew2f//znsXjxYhw8eBD33Xcfjh8/jscffzzg/i+//DK2bt2Kr3zlKzCZTPjFL36B97znPdi5c2dCcwC6urpgsVhgsVjifiwREdF0xKCeiIgoR1x11VVhP7d06VLt/+/fvx+/+93v8LnPfQ4PPvggAOCf/umfUFlZiR/96Ed48cUXcfnll2v3f/e73429e/fir3/9Kz70oQ8BUIbzXXTRRejs7Az4cx555BE899xzePnll/Gud71Lu33ZsmW47bbb8MYbb+CSSy7Rbj906BDefvttrFq1CgDwsY99DAsXLsS3v/1tbNu2La6v/+TJk9i2bRs+8pGPQK/Xx/VYIiKi6Yrl90RERDni5z//OZ599tlJ/61YsSLgfk899RQA4Gtf+1rA7V//+tcBAE8++WTA7UajEZ/4xCcCBtD99re/Ddm3/n//939YvHgxFi1ahJ6eHu2/K664AgDw4osvBtx/7dq1WkAPAHV1dbjxxhuxfft2eDyemL/2kZERfOQjH4HZbMaWLVtifhwREdF0x0w9ERFRjlizZg0uvPDCSbeXlJQElOWfPn0aOp0O8+bNC7jfrFmzUFxcjNOnT096jk9/+tNYtWoV7HY7jh8/Drvdjo9+9KP43ve+F3C/EydO4OjRo6ioqAh5jefOnQv4eP78+ZPus2DBAoyMjKC7uxuzZs0K/wWrPB4PPvaxj+HIkSN4+umnMXv27KiPISIimikY1BMREU1TkiTFfN+VK1di5cqV+P3vf4+jR4/iQx/6EIqKiibdz+v1Yvny5fjxj38c8nlqa2sTvt5wbr31VjzxxBN4+OGHtYoAIiIiUjCoJyIimmbmzp0Lr9eLEydOYPHixdrtZ8+excDAAObOnRvycZ/5zGdw3333oaurC3/7299C3qepqQn79+/HlVdeGdOhwYkTJybddvz4cVgslrDZfn//+q//it/85je4//778fGPfzzq/YmIiG
Ya9tQTERFNM9dffz0A4P777w+4XWTX3/ve94Z83E033YTOzk5UVlbisssuC3mfj370o+js7NQG8PlzuVxwOp0Bt7355pvYs2eP9nF7ezv+8pe/4Jprrok67O6HP/whfvSjH+HOO+/E7bffHvG+REREMxUz9URERNPMypUrccstt+BXv/oVBgYGcOmll2Lnzp343e9+hw984AMBk+/9lZSUwG63Q6/Xh83Cf/KTn8Qf//hH3HbbbXjxxRexbt06eDweHDt2DH/84x+xffv2gL7/ZcuW4dprrw1YaQcA3/nOdyJ+DY899hi+8Y1vYP78+Vi8eDH+93//N+DzV199NaqqquL5ayEiIpqWGNQTERFNQ//93/+NxsZG/Pa3v8Vjjz2GWbNmYdOmTbj77rsjPq64uDji53U6HR5//HHcd999+P3vf4/HHnsMFosFjY2NuP3227FgwYKA+1966aVYu3YtvvOd76CtrQ1LlizBb3/720kT+4Pt378fgFK+/8lPfnLS51988UUG9URERAAkWZblTF8EERERTT+SJOGLX/wifvazn2X6UoiIiKYt9tQTERERERER5SgG9UREREREREQ5ikE9ERERERERUY7ioDwiIiJKC47tISIiSj9m6omIiIiIiIhyFIN6IiIiIiIiohw148rvvV4vzpw5g8LCQkiSlOnLISIiIiIiomlOlmUMDQ1h9uzZ0OlSm1ufcUH9mTNnUFtbm+nLICIiIiIiohmmvb0dc+bMSelzzrigvrCwEIDyl1lUVJThqyEiIiIiIqLpbnBwELW1tVo8mkozLqgXJfdFRUUM6omIiIiIiGjKpKMFnIPyiIiIiIiIiHIUg3oiIiIiIiKiHMWgnoiIiIiIiChHMagnIiIiIiIiylEM6omIiIiIiIhyFIN6IiIiIiIiohzFoJ6IiIiIiIgoRzGoJyIiIiIiIspRDOqJiIiIiIiIchSDeiIiIiIiIqIcxaCeiIiIiIiIKEcxqCciIiIiIiLKUQzqiYiIiIiIiHIUg3oiIiIiIiKiHMWgnoiIiIiIiChHMagnyiJ2hwtvnOqB3eHK9KUQEREREVEOMGT6AohI8fBbp/GtvxyCLAM6Cdi8YTk2rq7L9GUREREREVEWY6aeKAvsPt2Hbz6uBPQA4JWBO7cdYsaeiIiIiIgiYlBPlGF/P9yFT/73jkm3e2QZrT0jGbgiIiIiIiLKFSy/J8qQsQkvfvDMMTz0WkvIz+slCfXllim+KiIiIiIiyiUM6ommkN3hQkuPE/kGHf7jiaPY1z4AAPjcuxrQM+zG4/vOAFAC+ns2LEO1zZzBqyUiIiIiomzHoJ5oimzd1YZN2w7CK/tuK8o34EcfWYlrls7CEwfO4PF9Z7B4ViF+/enVDOiJiIiIiCgqBvVEU8DucE0K6AHgN59ejVVzSwEAVqPy42jQ6xjQExERERFRTDgoj2gKtPQ4JwX0ADA24bvRYtQDAJxjE1N1WURERERElOMY1BNNgYZyK3RS4G3Bg/CsJiVT73QzqCciIiIiotgwqCeaAtU2M77z/mXaxzoJkwbhiUz9iNsz5ddHRERERES5iUE90RS5cnElACWgf+3fLsfG1XUBny8QmfqxCchyiFp9IiIiIiKiIAzqiaZIn3MMAFBWYMLs4sn75y1qUO+VAfeEd0qvjYiIiIiIchODeqIpMjAyDgAoseSF/Lw5T6/9f/bVExERERFRLBjUE02R/hElU19sMYb8vF4naYH9yBj76omIiIiIKDoG9URTZEAN6kvDBPUAYDUpQf0wM/VERERERBQDBvVEU6TPqZbfW0OX3wOAxaj01Y9wVz0REREREcWAQT3RFIlWfg/476pn+T0REREREUXHoJ5oisRUfi921TNTT0REREREMWBQTzRF+tTp98Vhpt8DvrV2zNQTEREREVEsGNQTTRGRqS9hpp6IiIiIiFKEQT3RFBE99SXW8EG9GJQ3zEw9ERERERHFgEE90RTpF9PvI5Tfi5
V2zNQTEREREVEsGNQTTYGxCa+2ez5i+T176omIiIiIKA4M6ommwIBLKb3XSUCROUKmnj31REREREQUBwb1RFNAlN7
2024-10-10 15:58:16 +04:00
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"year_condition = df.groupby('yr_built')['condition'].mean().reset_index()\n",
"\n",
"plt.figure(figsize=(12, 6))\n",
"\n",
"plt.plot(year_condition['yr_built'], year_condition['condition'], marker='.')\n",
"\n",
"plt.title(\"Номер 2\")\n",
"plt.xlabel(\"Year Built\")\n",
"plt.ylabel(\"Condition\")\n",
"\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Связь между объектами есть. Цена связана почти со всеми характиристиками дома.<br> Например на графике номер один показана зависимоость между ценой и размером дома.<br> А на графике номер 2 показа зависимость состояния домов с годами."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Примеры бизнес целей</h3>\n",
"\n",
"1. Прогнозирование стоимости недвижимости на основе характиристик дома.\n",
"2. Наблюдение за изменениями характиристик дома с годами.\n",
"\n",
"Эффект для бизнеса: Оценка и оптимизация цен, Оценка и планирование затрат, выявление тенденции на рынке, стратегия планирования."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Цели технического проекта</h3>\n",
"<ul>Для первой цели:</ul>\n",
" <li>Вход: Характеристики недвижимости</li>\n",
" <li>Целевой признак: Цена.</li>\n",
"<ul>Для второй цели:</ul>\n",
" <li>Вход: оценка конструкции, Состояние дома</li>\n",
" <li>Целевой признак: Год постройки</li>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Код ниже нужен для определения проблем данных</h3>"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 266,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"date 20150527T000000\n",
"price 7700000.0\n",
"bedrooms 33\n",
"bathrooms 8.0\n",
"sqft_living 13540\n",
"sqft_lot 1651359\n",
"floors 3.5\n",
"waterfront 1\n",
"view 4\n",
"condition 5\n",
"grade 13\n",
"sqft_above 9410\n",
"sqft_basement 4820\n",
"yr_built 2015\n",
"yr_renovated 2015\n",
"zipcode 98199\n",
"lat 47.7776\n",
"long -121.315\n",
"sqft_living15 6210\n",
"sqft_lot15 871200\n",
"dtype: object \n",
"\n",
"Index(['bedrooms', 'bathrooms', 'waterfront', 'view', 'sqft_basement',\n",
" 'yr_renovated'],\n",
" dtype='object') \n",
"\n",
"Признаки с низкой дисперсией:\n",
" waterfront 0.007485\n",
"lat 0.019200\n",
"long 0.019833\n",
"dtype: float64 \n",
"\n",
"id\n",
"7129300520 1955\n",
"6414100192 1951\n",
"5631500400 1933\n",
"2487200875 1965\n",
"1954400510 1987\n",
" ... \n",
"263000018 2009\n",
"6600060120 2014\n",
"1523300141 2009\n",
"291310100 2004\n",
"1523300157 2008\n",
"Name: yr_built, Length: 21613, dtype: int64\n"
]
}
],
"source": [
"max_value = df.max(axis=0)\n",
"\n",
"columns_with_zero = df.columns[(df == 0).any()]\n",
"\n",
"numeric_data = df.select_dtypes(include='number')\n",
"shum = numeric_data.var()\n",
"low_dispers = 0.1\n",
"low_var_columns = shum[shum < low_dispers]\n",
"\n",
"\n",
"year = df['yr_built']\n",
"\n",
"print(max_value, \"\\n\")\n",
"print(columns_with_zero, \"\\n\")\n",
"print(\"Признаки с низкой дисперсией:\\n\", low_var_columns, \"\\n\")\n",
"print(year)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<ul><h4>Из полученных данных выяснилось:</h4></ul>\n",
" <li>признаки bedrooms, bathrooms, waterfront, view, sqft_basement и yr_renovated имеют в себе нулевые поля, что может создать смещение если искать по этим полям</li>\n",
" <li>признаки bedrooms, bathrooms и price имеют аномально высокие значения, и это указывает на наличие выбросов</li>\n",
" <li>признаки waterfront, view, condition имеют низкие значения дисперсии, что может привести к снижению значимости этих признаков</li>\n",
" <li>признак yr_built варьируется от 1900 до 2015. Это может быть актуальной информацией для анализа старых зданий, но актуальность данных по ремонту и реконструкции (это призгак yr_renovated) может быть ниже, так как 0 указывает на отсутствие ремонта</li>\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<ol><h3>Примеры решения проблем для набора данных</h3></ol>\n",
" <li>Удаление выбросов на основе значения или bathrooms > 5</li>\n",
" <li>Замена 0 на год постройки (это признак yr_built), если дом не подвергался ремонту</li>\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Оценка качества данных</h3>\n",
"1. Информативность. Набор данных предоставляет достаточную информацию для анализа цен на недвижимость.\n",
"2. Степень покрытия. Набор данных затрагивает только один райно, не включая информацию о других райнов.\n",
"3. Соответствие реальным данным. Данные вполне кажутся реальными, не считая некоторых редких выбросов.\n",
"4. Согласованность меток. Метки состояние и оценка вида, имеют четкие значения."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Разбиение данных на обучающую, контрольную и тестовую выборки</h3>"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 267,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Исходный размер строк: 21613 строк\n",
"Размер обучающей выборки: 15129 строк\n",
"Размер валидационной выборки: 3242 строк\n",
"Размер тестовой выборки: 3242 строк\n"
]
}
],
"source": [
"df_numeric = df.select_dtypes(include='number')\n",
"\n",
"x = df_numeric.drop(['price'], axis=1)\n",
"y = df_numeric['price']\n",
"\n",
"x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=14)\n",
"\n",
"x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=14)\n",
"\n",
"print(f\"Исходный размер строк: {df_numeric.shape[0]} строк\")\n",
"print(f\"Размер обучающей выборки: {x_train.shape[0]} строк\")\n",
"print(f\"Размер валидационной выборки: {x_val.shape[0]} строк\")\n",
"print(f\"Размер тестовой выборки: {x_test.shape[0]} строк\")"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 268,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-10-12 02:16:37 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1sAAAIjCAYAAAD1OgEdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACdvUlEQVR4nOzdeXhTVf4/8Pe92dN9oRuUUrayFhAVUVlEVhF11HEUGdBhxHHAcdBRfzgugDqOorjPOM4oOl/BBUVchpFNHFArKlrWgiyFsrakW9pmzz2/P9JEQveSNEnzfj1Pn6dJTk4+SW9u7+eecz5XEkIIEBERERERUUDJoQ6AiIiIiIioM2KyRUREREREFARMtoiIiIiIiIKAyRYREREREVEQMNkiIiIiIiIKAiZbREREREREQcBki4iIiIiIKAiYbBEREREREQUBky0iIiLqUFVVVThw4ABcLleoQ6EAEkKgoqIC+/fvD3UoRGGDyRYREREFldPpxFNPPYUhQ4ZAp9MhKSkJffr0wcaNG0MdWkTYtWsXVq9e7btdWFiI//znP6EL6Aw1NTV48MEHkZeXB61Wi5SUFPTt2xf79u0LdWhEYYHJFlEHeOONNyBJku9Hr9ejb9++mDdvHkpLS0MdHlFYMplMkCQJCxcuDHUodA7sdjvGjx+Phx56CGPHjsXKlSuxfv16fP755xg5cmSow4sINTU1uP322/HNN99g//79uOuuu7Bz585Qh4Xy8nKMHDkSL7zwAq6//np89NFHWL9+Pb744gv06NEj1OERhQV1qAMgiiaLFy9Gbm4ubDYbvvzyS/z973/HmjVrsGvXLhiNxlCHR0QUcE8++SS2bt2KtWvXYuzYsaEOJyKNHDnS9wMAffv2xW233RbiqIB7770XJ0+eREFBAQYOHBjqcIjCEpMtog40ZcoUnH/++QCA3/72t0hJScHSpUvx0Ucf4aabbgpxdEREgeVyufDcc8/hnnvuYaJ1jlavXo09e/bAarVi8ODB0Gq1IY2nrKwMb775Jl555RUmWkTN4DRCohAaN24cAKC4uBgAUFFRgT/96U8YPHgwYmNjER8fjylTpmD79u0Nnmuz2bBw4UL07dsXer0emZmZuPbaa3Hw4EEAwOHDh/2mLp79c+aBzxdffAFJkvDuu+/igQceQEZGBmJiYnDVVVfh6NGjDV5769atmDx5MhISEmA0GjFmzBh89dVXjb7HsWPHNvr6jU0Ne+uttzB8+HAYDAYkJyfjxhtvbPT1m3tvZ1IUBc899xwGDhwIvV6P9PR03H777aisrPRr16NHD1x55ZUNXmfevHkN+mws9iVLljT4TAHP9KlHHnkEvXv3hk6nQ3Z2Nu677z7Y7fZGP6v2vs+6ujrcc889yM7Ohk6nQ15eHp5++mkIIRrE3tjPY489BgBwOBx4+OGHMXz4cCQkJCAmJgajRo3Cpk2bGo3r6aefxrPPPoucnBwYDAaMGTMGu3bt8mt7yy23NJhOdPToURgMBkiShMOHD/vut1gsuPXWWxETE4MBAwZg27ZtADzrfW699VYYjUYMGTIE33//vV9/3m3smmuuafAZ3n777ZAkCYMGDfK7/+mnn8bFF1+MlJQUGAwGDB8+HO+//37jf4izjB07tkF/3j7Pfk+ApxjEH//4R9/fp3fv3njyySehKIqvzZmf6dkGDRrU6Pe1uXgb+9wb06NHD992IMsyMjIy8Ktf/QolJSUtPhcA/va3v2HgwIHQ6XTIysrC3LlzUVVV5Xt83759qKysRFxcHMaMGQOj0YiEhARceeWVftvKpk2bIEkSPvzwwwavsWLFCkiShIKCAl/Mt9xyi18b72fyxRdf+O7bsmULfvnLX6J79+6+79/8+fNhtVr9nrtw4cIG36nly5dj6NCh0Ov1SElJwU033dTgM7nlllsQGxvrd9/777/fIA4AiI2NbRAz0Lp93tixY31//wEDBmD48OHYvn17o/uCxp
y9D05NTcXUqVMbfFclScK8efOa7Mc7Hd67fX/33XdQFAUOhwPnn39+s58VAHz++ecYNWoUYmJikJiYiKuvvhpFRUV+bbx/i7179+KGG25AfHw8UlJScNddd8FmszWI98x9scvlwhVXXIHk5GTs2bPHr21r/7cQBQNHtohCyJsYpaSkAAAOHTqE1atX45e//CVyc3NRWlqKf/zjHxgzZgz27NmDrKwsAIDb7caVV16JjRs34sYbb8Rdd92FmpoarF+/Hrt27UKvXr18r3HTTTfhiiuu8HvdBQsWNBrP448/DkmScP/996OsrAzPPfccxo8fj8LCQhgMBgCef5hTpkzB8OHD8cgjj0CWZSxbtgzjxo3Dli1bcOGFFzbot1u3bnjiiScAALW1tbjjjjsafe2HHnoIN9xwA37729/i9OnTePHFFzF69Gj8+OOPSExMbPCcOXPmYNSoUQCAVatWNThQu/322/HGG2/g1ltvxR/+8AcUFxfjpZdewo8//oivvvoKGo2m0c+hLaqqqnzv7UyKouCqq67Cl19+iTlz5qB///7YuXMnnn32Wfz0009+i91b0tz7FELgqquuwqZNmzB79mwMHToUa9euxb333ovjx4/j2Wef9etrwoQJmDlzpt99Q4cOBQCYzWb861//wk033YTbbrsNNTU1eO211zBp0iR8++23vnZe//73v1FTU4O5c+fCZrPh+eefx7hx47Bz506kp6c3+X4efvjhBgdOADB//ny8+eabmDdvHrp164bf//73AIBXX30V48aNw2OPPYbnn38eU6ZMwaFDhxAXF+d7rl6vx3/+8x+UlZUhLS0NAGC1WvHuu+9Cr9c3eK3nn38eV111FW6++WY4HA688847+OUvf4lPP/0UU6dObTL2trJYLBgzZgyOHz+O22+/Hd27d8fXX3+NBQsW4OTJk3juuecC9lrtNWrUKMyZMweKomDXrl147rnncOLECWzZsqXZ5y1cuBCLFi3C+PHjcccdd2Dfvn34+9//ju+++873/SovLwfg2ef06dMHixYtgs1mw8svv4xLLrkE3333Hfr27YuxY8ciOzsby5cvxy9+8Qu/11m+fDl69erV5vVdK1euhMViwR133IGUlBR8++23ePHFF3Hs2DGsXLmyyeetWLECM2bMwJAhQ/DEE0+gvLwcL7zwAr788kv8+OOPSE1NbVMcTWnPPs/r/vvvb9Nr9evXD3/+858hhMDBgwexdOlSXHHFFa1Oqhvj/dvOmzcPw4cPx1//+lecPn260c9qw4YNmDJlCnr27ImFCxfCarXixRdfxCWXXIIffvihwYmBG264AT169MATTzyBb775Bi+88AIqKyvx73//u8l4fvvb3+KLL77A+vXrMWDAAN/95/I5EwWEIKKgW7ZsmQAgNmzYIE6fPi2OHj0q3nnnHZGSkiIMBoM4duyYEEIIm80m3G6333OLi4uFTqcTixcv9t33+uuvCwBi6dKlDV5LURTf8wCIJUuWNGgzcOBAMWbMGN/tTZs2CQCia9euwmw2++5/7733BADx/PPP+/ru06ePmDRpku91hBDCYrGI3NxcMWHChAavdfHFF4tBgwb5bp8+fVoAEI888ojvvsOHDwuVSiUef/xxv+fu3LlTqNXqBvfv379fABBvvvmm775HHnlEnLlL27JliwAgli9f7vfczz77rMH9OTk5YurUqQ1inzt3rjh7N3l27Pfdd59IS0sTw4cP9/tM/+///k/Isiy2bNni9/xXXnlFABBfffVVg9c7W2ve5+rVqwUA8dhjj/k99/rrrxeSJIkDBw74xT537twmX8/lcgm73e53X2VlpUhPTxe/+c1vfPd5t60zt10hhNi6dasAIObPn++7b9asWSInJ8d3e9euXUKWZTFlyhQBQBQXFwshhDh16pTQarViwYIFvraffvqpACCuuOIK3/ZWVFQkJEkSzz77rK/dmDFjxMCBA0V+fr54+umnfff/3//9n+jWrZ
sYNWqUGDhwoN/7slgsfrcdDocYNGiQGDduXJOfz9mvd7YlS5b4vSchhHj00UdFTEyM+Omnn/za/r//9/+ESqUSJSU
2024-10-10 15:58:16 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
2024-10-12 02:16:37 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACYRElEQVR4nOzdeXxU5dn/8c85s2WyEwIJa0BAVgHFDaugKCBu9dHWWrVF24q1YFttbX9YF0RbH6t1qaJ2sWqraIt1ebRWFtequKEgSET2IFsIkHUy67l/fySZEhIgCZNMlu/79eKlc+ac+1wzOXPmXHPf93UsY4xBREREREREmsxOdgAiIiIiIiIdjRIpERERERGRZlIiJSIiIiIi0kxKpERERERERJpJiZSIiIiIiEgzKZESERERERFpJiVSIiIiIiIizaRESkREREREpJmUSImIiIhIm3Ech5KSEjZs2JDsUEQOixIpERERkU7gnXfe4c0334w/fvPNN3n33XeTF9A+duzYwU9/+lMKCgrwer306NGDESNGUF5enuzQRFpMiZR0WY8//jiWZcX/paSkcOSRRzJr1ix27tyZ7PBE2qWSkhIsy2LOnDnJDkVE9rNlyxZ+9KMfsXLlSlauXMmPfvQjtmzZkuywWLduHccddxzPPPMMV111FS+//DKLFy/mtddeIy0tLdnhibSYO9kBiCTb3LlzGThwIMFgkHfeeYeHH36YV155hVWrVpGamprs8ERERJrkggsu4L777mP06NEAjB8/ngsuuCDJUcFVV12F1+vl/fffp0+fPskORyRhlEhJlzdt2jSOPfZYAH7wgx/QvXt37rnnHl588UW+/e1vJzk6ERGRpvH5fLz33nusWrUKgFGjRuFyuZIa07Jly3j99ddZtGiRkijpdDS0T2Q/kyZNAmDjxo0A7Nmzh5///OccddRRpKenk5mZybRp01ixYkWDbYPBIHPmzOHII48kJSWFXr16ccEFF7B+/XoANm3aVG844f7/Tj311Hhbb775JpZl8fe//50bbriB/Px80tLSOO+88xodqvHBBx9w5plnkpWVRWpqKhMnTjzg2PhTTz210f03NlzrySefZNy4cfj9fnJycrj44osb3f/BXtu+HMfhvvvuY+TIkaSkpJCXl8dVV13F3r176603YMAAzjnnnAb7mTVrVoM2G4v9rrvuavCeAoRCIW655RYGDx6Mz+ejX79+/OIXvyAUCjX6XrX0dVZVVfGzn/2Mfv364fP5GDp0KHfffTfGmAaxN/bv9ttvByAcDnPzzTczbtw4srKySEtL45RTTuGNN95oNK67776be++9l4KCAvx+PxMnToxfVNW5/PLLGTBgQL1lW7Zswe/3Y1kWmzZtii8PBAJcccUVpKWlMWLECJYtWwZAJBLhiiuuIDU1lTFjxvDxxx/Xa6/uGDv//PMbvIdXXXUVlmUxatSoesvvvvtuTjrpJLp3747f72fcuHE8++yzjf8h9nPqqac2aK+uzf1fE0BpaSk//elP43+fwYMHc+edd+I4Tnydfd/T/Y0aNarRz+vB4m3sfW/MgAED4seBbdvk5+fzrW99i6KiooRt25z3ev9h0I2dr+rW2fd9dhyH0aNHY1kWjz/++EFj3fff/us2NVbLspg1a1aD5eecc06D9705nwGAhx56iJEjR+Lz+ejduzczZ86ktLS03jrNOQabc36LRqPcdtttDBo0CJ/Px4ABA7jhhhsanLMGDBjA5ZdfjsvlYsyYMYwZM4bnnnsOy7ISdtwd7DNRZ86cOfVew/vvv09KSgrr16+Pv4f5+flcddVV7Nmzp8H2CxYsiH/n5Obmctlll7F169Z661x++eWkp6ezYcMGpk6dSlpaGr1792bu3Ln1zrF18e57TFVUVDBu3DgGDhzI9u3b48ub+t0ksi/1SInspy7p6d69OwAbNmzghRde4Jvf/CYDBw5k586d/OEPf2DixImsXr2a3r
17AxCLxTjnnHN47bXXuPjii/nJT35CRUUFixcvZtWqVQwaNCi+j29/+9ucddZZ9fY7e/bsRuP59a9/jWVZ/PKXv6S4uJj77ruPM844g+XLl+P3+wF4/fXXmTZtGuPGjeOWW27Btm0ee+wxJk2axH/+8x+OP/74Bu327duXO+64A4DKykquvvrqRvd90003cdFFF/GDH/yAXbt28cADDzBhwgQ+/fRTsrOzG2wzY8YMTjnlFACee+45nn/++XrPX3XVVTz++ONcccUV/PjHP2bjxo08+OCDfPrpp7z77rt4PJ5G34fmKC0tjb+2fTmOw3nnncc777zDjBkzGD58OCtXruTee+/lyy+/5IUXXmjyPg72Oo0xnHfeebzxxht8//vfZ+zYsSxcuJDrr7+erVu3cu+999Zra/LkyXz3u9+tt2zs2LEAlJeX8+c//5lvf/vbXHnllVRUVPDoo48ydepUPvzww/h6df76179SUVHBzJkzCQaD3H///UyaNImVK1eSl5d3wNdz8803EwwGGyy/9tpreeKJJ5g1axZ9+/blRz/6EQB//OMfmTRpErfffjv3338/06ZNY8OGDWRkZMS3TUlJ4V//+hfFxcX07NkTgOrqav7+97+TkpLSYF/3338/5513HpdeeinhcJhnnnmGb37zm7z88sucffbZB4y9uQKBABMnTmTr1q1cddVV9O/fn/fee4/Zs2ezfft27rvvvoTtq6VOOeUUZsyYgeM4rFq1ivvuu49t27bxn//8JyHbtuS9vvfee8nNzQVqzg2H8re//Y2VK1cedJ2xY8fys5/9DKj58ermm29usE5bHRcH+gzMmTOHW2+9lTPOOIOrr76aNWvW8PDDD/PRRx8l7Jx1MD/4wQ944okn+MY3vsHPfvYzPvjgA+644w4KCwsbnF/3FY1G+dWvftWsfR3OcXcgu3fvJhgMcvXVVzNp0iR++MMfsn79eubNm8cHH3zABx98gM/nA4h/Nxx33HHccccd7Ny5k/vvv5933323wXdOLBbjzDPP5MQTT+S3v/0tr776KrfccgvRaJS5c+c2GkskEuHCCy+kqKiId999l169esWfa4vvJumEjEgX9dhjjxnALFmyxOzatcts2bLFPPPMM6Z79+7G7/ebr776yhhjTDAYNLFYrN62GzduND6fz8ydOze+7C9/+YsBzD333NNgX47jxLcDzF133dVgnZEjR5qJEyfGH7/xxhsGMH369DHl5eXx5f/4xz8MYO6///5420OGDDFTp06N78cYYwKBgBk4cKCZPHlyg32ddNJJZtSoUfHHu3btMoC55ZZb4ss2bdpkXC6X+fWvf11v25UrVxq3291g+dq1aw1gnnjiifiyW265xex7mvnPf/5jAPPUU0/V2/bVV19tsLygoMCcffbZDWKfOXOm2f/UtX/sv/jFL0zPnj3NuHHj6r2nf/vb34xt2+Y///lPve0feeQRA5h33323wf7215TX+cILLxjA3H777fW2/cY3vmEsyzLr1q2rF/vMmTMPuL9oNGpCoVC9ZXv37jV5eXnme9/7XnxZ3bG177FrjDEffPCBAcy1114bXzZ9+nRTUFAQf7xq1Spj27aZNm2aAczGjRuNMcbs2LHDeL1eM3v27Pi6L7/8sgHMWWedFT/eCgsLjWVZ5t57742vN3HiRDNy5EgzevRoc/fdd8eX/+1vfzN9+/Y1p5xyihk5cmS91xUIBOo9DofDZtSoUWbSpEkHfH/239/+7rrrrnqvyRhjbrvtNpOWlma+/PLLeuv+v//3/4zL5TJFRUXGmJZ9XhcsWHDAGPd/3w+koKDATJ8+vd6ySy65xKSmpiZs2+a813/6058MYDZv3hxfNnHixHqvv+58Wvc+B4NB079///gx9dhjjzVot3fv3uacc86JP/7oo48aXbepsR7os3T22Wc3eN+b+hkoLi42Xq/XTJkypd73wIMPPmgA85e//KXee9LUY7Cp57fly5cbwPzgBz
+ot97Pf/5zA5jXX3+9Xpv7/u0feugh4/P5zGmnnZaw4+5gn4k6+58P6x6ffvrpJhqNxpfXHTMPPPCAMabm79qzZ08
2024-10-10 15:58:16 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
2024-10-12 02:16:37 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACW60lEQVR4nOzdeXxU1f3/8dedNZNlkpAQwhrCvshiERUXXEFxL7ZW0YrWquWHttXW9ot1AbW1VuvSahdbq7YWbd2rVRFxV3ChshqRPbKGANkz6z2/P0JGQgIkYZKZJO/n48FD586dcz935s7N/cw553MtY4xBREREREREms2R6ABEREREREQ6GiVSIiIiIiIiLaRESkREREREpIWUSImIiIiIiLSQEikREREREZEWUiIlIiIiIiLSQkqkREREREREWkiJlIiIiIiISAspkRIRERHpoiKRCCUlJRQXFyc6FJEOR4mUiIiISJy9/PLLLFmyJPb4hRdeYOXKlYkLaC+rV6/myiuvpGfPnng8Hnr06MGECRMwxiQ6NJEORYmUSDt67LHHsCwr9i8lJYUhQ4ZwzTXXsH379kSHJ5KUSktLsSyL2bNnJzoUkWZbvnw5P/rRj1i9ejWLFi3iBz/4AZWVlYkOi0WLFnHkkUfy5ptv8n//93/MmzeP+fPn88ILL2BZVqLDE+lQXIkOQKQruu222ygsLCQQCPD+++/zxz/+kVdeeYUVK1aQmpqa6PBEROQQff/73+dvf/sbQ4YMAWDq1KkcffTRCY0pFApx+eWXM2TIEF5//XUyMzMTGo9IR6dESiQBpkyZwhFHHAHU/bHNycnh3nvv5cUXX+Siiy5KcHQiInKounfvzooVK2I/kA0fPjzRIfHSSy+xatUqvvjiCyVRInGgoX0iSeDkk08GYP369QDs2rWLn/70p4waNYr09HT8fj9Tpkxh6dKljV4bCASYPXs2Q4YMISUlhZ49ezJ16lTWrl0LwIYNGxoMJ9z334knnhhr6+2338ayLP71r39x4403kp+fT1paGueccw5fffVVo21/9NFHnH766WRmZpKamsoJJ5zABx980OQ+nnjiiU1uv6nhWk888QTjxo3D5/PRrVs3Lrzwwia3f6B925tt29x///2MHDmSlJQUevTowdVXX83u3bsbrNe/f3/OOuusRtu55pprGrXZVOx33313o/cUIBgMcuuttzJo0CC8Xi99+/blZz/7GcFgsMn3qrX7WV1dzU9+8hP69u2L1+tl6NCh3HPPPY3mPeyvrTvuuAOo+9X6lltuYdy4cWRmZpKWlsbxxx/PW2+91WRc99xzD/fddx8FBQX4fD5OOOEEVqxY0WDdyy67jP79+zdY9tVXX+Hz+bAsiw0bNsSW19TUcPnll5OWlsaIESNYvHgxAOFwmMsvv5zU1FTGjBnDp59+2qC9+mPsvPPOa/QeXn311ViWxWGHHdZg+T333MMxxxxDTk4OPp+PcePG8cwzzzT9QezjxBNPbNRefZv77hNAWVkZP/7xj2Ofz6BBg7jrrruwbTu2zt7v6b4OO+ywJr+vB4q3qfe9Kf37948dBw6Hg/z8fL7zne8ctADB3q9r6t/e227u9xDg1Vdf5YQTTiAjIwO/38/48eOZO3cusP9zSVPfi0gkwu23387AgQPxer3079+fG2+8sdF3r7n735Lv2OzZs/F6vYwbN47hw4fv9/zQlL33xel00rt3b6666irKyspi67Tm81+0aBGFhYU8++yzDBw4EI/HQ79+/fjZz35GbW1to9f/4Q9/YOTIkXi9Xnr16sXMmTMbxABffw8WL17MMcccg8/no7CwkD/96U8N1quP9+23344t27JlC/379+eII46gqqoqtvxQz5ki7UU9UiJJoD7pycnJAWDdunW88MILfPvb36awsJDt27fz5z//mRNOOIHPP/+cXr16ARCNRjnrrLNYsGABF154IT/60Y+orKxk/vz5rFixgo
EDB8a2cdFFF3HGGWc02O6sWbOajOeXv/wllmXx85//nJKSEu6//35OPfVUlixZgs/nA+DNN99kypQpjBs3jltvvRWHw8Gjjz7KySefzHvvvceRRx7ZqN0+ffpw5513AlBVVcWMGTOa3PbNN9/MBRdcwPe//3127NjB73//eyZOnMhnn31GVlZWo9dcddVVHH/88QA899xzPP/88w2ev/rqq3nssce4/PLL+eEPf8j69et58MEH+eyzz/jggw9wu91Nvg8tUVZWFtu3vdm2zTnnnMP777/PVVddxfDhw1m+fDn33XcfX375JS+88EKzt3Gg/TTGcM455/DWW29xxRVXMHbsWObNm8cNN9zA5s2bue+++xq0NWnSJC699NIGy8aOHQtARUUFf/3rX7nooou48sorqays5JFHHuG0007j448/jq1X7+9//zuVlZXMnDmTQCDAAw88wMknn8zy5cvp0aPHfvfnlltuIRAINFp+3XXX8fjjj3PNNdfQp08f/t//+38APPzww5x88snccccdPPDAA0yZMoV169aRkZERe21KSgr//e9/KSkpIS8vD4Da2lr+9a9/kZKS0mhbDzzwAOeccw4XX3wxoVCIp556im9/+9u8/PLLnHnmmfuNvaVqamo44YQT2Lx5M1dffTX9+vXjww8/ZNasWWzdupX7778/bttqreOPP56rrroK27ZZsWIF999/P1u2bOG9997b72vuv//+2AVwUVERv/rVr7jxxhtjvS/p6emxdZv7PXzsscf43ve+x8iRI5k1axZZWVl89tlnvPbaa0ybNo1f/OIXfP/73wfq5s9dd911Db4be/v+97/P448/zre+9S1+8pOf8NFHH3HnnXdSVFTU6DxxsP1v6Xdsb/s7PxzIN7/5TaZOnUokEmHhwoU8/PDD1NbW8o9//KNF7ext586drFu3jhtvvJGpU6fyk5/8hE8//ZS7776bFStW8N///jeWiM6ePZs5c+Zw6qmnMmPGDFatWsUf//hHPvnkk0bnzd27d3PGGWdwwQUXcNFFF/Hvf/+bGTNm4PF4+N73vtdkLOXl5UyZMgW3280rr7wSO1biec4UaXNGRNrNo48+agDzxhtvmB07dpivvvrKPPXUUyYnJ8f4fD6zadMmY4wxgUDARKPRBq9dv3698Xq95rbbbost+9vf/mYAc++99zbalm3bsdcB5u677260zsiRI80JJ5wQe/zWW28ZwPTu3dtUVFTElv/73/82gHnggQdibQ8ePNicdtppse0YY0xNTY0pLCw0kyZNarStY445xhx22GGxxzt27DCAufXWW2PLNmzYYJxOp/nlL3/Z4LXLly83Lper0fLVq1cbwDz++OOxZbfeeqvZ+9T23nvvGcD885//bPDa1157rdHygoICc+aZZzaKfebMmWbf0+W+sf/sZz8zeXl5Zty4cQ3e03/84x/G4XCY9957r8Hr//SnPxnAfPDBB422t6/m7OcLL7xgAHPHHXc0eO23vvUtY1mWWbNmTYPYZ86cud/tRSIREwwGGyzbvXu36dGjh/ne974XW1Z/bO197BpjzEcffWQAc91118WWTZ8+3RQUFMQer1ixwjgcDjNlyhQDmPXr1xtjjNm2bZvxeDxm1qxZsXVffvllA5gzzjgjdrwVFRUZy7LMfffdF1vvhBNOMCNHjjSjR48299xzT2z5P/7xD9OnTx9z/PHHm5EjRzbYr5qamgaPQ6GQOeyww8zJJ5+83/dn3+3t6+67726wT8YYc/vtt5u0tDTz5ZdfNlj3//7v/4zT6TTFxcXGmNZ9X59++un9xrjv+74/BQUFZvr06Q2WTZs2zaSmph70tfvG89ZbbzV6rrnfw7KyMpORkWGOOuooU1tb22Ddvc819erfr0cffbTRc0uWLDGA+f73v99g+U9/+lMDmDfffDO2rDn739LvWHPOD/uz7+uNqTuHjhgxIva4NZ//9OnTDWAuu+yyBuvVn09eeuklY4wxJS
UlxuPxmMmTJzf4W/Tggw8awPztb3+LLTvhhBMMYH7729/GlgWDQTN27FiTl5dnQqFQg3jfeustEwgEzIknnmjy8vI
2024-10-10 15:58:16 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Статистические показатели для обучающей выборки:\n",
"Среднее значение: 13.05\n",
"Стандартное отклонение: 0.52\n",
"Минимальное значение: 11.23\n",
"Максимальное значение: 15.86\n",
"Количество наблюдений: 15129\n",
"\n",
"Статистические показатели для валидационной выборки:\n",
"Среднее значение: 13.05\n",
"Стандартное отклонение: 0.53\n",
"Минимальное значение: 11.26\n",
"Максимальное значение: 15.49\n",
"Количество наблюдений: 3242\n",
"\n",
"Статистические показатели для тестовой выборки:\n",
"Среднее значение: 13.06\n",
"Стандартное отклонение: 0.54\n",
"Минимальное значение: 11.35\n",
"Максимальное значение: 15.53\n",
"Количество наблюдений: 3242\n",
"\n"
]
}
],
"source": [
"import seaborn as sns\n",
"\n",
"df['price_log'] = np.log(df['price'])\n",
"\n",
"X = df.drop(['price', 'price_log'], axis=1)\n",
"y = df['price_log']\n",
"\n",
"X = X.select_dtypes(include='number')\n",
"\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n",
"\n",
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
"def plot_distribution(data, title):\n",
" \"\"\"Построение гистограммы распределения целевого признака\"\"\"\n",
" plt.figure(figsize=(10, 6))\n",
" sns.histplot(data, kde=True, bins=30, color='skyblue')\n",
" plt.title(title)\n",
" plt.xlabel('Logarithm of Price')\n",
" plt.ylabel('Count')\n",
" plt.grid(True)\n",
" plt.show()\n",
"\n",
"plot_distribution(y_train, 'Распределение логарифма цены в обучающей выборке')\n",
"plot_distribution(y_val, 'Распределение логарифма цены в валидационной выборке')\n",
"plot_distribution(y_test, 'Распределение логарифма цены в тестовой выборке')\n",
"\n",
"def get_statistics(df, name):\n",
" print(f\"Статистические показатели для {name} выборки:\")\n",
" print(f\"Среднее значение: {df.mean():.2f}\")\n",
" print(f\"Стандартное отклонение: {df.std():.2f}\")\n",
" print(f\"Минимальное значение: {df.min():.2f}\")\n",
" print(f\"Максимальное значение: {df.max():.2f}\")\n",
" print(f\"Количество наблюдений: {df.count()}\\n\")\n",
"\n",
"get_statistics(y_train, \"обучающей\")\n",
"\n",
"get_statistics(y_val, \"валидационной\")\n",
"\n",
"get_statistics(y_test, \"тестовой\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Oversampling и undersampling</h3>"
]
},
{
"cell_type": "code",
2024-10-12 12:05:52 +04:00
"execution_count": 269,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов после SMOTE (oversampling):\n",
"price_category\n",
"0 3054\n",
"1 3054\n",
"2 3054\n",
"3 3054\n",
"4 3054\n",
"Name: count, dtype: int64\n",
"Распределение классов после RandomUnderSampler (undersampling):\n",
"price_category\n",
"0 2993\n",
"1 2993\n",
"2 2993\n",
"3 2993\n",
"4 2993\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"\n",
"if 'date' in df.columns:\n",
" df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year\n",
" df = df.drop(['date'], axis=1)\n",
"\n",
"df['price_log'] = np.log(df['price'])\n",
"\n",
"df['price_category'] = pd.qcut(df['price_log'], q=5, labels=[0, 1, 2, 3, 4])\n",
"\n",
"X = df.drop(['price', 'price_log', 'price_category'], axis=1)\n",
"y = df['price_category']\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
"\n",
"smote = SMOTE(random_state=42)\n",
"X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)\n",
"\n",
"print(\"Распределение классов после SMOTE (oversampling):\")\n",
"print(pd.Series(y_train_smote).value_counts())\n",
"\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)\n",
"\n",
"print(\"Распределение классов после RandomUnderSampler (undersampling):\")\n",
"print(pd.Series(y_train_under).value_counts())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Оценка сбалансированности выборок</h3>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка необходимости аугментации данных\n"
]
},
{
"cell_type": "code",
2024-11-01 21:16:02 +04:00
"execution_count": 1,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
2024-11-01 21:16:02 +04:00
"ename": "NameError",
"evalue": "name 'y_train' is not defined",
2024-10-10 15:58:16 +04:00
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
2024-11-01 21:16:02 +04:00
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 18\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mВыборка \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m сбалансирована, аугментация не требуется.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m---> 18\u001b[0m check_augmentation_need(\u001b[43my_train\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mобучающей\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 19\u001b[0m check_augmentation_need(y_val, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mвалидационной\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 20\u001b[0m check_augmentation_need(y_test, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mтестовой\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[1;31mNameError\u001b[0m: name 'y_train' is not defined"
2024-10-10 15:58:16 +04:00
]
}
],
"source": [
"def check_augmentation_need(data, name):\n",
" \"\"\"Проверка необходимости аугментации данных\"\"\"\n",
" quantiles = data.quantile([0.25, 0.5, 0.75])\n",
" mean = data.mean()\n",
" std = data.std()\n",
" \n",
" print(f\"Проверка необходимости аугментации для {name} выборки:\")\n",
" print(f\"Среднее значение: {mean:.2f}, Стандартное отклонение: {std:.2f}\")\n",
" print(f\"25-й квантиль: {quantiles[0.25]:.2f}\")\n",
" print(f\"50-й квантиль (медиана): {quantiles[0.5]:.2f}\")\n",
" print(f\"75-й квантиль: {quantiles[0.75]:.2f}\")\n",
" \n",
" if std > mean * 0.5:\n",
" print(f\"Выборка {name} несбалансирована, рекомендуется аугментация.\\n\")\n",
" else:\n",
" print(f\"Выборка {name} сбалансирована, аугментация не требуется.\\n\")\n",
"\n",
"check_augmentation_need(y_train, \"обучающей\")\n",
"check_augmentation_need(y_val, \"валидационной\")\n",
"check_augmentation_need(y_test, \"тестовой\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Поскольку все выборки демонстрируют одинаковое распределение целевого признака и сбалансированное распределение значений, применение методов аугментации не требуется."
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение 'condition' в обучающей выборке:\n",
" condition\n",
"3 9837\n",
"4 3958\n",
"5 1189\n",
"2 121\n",
"1 24\n",
"Name: count, dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_8140\\3337968062.py:11: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.barplot(x=condition_counts.index, y=condition_counts.values, palette='viridis')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAskAAAIjCAYAAADx6oYJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABU10lEQVR4nO3dd3QUZf/+8WsDKZAQqmkSSaT3aFAMSJNOUFEUKUpAiiWgdEWRJohUg4ggKkWBr4pKEX2AAFINnUhVQeEBgST0QIAkZOf3h8/uj50NkITABvN+nZNzmJl7Zz4z98xyZfbeicUwDEMAAAAA7NxcXQAAAACQ1xCSAQAAABNCMgAAAGBCSAYAAABMCMkAAACACSEZAAAAMCEkAwAAACaEZAAAAMCEkAwAQB5y9epVJSUl6ciRI64uBbksLS1NCQkJOn78uKtLQRYQkgEAucpisWj48OH26dmzZ8tisejw4cM3fe2aNWtksVi0Zs2a21ZfXnTgwAH16NFDgYGB8vDwkL+/vyIiIsQfxc2auXPnOpxfs2fP1rFjx1xX0DW2bdumjh07qlSpUvL09FRgYKDatm3r6rKQBYRk3JTtPzjbj5eXlypUqKBevXopMTHR1eUB+YotRGYlcOZlH3/8sWbPnu2y7VssFpdu/1qbNm3Sww8/rNWrV+vNN9/U8uXLFRsbq0WLFslisbi6vLvC+vXrNWjQIB0+fFjLly9XdHS03NxcH3EWL16sRx99VPv27dPo0aMVGxur2NhYffLJJ64uDVlQ0NUF4O4xcuRIhYaG6sqVK9qwYYOmTZumn376SXv27FHhwoVdXR6APOqFF15Q+/bt5enpaZ/38ccfq1SpUurSpYtD2/r16+vy5cvy8PC4w1W6Rlpamrp27aoKFSpoxYoVKlq0qKtLuiv17dtXDRs2VGhoqCSpX79+CgwMdGlNZ86cUffu3dW8eXMtWLAg35zT/yaEZGRZy5YtVatWLUlS9+7dVbJkSU2aNEmLFy9Whw4dXFwdgLyqQIECKlCgQJbaurm5ycvL6zZXlHf88MMP+v333/Xbb78RkG9BpUqV9Oeff2rPnj0qVaqUypYt6+qSNGvWLF25ckWzZ88mIN+lXP9ZBO5ajz32mCTp0KFDkv75rXnAgAGqXr26fHx85Ovrq5YtW+rXX391eu2VK1c0fPhwVahQQV5eXgoMDNTTTz+tP//8U5J0+PBhhyEe5p+GDRva12X7+Pnrr7/WW2+9pYCAAHl7e+uJJ57Q0aNHnba9efNmtWjRQkWLFlXhwoXVoEEDbdy4MdN9bNiwYabbv3a8pc3cuXMVHh6uQoUKqUSJEmrfvn2m27/Rvl3LarUqJiZGVatWlZeXl/z9/fXSSy/p7NmzDu1CQkLUunVrp+306tXLaZ2Z1T5+/HinYypJqampGjZsmMqVKydPT08FBwdr0KBBSk1NzfRYXet6x832c+1QAVv9K1asUFhYmLy8vFSlShV9//33DuvMbFyr1WpVjRo1nD46Hz58uKpUqWI/Dx955BEtWrTIqcZq1ao51T5hwgSn7SxevFiRkZEKCgqSp6enypYtq3fffVcZGRlO6zQfx9GjR8vNzU3z58+3z1u/fr2effZZ3XffffZj27dvX12+fPnGB/YGNm/erFatWql48eLy9vZWjRo1NHnyZIc2q1evVr169eTt7a1ixYrpySef1P79+x3aDB8+XBaLRQcPHlSXLl1UrFgxFS1aVF27dtWlS5cc2qampqpv37665557VKRIET3xxBP6+++/nWoz911ISIj27t2rtWvXOl3T1xuTvGDBAvv1VapUKT3//PNOY067dOkiHx8fHTt2TG3atJGPj4/uueceDRgwwKmvssJWi+3H09NTFSpU0JgxY7I0VjgpKUndunWTv7+/vLy8VLNmTc2ZM8ehzaZNmxQaGqrvvvtOZcuWlYeHh+677z4NGjTI4XyIiopSqVKllJ6e7rSdZs2aqW
LFig41m49fly5dFBIS4jBvwoQJqlOnjkqWLKlChQopPDxc3377rdP6Q0JCHO74X7hwQb169dK9994rT09PlS9fXu+//76sVqvD6ywWi3r16uUwr3Xr1k51fPvtt5nWfO7cOfXp00fBwcHy9PRUuXLlNHbsWIft2N5PZ8+eLW9vb9WuXVtly5ZVdHS0LBaL0ycVZub3Y3d3d4WEhGjgwIFKS0uzt7Odw9u2bbvuuszX/6ZNmxQWFqb33nvPvg/XO1ZXr17Vu+++q7Jly8rT01MhISF66623nN5vb+X9cu/evSpevLhat26tq1evZus451fcSUaO2QJtyZIlJUl//fWXFi1apGeffVahoaFKTEzUJ598ogYNGmjfvn0KCgqSJGVkZKh169ZatWqV2rdvr9dff10XLlxQbGys9uzZ43AHoEOHDmrVqpXDdgcPHpxpPaNHj5bFYtEbb7yhpKQkxcTEqEmTJoqPj1ehQoUk/RMSWrZsqfDwcA0bNkxubm6aNWuWHnvsMa1fv14PP/yw03pLly6tMWPGSJIuXryoV155JdNtv/POO2rXrp26d++ukydPasqUKapfv7527typYsWKOb2mZ8+eqlevniTp+++/18KFCx2Wv/TSS5o9e7a6du2q1157TYcOHdJHH32knTt3auPGjXJ3d8/0OGTHuXPn7Pt2LavVqieeeEIbNmxQz549VblyZe3evVsffPCB/vjjD6fAmZlrj5vNTz/9pP/7v/9zanvgwAE999xzevnllxUVFaVZs2bp2Wef1bJly9S0adPrbuPLL7/U7t27neanpKToqaeeUkhIiC5fvqzZs2erbdu2iouLy7SPb2b27Nny8fFRv3795OPjo9WrV2vo0KFKTk7W+PHjr/u6WbNmaciQIZo4caI6duxon79gwQJdunRJr7zyikqWLKktW7ZoypQp+vvvv7VgwYJs1xcbG6vWrVsrMDBQr7/+ugICArR//34tXbpUr7/+uiRp5cqVatmype6//34NHz5cly9f1pQpU1S3bl3t2LHDKbi0a9dOoaGhGjNmjHbs2KHPPvtMfn5+Gjt2rL1N9+7dNXfuXHXs2FF16tTR6tWrFRkZedN6Y2Ji1Lt3b/n4+Ojtt9+WJPn7+1+3ve06eOihhzRmzBglJiZq8uTJ2rhxo9P1lZGRoebNm6t27dqaMGGCVq5cqYkTJ6ps2bKZXrtZ8dZbb6ly5cq6fPmy/ZdxPz8/devW7bqvuXz5sho2bKiDBw+qV69eCg0N1YIFC9SlSxedO3fO3i+nT5/WX3/9pbfeektPP/20+vfvr23btmn8+PHas2ePfvzxR1ksFr3wwgv64osvtHz5codfihMSErR69WoNGzYs2/s1efJkPfHEE+rUqZPS0tL01Vdf6dlnn9XSpUtv2I9t27ZVbGysOnfurIcfflg///yzBg8erMOHD2v69OnZriMzly5dUoMGDXTs2DG99NJLuu+++/TLL79o8ODBOnHihGJiYq772oMHD+rTTz/N1vZs78epqalavny5JkyYIC8vL7377rs53ofTp09rw4YN2rBhg1588UWFh4dr1apVmR6r7t27a86cOXrmmWfUv39/bd68WWPGjNH+/fud/m/Iyfvl0aNH1aJFC1WqVEnffPONChb8J/7dynHOFwzgJmbNmmVIMlauXGmcPHnSOHr0qPHVV18ZJUuWNAoVKmT8/fffhmEYxpUrV4yMjAyH1x46dMjw9PQ0Ro4caZ83c+ZMQ5IxadIkp21ZrVb76yQZ48ePd2pTtWpVo0GDBvbpn3/+2ZBk3HvvvUZycrJ9/jfffGNIMiZPnmxfd/ny5Y3mzZvbt2MYhnHp0iUjNDTUaNq0qdO26tSpY1SrVs0+ffLkSUOSMWzYMPu8w4cPGwUKFDBGjx7t8Nrdu3cbBQsWdJp/4MABQ5IxZ84c+7xhw4YZ116O69evNyQZ8+bNc3jtsmXLnOaXKVPGiIyMdKo9OjraMF/i5toHDR
pk+Pn5GeHh4Q7H9MsvvzTc3NyM9evXO7x++vTphiRj48aNTtu7VoMGDYyqVas6zR8/frwhyTh06JBD/ZKM7777zj7
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение 'condition' в валидационной выборке:\n",
" condition\n",
"3 2125\n",
"4 830\n",
"5 256\n",
"2 27\n",
"1 4\n",
"Name: count, dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_8140\\3337968062.py:11: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.barplot(x=condition_counts.index, y=condition_counts.values, palette='viridis')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAsAAAAIjCAYAAAAN/63DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABc3ElEQVR4nO3deVgW9f7/8dcNwo3KoqhsiUCWu6hRGZlbJoamWVbHrFyybEE9appfKxWtjqZmmpmdzsml0lNZpuUxFdw1zC1yzaMet1QwNUFBAWF+f3Tu++ftDQIK3uA8H9fFdTkzn5l5f+6ZuXk59+ceLIZhGAIAAABMws3VBQAAAAA3EgEYAAAApkIABgAAgKkQgAEAAGAqBGAAAACYCgEYAAAApkIABgAAgKkQgAEAAGAqFVxdAAAAAP6/ixcv6syZM6pQoYICAgJcXc5NiTvAAGACFotF8fHx9unZs2fLYrHo0KFDha67evVqWSwWrV69utTqA26EDz74QGfPnrVPT5kyRRkZGa4r6DKJiYnq0qWLqlSpoooVK+qWW27RX//6V1eXddMiAJdTtl9eth8vLy/VqVNH/fv3V2pqqqvLA0zFFhCLEibLsg8//FCzZ8922f4tFotL94+b3/fff6/4+HgdPXpUc+fO1ciRI1WxYkVXl6UPP/xQHTp0UFpamqZOnaqEhAQlJCRo7Nixri7tpsUQiHJu7NixioiI0MWLF7V+/XrNmDFDS5Ys0c6dO1WpUiVXlwegjHrmmWfUvXt3Wa1W+7wPP/xQ1atXV+/evR3atmrVShcuXJCnp+cNrhIoWa+99pq6dOmiqVOnys3NTe+++67c3Fx7L3Dfvn0aMmSI+vXrpw8//FAWi8Wl9ZgFAbici42N1Z133ilJeu6551StWjVNnjxZixYt0pNPPuni6gCUVe7u7nJ3dy9SWzc3N3l5eZVyRUDpa926tQ4fPqw9e/YoNDRUNWvWdHVJev/99xUUFKT333+f8HsDMQTiJnP//fdLkg4ePChJOnPmjIYOHarGjRvL29tbvr6+io2N1S+//OK07sWLFxUfH686derIy8tLwcHBevTRR3XgwAFJ0qFDhxyGXVz506ZNG/u2bB8Jf/nll3rttdcUFBSkypUrq0uXLjp69KjTvn/66Sc9+OCD8vPzU6VKldS6dWtt2LAh3z62adMm3/1fPr7R5vPPP1dUVJQqVqwof39/de/ePd/9X61vl8vLy9OUKVPUsGFDeXl5KTAwUC+88IL++OMPh3bh4eF66KGHnPbTv39/p23mV/vEiROdXlNJysrK0ujRo3XbbbfJarUqNDRUr776qrKysvJ9rS5X0Otm+7n843tb/cuXL1fTpk3l5eWlBg0aaMGCBQ7bzG8caV5eniIjI50+zo6Pj1eDBg3s5+E999yjhQsXOtXYqFEjp9onTZrktJ9FixapU6dOCgkJkdVqVe3atfXmm28qNzfXaZtXvo5vv/223NzcNG/ePPu8devW6fHHH1etWrXsr+3gwYN14cKFq7+wV/HTTz+pY8eOqlq1qipXrqzIyEhNnTrVoc3KlSvVsmVLVa5cWVWqVNHDDz+sPXv2OLSJj4+XxWLR/v371bt3b1WpUkV+fn7q06ePMjMzHdpmZWVp8ODBqlGjhnx8fNSlSxf99ttvTrVdeezCw8O1a9curVmzxumaLmgM8Pz58+3XV/Xq1fX000/r2LFjDm169+4tb29vHTt2TF27dpW3t7dq1KihoUOHOh2rorDVYvuxWq2qU6eOxo0bJ8MwSmTdw4cP6+WXX1bdunVVsWJFVatWTY8//niBQ1wKurYuP//zOw83b96c7/tMfrUW9J5U1Fptx3vLli0O80+dOpXve1Bx3pdOnjypvn37KjAwUF5eXmrSpInmzJnj0Mb2Hjtp0iSnvjZq1Cjf3x9ff/21U1tvb2+nTyj++9//6vHHH5e/v78qVaqke+65R//+978d2lx+DlepUk
XR0dGqWbOmOnXqVODvj/zWL+zcsV2rp06dKnBb4eHhDn3YuHGjoqKi9PLLLyswMFBWq1WNGjXSP/7xD6d1MzIy9Morryg0NFRWq1V169bVpEmTnM59i8Wi/v37a+7cuapbt668vLwUFRWltWvXOrSz1Xu5VatWyWq16sUXX3SYf+zYMT377LP2Ghs2bKiZM2de9XUry7gDfJOxhdVq1apJ+vONYeHChXr88ccVERGh1NRU/f3vf1fr1q21e/duhYSESJJyc3P10EMPacWKFerevbv++te/6ty5c0pISNDOnTtVu3Zt+z6efPJJdezY0WG/I0aMyLeet99+WxaLRcOHD9fJkyc1ZcoUPfDAA0pOTraPu1q5cqViY2MVFRWl0aNHy83NTbNmzdL999+vdevW6e6773babs2aNTVu3DhJ0vnz5/XSSy/lu++RI0fqiSee0HPPPafff/9d06ZNU6tWrfTzzz+rSpUqTuv069dPLVu2lCQtWLBA3377rcPyF154QbNnz1afPn00cOBAHTx4UB988IF+/vlnbdiwQR4eHvm+DsVx9uxZe98ul5eXpy5dumj9+vXq16+f6tevrx07dui9997Tf/7zH6cwmZ/LXzebJUuW6F//+pdT23379ukvf/mLXnzxRfXq1UuzZs3S448/rqVLl6p9+/YF7uOzzz7Tjh07nOZnZGTokUceUXh4uC5cuKDZs2erW7duSkpKyvcYF2b27Nny9vbWkCFD5O3trZUrV2rUqFFKT0/XxIkTC1xv1qxZeuONN/Tuu++qR48e9vnz589XZmamXnrpJVWrVk2bNm3StGnT9Ntvv2n+/PnFri8hIUEPPfSQgoOD9de//lVBQUHas2ePFi9ebP9iS2JiomJjY3XrrbcqPj5eFy5c0LRp09SiRQtt27ZN4eHhDtt84oknFBERoXHjxmnbtm365z//qYCAAL3zzjv2Ns8995w+//xz9ejRQ/fee69WrlypTp06FVrvlClTNGDAAHl7e+v111+XJAUGBhbY3nYd3HXXXRo3bpxSU1M1depUbdiwwen6ys3NVYcOHdS8eXNNmjRJiYmJevfdd1W7du18r92ieO2111S/fn1duHDB/h/tgIAA9e3b97rX3bx5s3788Ud1795dNWvW1KFDhzRjxgy1adNGu3fvznd4Wb169eyv26lTpzR48OBC6xg+fHihbQYOHKi77rpLkvTpp58qISHBYfm11HotCnpfunDhgtq0aaP9+/erf//+ioiI0Pz589W7d2+dPXu21L/ElZqaqnvvvVeZmZkaOHCgqlWrpjlz5qhLly76+uuv9cgjjxS47tq1a7VkyZJi7e96zruCnD59Wlu2bFGFChUUFxen2rVra+HCherXr59Onz6t//u//5MkGYahLl26aNWqVerbt6+aNm2qZcuWadiwYTp27Jjee+89h+2uWbNGX375pQYOHCir1aoPP/xQDz74oDZt2pTvjQZJ+uWXX9S1a1d17NhR06dPt89PTU3VPffcYw/WNWrU0A8//KC+ffsqPT1dgwYNuub+u4yBcmnWrFmGJCMxMdH4/fffjaNHjxpffPGFUa1aNaNixYrGb7/9ZhiGYVy8eNHIzc11WPfgwYOG1Wo1xo4da583c+ZMQ5IxefJkp33l5eXZ15NkTJw40alNw4YNjdatW9unV61aZUgybrnlFiM9Pd0+/6uvvjIkGVOnTrVv+/bbbzc6dOhg349hGEZmZqYRERFhtG/f3mlf9957r9GoUSP79O+//25IMkaPHm2fd+jQIcPd3d14++23HdbdsWOHUaFCBaf5+/btMyQZc+bMsc8bPXq0cfklsm7dOkOSMXfuXId1ly5d6jQ/LCzM6NSpk1PtcXFxxpWX3ZW1v/rqq0ZAQIARFRXl8Jp+9tlnhpubm7Fu3TqH9T/66CNDkrFhwwan/V2udevWRsOGDZ3mT5w40ZBkHDx40KF+ScY333xjn5eWlmYEBwcbzZo1s8+znY
e2dS9evGjUqlXLiI2NNSQZs2bNKrCekydPGpKMSZMmXVONmZmZTu1eeOEFo1KlSsbFixcdtml7Hf/9738bFSpUMF5
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение 'condition' в тестовой выборке:\n",
" condition\n",
"3 2069\n",
"4 891\n",
"5 256\n",
"2 24\n",
"1 2\n",
"Name: count, dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_8140\\3337968062.py:11: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.barplot(x=condition_counts.index, y=condition_counts.values, palette='viridis')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAsAAAAIjCAYAAAAN/63DAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABbEElEQVR4nO3de3yP9f/H8edns4PZgWGnmm2UY0YtNKcI01pKKUlFEh2GLyJffcPoMKfEt6R8vzl8w7fSgZIvhhwbiZZjfRERNjltDNvsc/3+6Pv5/Hx8NjY2n831uN9uu+W6rvd1Xa/357quz55dn/fnmsUwDEMAAACASbi5ugAAAADgeiIAAwAAwFQIwAAAADAVAjAAAABMhQAMAAAAUyEAAwAAwFQIwAAAADAVAjAAAABMhQAMAABQiOzsbB08eFAnT550dSkoQQRgAECpsFgsSkpKsk/PmjVLFotF+/fvv+K6q1atksVi0apVq0qtPrjWm2++KavVKkmyWq1KTk52cUX/b/78+WrXrp38/Pzk6+urGjVqaPz48a4uCyWIAIwis/3ysv14e3urdu3a6tevnzIyMlxdHmAqtoBYlDBZlr333nuaNWuWy/ZvsVhcun8zmz17tiZOnKjff/9db731lmbPnu3qkiRJf/3rX9W1a1f5+fnpH//4h1JSUrR8+XK9+OKLri4NJaiCqwtA+TNmzBhFRUXp/PnzWrdunaZNm6bFixdr+/bt8vHxcXV5AMqop556St26dZOXl5d93nvvvadq1arp6aefdmjbunVrnTt3Tp6ente5SlwvY8aMUY8ePTRs2DB5eXlpzpw5ri5Jq1ev1rhx45ScnKy//vWvri4HpYgAjGKLj4/XnXfeKUl69tlnVbVqVU2aNEkLFy7U448/7uLqAJRV7u7ucnd3L1JbNzc3eXt7l3JFcKXHHntMbdu21Z49e3TrrbeqevXqri5JEydOVPPmzQm/JsAQCFyze+65R5K0b98+SdKJEyc0ZMgQNWzYUL6+vvL391d8fLx++uknp3XPnz+vpKQk1a5dW97e3goNDdXDDz+svXv3SpL279/vMOzi0p82bdrYt2X7SPiTTz7RK6+8opCQEFWqVEkPPPCADh486LTvjRs36t5771VAQIB8fHx09913a/369QX2sU2bNgXu/+LxjTZz5sxRTEyMKlasqMDAQHXr1q3A/V+ubxezWq2aPHmyGjRoIG9vbwUHB+u5555z+kJGZGSk7r//fqf99OvXz2mbBdU+YcIEp9dUknJycjRq1Cjdcsst8vLyUnh4uF5++WXl5OQU+FpdrLDXzfZz8cf3tvqXLVumxo0by9vbW/Xr19cXX3zhsM2CxpFarVZFR0c7fZydlJSk+vXr28/Du+66SwsWLHCq8bbbbnOqfeLEiU77WbhwoRISEhQWFiYvLy/VqlVLr732mvLz8522eenr+MYbb8jNzU3z5s2zz1u7dq0effRR1ahRw/7aDho0SOfOnbv8C3sZGzdu1H333acqVaqoUqVKio6O1pQpUxzarFy5Uq1atVKlSpVUuXJlPfjgg9q1a5dDm6SkJFksFu3Zs0dPP/20KleurICAAPXq1Utnz551aJuTk6NBgwapevXq8vPz0wMPPKDff//dqbZLj11kZKR27Nih1atXO13ThY0Bnj9/vv36qlatmp588kkdOnTIoc3TTz8tX19fHTp0SJ07d5avr6+qV6+uIUOGOB2rorDVYvvx8vJS7dq1lZycLMMwirzeld5DDh06pGeeeUbBwcHy8vJSgwYNNGPGDKftXu5980rvmRaLxeFu+6+//qpHH31UgYGB8vHx0V133aVvvvnmqvv/448/Kj4+Xv7+/vL19VW7du20YcMGhzYXnwdBQUFq3ry5qlatWuA1XJBLh+P5+PioYcOG+uc//+nQznYeXM6lx2DDhg267bbb1K1bNwUGBqpixYpq0qSJ0/uGJB09elS9e/dWcHCwvL
291ahRI6dhHLbjMXHiRL399tuKiIhQxYoVdffdd2v79u1O9UZGRjrMmzNnjtzc3DR27FiH+T///LMeeeQRBQYGytvbW3feeae++uqry/YVjrgDjGtmC6tVq1aV9Ocb6oIFC/Too48qKipKGRkZ+uCDD3T33Xdr586dCgsLkyTl5+fr/vvv14oVK9StWzf95S9/0enTp5WSkqLt27erVq1a9n08/vjjuu+++xz2O3z48ALreeONN2SxWDRs2DAdPXpUkydPVvv27ZWWlqaKFStK+jMAxMfHKyYmRqNGjZKbm5tmzpype+65R2vXrlXTpk2dtnvzzTfbv6Rx5swZvfDCCwXue8SIEerataueffZZ/fHHH3rnnXfUunVr/fjjj6pcubLTOn379lWrVq0kSV988YW+/PJLh+XPPfecZs2apV69emnAgAHat2+f3n33Xf34449av369PDw8CnwdiuPUqVMFfgHFarXqgQce0Lp169S3b1/Vq1dP27Zt09tvv63//ve/Bf5SuNTFr5vN4sWL9e9//9up7e7du/XYY4/p+eefV8+ePTVz5kw9+uijWrJkiTp06FDoPj766CNt27bNaX52drYeeughRUZG6ty5c5o1a5a6dOmi1NTUAo/xlcyaNUu+vr4aPHiwfH19tXLlSo0cOVJZWVmaMGFCoevNnDlTr776qt566y11797dPn/+/Pk6e/asXnjhBVWtWlXff/+93nnnHf3++++aP39+setLSUnR/fffr9DQUP3lL39RSEiIdu3apUWLFukvf/mLJGn58uWKj49XzZo1lZSUpHPnzumdd95RixYttGXLFqdfwF27dlVUVJSSk5O1ZcsW/fOf/1RQUJDGjRtnb/Pss89qzpw56t69u5o3b66VK1cqISHhivVOnjxZ/fv3l6+vr/72t79JkoKDgwttb7sOmjRpouTkZGVkZGjKlClav3690/WVn5+vjh07qlmzZpo4caKWL1+ut956S7Vq1Srw2i2KV155RfXq1dO5c+fs/6MdFBSk3r17F9i+Xr16+uijj+zT06dP165du/T222/b50VHR0uSMjIydNddd8lisahfv36qXr26/vOf/6h3797KysrSwIED7f263Ptm+/btHfZpe0+5eJ7tvTUjI0PNmzfX2bNnNWDAAFWtWlWzZ8/WAw88oM8++0wPPfRQsfq/Y8cOtWrVSv7+/nr55Zfl4eGhDz74QG3atNHq1avVrFmzQl/bwq7hy3n77bdVrVo1ZWVlacaMGerTp48iIyPVvn37Ym3nYsePH9f06dPl6+urAQMGqHr16pozZ44efvhhzZ071/4p57lz59SmTRvt2bNH/fr1U1RUlObPn6+nn35ap06dsl9vNv/61790+vRpJSYm6vz585oyZYruuecebdu2rdBzftmyZXrmmWfUr18/hzvSO3bsUIsWLXTTTTfpr3/9qypVqqRPP/1UnTt31ueff+503FAIAyiimTNnGpKM5cuXG3/88Ydx8OBB4+OPPzaqVq1qVKxY0fj9998NwzCM8+fPG/n5+Q7r7tu3z/Dy8jLGjBljnzdjxgxDkjFp0iSnfVmtVvt6kowJEyY4tWnQoIFx991326e//fZbQ5Jx0003GVlZWfb5n376qSHJmDJlin3bt956q9GxY0f7fgzDMM6ePWtERUUZHTp0cNpX8+bNjdtuu80+/ccffxiSjFGjRtnn7d+/33B3dzfeeOMNh3W3bdtmVKhQwWn+7t27DUnG7Nmz7fNGjRplXHxZrl271pBkzJ0712HdJUuWOM2PiIgwEhISnGpPTEw0Lr3UL6395ZdfNoKCgoyYmBiH1/Sjjz4y3NzcjLVr1zqs//777xuSjPXr1zvt72J333230aBBA6f5EyZMMCQZ+/btc6hfkvH555/b52VmZhqhoaHG7bffbp9nOw9t654/f96oUaOGER8fb0gyZs6cWWg9R48eNSQZEydOvKoaz54969TuueeeM3x8fIzz58
87bNP2On7zzTdGhQoVjJdeeslp3YK2l5ycbFgsFuO3334rtB8FuXDhghEVFWVEREQYJ0+edFh28XneuHFjIygoyDh
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Проверка необходимости аугментации для признака 'condition' в обучающей выборке:\n",
"Минимальное количество наблюдений в классе: 24\n",
"Максимальное количество наблюдений в классе: 9837\n",
"Выборка 'обучающей' несбалансирована, рекомендуется аугментация.\n",
"\n",
"Проверка необходимости аугментации для признака 'condition' в валидационной выборке:\n",
"Минимальное количество наблюдений в классе: 4\n",
"Максимальное количество наблюдений в классе: 2125\n",
"Выборка 'валидационной' несбалансирована, рекомендуется аугментация.\n",
"\n",
"Проверка необходимости аугментации для признака 'condition' в тестовой выборке:\n",
"Минимальное количество наблюдений в классе: 2\n",
"Максимальное количество наблюдений в классе: 2069\n",
"Выборка 'тестовой' несбалансирована, рекомендуется аугментация.\n",
"\n",
"Распределение классов после SMOTE (oversampling):\n",
"condition\n",
"3 9837\n",
"5 9837\n",
"4 9837\n",
"2 9837\n",
"1 9837\n",
"Name: count, dtype: int64\n",
"Распределение классов после RandomUnderSampler (undersampling):\n",
"condition\n",
"1 24\n",
"2 24\n",
"3 24\n",
"4 24\n",
"5 24\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"if 'condition' in df.columns:\n",
" X_train, X_temp, y_train, y_temp = train_test_split(df.drop(['price'], axis=1), df['condition'], test_size=0.3, random_state=42)\n",
" X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
"\n",
" def analyze_condition_distribution(data, name):\n",
" \"\"\"Проверка и визуализация распределения признака 'condition'\"\"\"\n",
" condition_counts = data.value_counts()\n",
" print(f\"Распределение 'condition' в {name} выборке:\\n\", condition_counts)\n",
" \n",
" plt.figure(figsize=(8, 6))\n",
" sns.barplot(x=condition_counts.index, y=condition_counts.values, palette='viridis')\n",
" plt.title(f\"Распределение признака 'condition' в {name} выборке\")\n",
" plt.xlabel('Condition')\n",
" plt.ylabel('Count')\n",
" plt.grid(True)\n",
" plt.show()\n",
"\n",
" analyze_condition_distribution(y_train, 'обучающей')\n",
" analyze_condition_distribution(y_val, 'валидационной')\n",
" analyze_condition_distribution(y_test, 'тестовой')\n",
"\n",
" def check_condition_augmentation(data, name):\n",
" print(f\"Проверка необходимости аугментации для признака 'condition' в {name} выборке:\")\n",
" min_count = data.value_counts().min()\n",
" max_count = data.value_counts().max()\n",
" print(f\"Минимальное количество наблюдений в классе: {min_count}\")\n",
" print(f\"Максимальное количество наблюдений в классе: {max_count}\")\n",
" \n",
" if max_count > min_count * 1.5:\n",
" print(f\"Выборка '{name}' несбалансирована, рекомендуется аугментация.\\n\")\n",
" else:\n",
" print(f\"Выборка '{name}' сбалансирована, аугментация не требуется.\\n\")\n",
"\n",
" check_condition_augmentation(y_train, 'обучающей')\n",
" check_condition_augmentation(y_val, 'валидационной')\n",
" check_condition_augmentation(y_test, 'тестовой')\n",
"else:\n",
" print(\"Признак 'condition' отсутствует в данных.\")\n",
"\n",
"smote = SMOTE(random_state=42)\n",
"X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)\n",
"\n",
"print(\"Распределение классов после SMOTE (oversampling):\")\n",
"print(pd.Series(y_train_smote).value_counts())\n",
"\n",
"undersampler = RandomUnderSampler(random_state=42)\n",
"X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)\n",
"\n",
"print(\"Распределение классов после RandomUnderSampler (undersampling):\")\n",
"print(pd.Series(y_train_under).value_counts())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В этом исследование данные не сбалансированы, поэтому требуется аугментация."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Данные о населении</h2>"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
"no \n",
"1 China 1,439,323,776 0.39% 5,540,090 \n",
"2 India 1,380,004,385 0.99% 13,586,631 \n",
"3 United States 331,002,651 0.59% 1,937,734 \n",
"4 Indonesia 273,523,615 1.07% 2,898,047 \n",
"5 Pakistan 220,892,340 2.00% 4,327,022 \n",
"\n",
" Density (P/Km²) Land Area (Km²) Migrants (net) Fert. Rate Med. Age \\\n",
"no \n",
"1 153 9,388,211 -348,399 1.7 38 \n",
"2 464 2,973,190 -532,687 2.2 28 \n",
"3 36 9,147,420 954,806 1.8 38 \n",
"4 151 1,811,570 -98,955 2.3 30 \n",
"5 287 770,880 -233,379 3.6 23 \n",
"\n",
" Urban Pop % World Share \n",
"no \n",
"1 61% 18.47% \n",
"2 35% 17.70% \n",
"3 83% 4.25% \n",
"4 56% 3.51% \n",
"5 35% 2.83% \n",
"\n",
"Country (or dependency)\n",
"Population 2020\n",
"Yearly Change\n",
"Net Change\n",
"Density (P/Km²)\n",
"Land Area (Km²)\n",
"Migrants (net)\n",
"Fert. Rate\n",
"Med. Age\n",
"Urban Pop %\n",
"World Share\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"df2 = pd.read_csv(\"..//static//csv//WorldPopulation.csv\", index_col=\"no\")\n",
"\n",
"print(df2.head(), \"\\n\")\n",
"print(*list(df2.columns), sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Объектом наблюдения является страны и информация о их наслении.<br>\n",
"Атрибуты объекта: Страна, Население, Годовое изменение, NetChange, Плотность, Площадь суши, Мигранты, Fert.Rate, Средний возраст, UrbanPop%, Доля в мире;<br>\n",
"Связь между объектами: имеется связь между атрибутами, например между Коэффициент фертильности и Плотностю населения.<br>"
]
},
{
"cell_type": "code",
2024-10-12 02:16:37 +04:00
"execution_count": 49,
2024-10-10 15:58:16 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-10-12 02:16:37 +04:00
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\2445382032.py:10: SettingWithCopyWarning: \n",
2024-10-10 15:58:16 +04:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_clean['Density (P/Km²)'] = pd.cut(df2['Density (P/Km²)'], bins=range(0, 1000, 100))\n"
]
},
{
"data": {
2024-10-12 02:16:37 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0EAAAJSCAYAAAAFyM5YAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACOKklEQVR4nOzdd3gUVdvH8d8mkEILPYBACCWUQOjSQxMBQboUUYooSlcfRRGRLvrooyICgoUu0kGsKCIIoiJSpIggoUgJTRJCenLeP3h3zZKAWdhlk+z3c125YM/Mzt65c3Zm7ilnLMYYIwAAAADwEF7uDgAAAAAA7iSKIAAAAAAehSIIAAAAgEehCAIAAADgUSiCAAAAAHgUiiAAAAAAHoUiCAAAAIBHoQgCAAAA4FEoggAAAAB4FIogAAAAAB6FIghZwvz582WxWGw/fn5+CgkJ0fDhwxUZGenu8AAAAJCD5HJ3AEBakyZNUnBwsOLj47V161bNnj1bn3/+ufbt26c8efK4OzwAAADkABRByFLat2+vevXqSZIeffRRFSlSRG+88YbWrVunPn36uDk6AAAA5ARcDocsrVWrVpKkiIgISdKlS5f0zDPPqEaNGsqXL58KFCig9u3ba8+ePeneGx8frwkTJigkJER+fn4qWbKkunXrpj///FOSdOzYMbtL8K7/adGihW1Z3333nSwWi5YtW6YXXnhBJUqUUN68edWpUyedPHky3Wf/9NNPateunQICApQnTx41b95c27Zty/B3bNGiRYafP2HChHTzLl68WHXr1pW/v78KFy6s3r17Z/j5N/vd0kpNTdVbb72l0NBQ+fn5KTAwUI8//rj+/vtvu/nKlSunjh07pvuc4cOHp1tmRrG/9tpr6XIqSQkJCRo/frwqVqwoX19flSlTRqNHj1ZCQkKGubIaP368cufOrfPnz6ebNnjwYBUsWFDx8fF2sW/YsEG1atWSn5+fqlWrptWrV6d779GjR/XAAw+ocOHCypMnjxo2bKjPPvvMbh5rX7D++Pr6KiQkRNOmTZMxxjbfhAkTZLFYdOHCBbv3//LLL7JYLJo/f75d+8qVK1WvXj3lz5/fbvmvv/76TXNhvZR0y5Ytevzxx1WkSBEVKFBA/fr1S/d3lKRZs2YpNDRUvr6+KlWqlIYNG6bLly/bpicmJqp58+YqX768XX6tv09ar7/+unLlyqXPP/883efcqP999913dp/10ksvqW7dugoICFDevHnVrFkzbdq0Kd3yHOmrFotFTz75ZLpltG3bVhaLJV1fPnfunAYNGqTAwED5+fmpZs2aWrBgQYYxTJ8+XTVq1JCfn5+KFSumdu3a6Zdffrnp73z9OsXah9LmQpI6dOhww+9+Wtf3wet/BgwYYDd/Zvr1jdzsc8qVK5dufms/+beYdu3apfbt26tAgQLKly+fWrdurR9//NE2/fpLpDP6sX6HBgwYoHz58qWLZeXKlRnmecWKFbb1aNGiRfXQQw/p1KlT6d7/+++/q2fPnipWrJj8/f1VuXJljR079qa/Z0Z9vUWLFunWfTt27MhwnZyRFi1aqHr16unaX3/9dVksFh07dszWtm7dOnXo0EGlSpWSr6+vKlSooMmTJyslJSXd+3/66Sfdd999KlSokPLmzauwsDBNnz49XQ569OihwoULy8/PT/Xq1dMnn3xywzhv9ndK+7n/tn10ZP05YMCAdH3x5MmT8vf3T5ef5ORkTZkyRSEhIfL19bWL0/o9hufgTBCyNGvBUqRIEUnXNuZr167VAw88oODgYEVGRmrOnDlq3ry5Dhw4oFKlSkmSUlJS1LFjR23cuFG9e/fWqFGjdOXKFX399dfat2+fKlSoYPuMPn366L777rP73DFjxmQYz9SpU2WxWPTcc8/p3Llzeuutt3TPPfdo9+7d8vf3lyR9++23at++verWravx48fLy8tL8+bNU6tWrfT999/r7rvvTrfc0qVLa9q0aZ
KkmJgYDRkyJMPPHjdunHr27KlHH31U58+f14wZMxQeHq5du3apYMGC6d4zePBgNWvWTJK0evVqrVmzxm76448/rvnz52vgwIEaOXKkIiIi9M4772jXrl3atm2bcufOnWEeHHH58mXb75ZWamqqOnXqpK1bt2rw4MGqWrWqfvvtN7355pv6448/tHbt2hsu8+GHH9akSZO0bNkyDR8+3NaemJiolStXqnv37vLz87O1Hz58WL169dITTzyh/v37a968eXrggQf05Zdfqk2bNpKkyMhINW7cWLGxsRo5cqSKFCmiBQsWqFOnTlq5cqW6du1qF8MLL7ygqlWrKi4uzlYcFy9eXIMGDXI4R9u3b1fPnj1Vs2ZNvfLKKwoICNCFCxf01FNPZXoZw4cPV8GCBTVhwgQdOnRIs2fP1vHjx207zNK1HYuJEyfqnnvu0ZAhQ2zz7dixw/b39vHx0erVq9WwYUN17dpVGzdulK+vb7rPW7dunZ577jm99dZb6b4/Vm3atFG/fv0kXdvpe/vtt+2mR0dH6/3331efPn302GOP6cqVK/rggw/Utm1b/fzzz6pVq5ZtXkf6qp+fn5YsWaLXXnvN1v7XX39p48aNdv1CkuLi4tSiRQsdOXJEw4cPV3BwsFasWKEBAwbo8uXLGjVqlG3eQYMGaf78+Wrfvr0effRRJScn6/vvv9ePP/6oevXqadGiRbZ5v//+e82dO1dvvvmmihYtKkkKDAy84d9vy5YtGRaTNzNy5EjVr1/fru3RRx+1e+1ov85I2r+j1f/+978Mi2yrtLm4vh/v379fzZo1U4ECBTR69Gjlzp1bc+bMUYsWLbR582Y1aNBA4eHhdsuYOnWqJNmKEElq3Ljxv8Z+PWsfql+/vqZNm6bIyEhNnz5d27Zts1uP7t27V82aNVPu3Lk1ePBglStXTn/++afWr1+vqVOnqlu3bqpYsaLd71i1alUNHjzY1la1atUbxvHcc885HHtmf798+fLp6aefVr58+fTtt9/qpZdeUnR0tF577TXbfF9//bU6duyokiVLatSoUSpRooQOHjyoTz/91Nbn9+/fryZNmuiuu+7S888/r7x582r58uXq0qWLVq1alWHfqVKliu1vlNE67Fa2j7fipZdesh0IS+t///ufxo0bp65du+q5556Tr6+v7bsKD2SALGDevHlGkvnmm2/M+fPnzcmTJ83HH39sihQpYvz9/c1ff/1ljDEmPj7epKSk2L03IiLC+Pr6mkmTJtnaPvzwQyPJvPHGG+k+KzU11fY+Sea1115LN09oaKhp3ry57fWmTZuMJHPXXXeZ6OhoW/vy5cuNJDN9+nTbsitVqmTatm1r+xxjjImNjTXBwcGmTZs26T6rcePGpnr16rbX58+fN5LM+PHjbW3Hjh0z3t7eZurUqXbv/e2330yuXLnStR8+fNhIMgsWLLC1jR8/3qT9yn///fdGklmyZInde7/88st07UFBQaZDhw7pYh82bJi5fjVyfeyjR482xYsXN3Xr1rXL6aJFi4yXl5f5/vvv7d7/7rvvGklm27Zt6T4vrUaNGpkGDRrYta1evdpIMps2bbKLXZJZtWqVrS0qKsqULFnS1K5d29b25JNPGkl28Vy5csUEBwebcuXK2fqdtS+k/Yz4+Hjj5eVlhg4damuz5vv8+fN2Me7YscNIMvPmzbO1jRkzxkgyZ86csbXdrH+mZf3u1K1b1yQmJtra//vf/xpJZt26dcYYY86dO2d8fHzMvffea/cdeuedd4wk8+GHH9ot99ChQ6ZQoULmoYcesvt9jDFm165dJm/evGbYsGEZxpSYmGgkmeHDh9vaVqxYkS5vycnJJiEhwe69f//9twkMDDSPPPKIrc3RvtqmTRtTtGhRs3LlSlv75MmTTePGjdP15bfeestIMosXL7aLv1GjRiZfvny27/u3335rJJmRI0em+33TftetrH+XiIiIdNMy6kMNGjQw7du3T/f9yYj1/StWrE
g3LW/evKZ///6215nt1zciKcO/c4cOHUxQUFC69rFjxxqLxWLXFhQUZBdTly5djI+Pj/nzzz9tbadPnzb58+c34eH
2024-10-10 15:58:16 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df2['Fert. Rate'] = pd.to_numeric(df2['Fert. Rate'], errors='coerce')\n",
"\n",
"\n",
"df2['Fert. Rate'] = pd.to_numeric(df2['Fert. Rate'], errors='coerce')\n",
"\n",
"df_clean = df2.dropna(subset=['Fert. Rate', 'Density (P/Km²)'])\n",
" \n",
"## correlation = df_clean[['Density (P/Km²)', 'Fert. Rate']].corr().iloc[0, 1] ## использовать только один раз потом удалить\n",
"\n",
"df_clean['Density (P/Km²)'] = pd.cut(df2['Density (P/Km²)'], bins=range(0, 1000, 100))\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(data=df_clean, x='Density (P/Km²)', y='Fert. Rate')\n",
"plt.title('Распределение уровня рождаемости по плотности населения')\n",
"plt.xlabel('Плотность (чел./км²)')\n",
"plt.ylabel('Уровень рождаемости')\n",
"plt.xticks(rotation=45)\n",
2024-10-12 02:16:37 +04:00
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В регионах с низкой плотностью населения рождаемость значительно варьируется, в то время как в регионах с плотностью более 400 чел./км² наблюдается более стабильная и высокая рождаемость."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Бизнес-цели</h3>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. **Бизнес-цель**: Определение наилучших стран для выхода на рынок товаров и услуг, связанных с материнством и детством.<br>\n",
" Эффект для бизнеса: Возможность выбора стран с высоким уровнем рождаемости и значительным числом молодых семей для запуска маркетинговых кампаний и открытия новых филиалов.\n",
"- **Цели технического проекта**:\n",
" - Построить модель для определения стран с высоким потенциалом развития рынка товаров для детей и матерей.\n",
" - **Входные признаки**: Плотность населения, Уровень рождаемости, Средний возраст, Доля городского населения.\n",
" - **Целевой признак**: Страна с высоким или низким потенциалом для выхода на рынок.\n",
"2. **Бизнес-цель**: Оптимизация стратегий миграционной политики.\n",
" Эффект для бизнеса: Компании, предоставляющие услуги в области миграции, найма и адаптации, могут использовать эти данные для выбора мест, где их услуги будут наиболее востребованы.\n",
"- **Цели технического проекта**:\n",
" - Построить модель, определяющую страны с наибольшим оттоком мигрантов и прогнозировать влияние миграции на рынок труда.\n",
" - **Входные признаки**: Годовое изменение населения, Плотность населения, Доля городского населения.\n",
" - **Целевой признак**: Уровень чистой миграции.\n",
"3. **Бизнес-цель**: Определение экономического потенциала стран на основе плотности населения и уровня урбанизации.\n",
" Эффект для бизнеса: Компании могут определить страны с высоким уровнем урбанизации и значительной плотностью населения для открытия новых офисов, производств или филиалов.\n",
"- **Цели технического проекта**:\n",
" - Создать модель для ранжирования стран по их экономическому потенциалу на основе демографических данных.\n",
" - **Входные признаки**: Плотность населения, Уровень урбанизации, Средний возраст, Доля городского населения.\n",
" - **Целевой признак**: Оценка экономического потенциала."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**<h3>Поиск проблем</h3>**"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество пропущеных ячеек: \n",
" Country (or dependency) 0\n",
"Population 2020 0\n",
"Yearly Change 0\n",
"Net Change 0\n",
"Density (P/Km²) 12\n",
"Land Area (Km²) 0\n",
"Migrants (net) 34\n",
"Fert. Rate 34\n",
"Med. Age 34\n",
"Urban Pop % 13\n",
"World Share 0\n",
"dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA9oAAAIQCAYAAAB+ExYhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUl0lEQVR4nO3deVxUZf//8TeMAoqCispShJKVS06ilmEWmbhn2XpLampqj1vQ1MxKMlzDslULsSyXW/PrfbeoZUmS5bRIai4hamaKy52AigqaCTlzfn/4Y25H0EBPjg6v5+Mxj5pzXefM5xyGkfecc67LyzAMQwAAAAAAwBTe7i4AAAAAAABPQtAGAAAAAMBEBG0AAAAAAExE0AYAAAAAwEQEbQAAAAAATETQBgAAAADARARtAAAAAABMRNAGAAAAAMBEBG0AgEczDEOHDx/Wjh073F0KAACoJAjaAACPc+zYMY0dO1Y33HCDfHx8FBQUpOuvv17bt293d2nAZW/8+PHy8vJydxkAcEUjaAPAZWTu3Lny8vI67+PGG290d5mXtfz8fEVHR2v69Ol68MEHtXTpUqWnp2vVqlVq0KCBu8tDJdegQQOX32c/Pz9dd911Gj16tA4fPuzu8i7ajBkzNHfuXHeXAQBuV8XdBQAASps4caIaNmxYavkLL7zghmquLKNHj1ZOTo4yMjLUrFkzd5cDlNKiRQuNGjVKknTy5EmtX79eb7zxhmw2m9auXevm6i7OjBkzVLduXfXv39/dpQCAWxG0AeAy1LVrV7Vu3brU8nfffVeHDh1yQ0VXhgMHDmjevHmaOXMmIRuXrauuukp9+vRxPh80aJBq1KihV155RTt27NB1113nxuoAAGbg0nEAuMKdOnVKkyZN0rXXXitfX181aNBAiYmJKioqcunXoEGDUmeZPvjgA3l5eZW6pNrhcGjatGlq3ry5/Pz8VK9ePXXp0kU//vijs4+Xl5fGjx/vUke3bt1Up04dbd269bw133nnnS6Xz9atW1fdu3dXVlaWSz8vLy8NHTr0nNspudR+9+7dkqR169bJ4XCouLhYrVu3lp+fn4KCghQXF6e9e/e6rNu/f3/VqFFDu3btUufOneXv76+wsDBNnDhRhmG49P399981atQohYeHy9fXVzfccINeeeWVUv0kacGCBbrllltUvXp11a5dW3fccYdWrFjh0mfGjBlq1qyZfH19FRYWpoSEBB09evSCjtG5rFmzRt26dVPt2rXl7+8vq9WqadOmnXedv7p14cyftyRt3LhRXbt2VUBAgGrUqKEOHTrohx9+KHObPj4+OnjwoEtbRkaGc9tnvrek0+/NVq1aqVq1aqpbt6769Omj3377zaVPyc/wTB9++KG8vLy0atUql+U1atQo9f7/q/eXJO3evVteXl4ul0P/8ssvuu+++1S7dm1Vq1ZNN998s5YsWXLe7fyVkJAQSVKVKq7nQL766ivdfvvt8vf3V61atXTvvfdq27ZtzvY5c+bIy8tLs2fPdlkvOTlZXl5e+vzzz13245VXXtHrr7+uiIgIVatWTTExMeV6T5Xnc6ZBgwbasmWLbDab8+d65513XughAYArGme0AeAKN2jQIM2bN08PPvigRo0apTVr1mjKlCnatm2bFi9efM71Tp06peeee67MtoEDB2ru3Lnq2rWrBg0apFOnTunbb7/VDz/8UOaZ9pI6Vq1apfT0dDVt2vQv627cuLGee+45GYahnTt36rXXXlO3bt1KBeKKyM/PlyQNHTpUrVq10osvvqiDBw9q+vTp+u6777Rx40bVrVvX2d9ut6tLly669dZbNXXqVKWlpWncuHE6deqUJk6cKOn0qOX33HOPvv76aw0cOFAtWrTQF198odGjR+u3337T66+/7tzehAkTNH78eLVt21YTJ06Uj4+P1qxZo6+++kqdOnWSdHqgqQkTJig2NlZDhgzR9u3blZqaqnXr1un7779X1apVL/
oYpaen6+6771ZoaKiGDx+ukJAQbdu2TcuWLdPw4cP/8jiefevC8ePHNWTIEJc+W7Zs0e23366AgAA9/fTTqlq1qt5++23deeedstlsatOmjUt/i8WiBQsWaOTIkc5lc+bMkZ+fn06ePOnSd+7cuRowYIBuvvlmTZkyRXl5eZo2bZq+//57bdy4UbVq1frLffi7HD58WHfccYeOHTumJ554QiEhIVqwYIHuv/9+vf/++4qLi/vLbfz555/OK1NOnjypjRs36rXXXtMdd9zhcty//PJLde3aVZGRkRo/frz++OMPvfnmm7rtttu0YcMGNWjQQAMGDNDHH3+sJ598Uh07dlR4eLg2b96sCRMmaODAgerWrZvLa//rX//SsWPHlJCQoJMnT2ratGm66667tHnzZgUHB5+z5vJ8zrzxxhsaNmyYatSo4fxsOd82AcCjGQCAy8acOXMMSca6devKbI+JiTGaNWvmfL5p0yZDkjFo0CCXfk899ZQhyfjqq6+cyyIiIox+/fo5n8+YMcPw9fU12rdvb0RERDiXf/XVV4Yk44knnij1+g6Hw/n/koxx48YZhmEYY8aMMSwWi7FkyZJy7WdMTIwRExPjsiwxMdGQZBw4cMDlNRISEs65nZLjlZ2d7fK8adOmxokTJ5z9vv76a0OSMWrUKOeyfv36GZKMYcOGuexf9+7dDR8fH+PgwYOGYRjGkiVLDEnG5MmTXV77wQcfNLy8vIxff/3VMAzD2LFjh+Ht7W3cd999ht1ud+lbctwOHDhg+Pj4GJ06dXLp89ZbbxmSjNmzZ1f4GJ3t1KlTRsOGDY2IiAjjyJEjZdZxLud6/x08eNDl520YhtGzZ0/Dx8fH2Llzp3PZ/v37jZo1axp33HFHqW3GxcUZzZs3dy7//fffjYCAAOORRx5xec3i4mKjfv36xo033mj88ccfzv7Lli0zJBlJSUnOZf369TP8/f1dav3ggw8MScbXX3/tstzf39/l/W8Yf/3+MgzDyM7ONiQZc+bMMQzDMEaNGmVIMtLS0px9Tpw4YTRp0sQICQkxiouLz7u9iIgIQ1Kpx2233WYcOnTIpW+LFi2M+vXrG/n5+c5lP/30k+Ht7W08+uijzmU5OTlGnTp1jI4dOxpFRUVGVFSUcc011xgFBQWl9qNatWrGf//7X+fyNWvWGJKMkSNHOpeNGzfOOPNPxIp8zjRr1qzU+xYAKiMuHQeAK1jJZaFPPvmky/KSgZY+++yzMtc7ceKEJk6cqKFDh+qaa65xafvoo4/k5eWlcePGlVqvrCl/3nrrLU2ZMkXTp0/XvffeW+7aS87qHTx4UBkZGVq8eLGsVqvLGWfp9Bm/Q4cOKT8/Xw6Ho1zbTkhIULVq1ZzP77zzTrVq1arM43HmpcMllxIXFxfryy+/lHT6GFssFj3xxBMu640aNUqGYWj58uWSpCVLlsjhcCgpKUne3q7/vJYcty+//FLFxcUaMWKES5/BgwcrICCgVH3lPUZn2rhxo7KzszVixIhSZ37NmrLJbrdrxYoV6tmzpyIjI53LQ0ND9cgjj+i7775TYWGhyzp9+/bVzz//7LxE/KOPPlJgYKA6dOjg0u/HH3/UgQMHFB8fLz8/P+fy7t27q3Hjxud8T1+o8r6/jh8/rkOHDunzzz9X06ZN1blzZ2dbtWrVFB8fr9zcXG3YsOEvX7NNmzZKT09Xenq6li1bphdeeEFbtmzRPffcoz/++EOSlJOTo02bNql///6qU6eOc12r1aqOHTs6f/el05edp6SkKD09Xbfffrs2bdqk2bNnKyAgoNRr9+zZU1dddZXz+S233KI2bdq4bO9sF/o5AwCVGUEbAK5ge/bskbe3txo1auSyPCQkRLVq1dKePXvKXO+1117TyZMnlZiYWKpt586dCgsLc/nj/lyWL1/uvBS5olMTrV69WvXq1VP9+vXVtm1bnTp1ynnP+Jnee+891atXT3Xr1lW1atV0xx13lLqft0TJuo0bNy
7V1qRJE+e93CW8vb1dgqIkXX/99ZLk7Ltnzx6FhYWpZs2apbZX0i6dPm7e3t7nvWy+pO8NN9zgstzHx0eRkZGlfl7
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество выбросов (по Z-оценке): 7\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAnsAAAIXCAYAAADkApi2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADOa0lEQVR4nOzdd1QUVxvA4d8u0pUmSlFpIgI2VOxdUTDGaIoxGqNYY43RWBNjjRKjxhZLYsOWaOw12BJr7L13ggVQuoLU3e8PPldXQEFAVnyfc+bo3Llz986wDO/etgq1Wq1GCCGEEEIUSsqCroAQQgghhMg/EuwJIYQQQhRiEuwJIYQQQhRiEuwJIYQQQhRiEuwJIYQQQhRiEuwJIYQQQhRiEuwJIYQQQhRiEuwJIYQQQhRiEuwJIYQQQhRiEuwJIYQQQhRiEuwJ8QYEBgaiUCg4ceJEhmMLFixAoVDQtm1b0tLSCqB2QgghCjMJ9oQoQBs2bKBPnz40aNCAVatWoaenV9BVEkIIUchIsCdEAdm7dy8dOnTA09OTLVu2YGRkVNBVEkIIUQhJsCdEAThz5gxt2rTBzs6OHTt2YG5uniHPmjVrqF69OsbGxlhbW9OpUyfu3bunlcff35+iRYty69YtfH19MTU1xd7envHjx6NWqzX5goODUSgUTJ06lenTp+Po6IixsTGNGjXiwoULGV77ypUrfPLJJ1hZWWFkZIS3tzebN2/O9FoaN26MQqHIsAUGBmrlmzdvHhUrVsTExEQr39q1a7XKqlixYobXmDp1KgqFguDgYE3a067x59NUKhWVK1fO9PX//vtvGjRogKmpKRYWFrRp04bLly9r5Rk7diwKhYKIiAit9BMnTmQo8+m9f9HatWtRKBTs3btXk3bgwAHatWuHg4MDhoaGlClThkGDBvHkyZNMz/f29qZYsWJa92nq1KkZ8j7v6f0wMDDg4cOHWscOHz6sKef5oQTZqZe/v3+mP9/nt6c/AycnJ95//3127tyJl5cXRkZGeHp6sn79+kzrmp2fXU7uc3JyMqNHj6Z69eqYm5tjampKgwYN+Oeff15674Qo7IoUdAWEeNfcvHkTPz8/DA0N2bFjB3Z2dhnyBAYG0rVrV2rUqEFAQADh4eHMnDmTQ4cOcfr0aSwsLDR509LS8PPzo3bt2vz0008EBQUxZswYUlNTGT9+vFa5y5Yt49GjR/Tr14/ExERmzpxJ06ZNOX/+PDY2NgBcvHiRevXqUapUKUaMGIGpqSl//vknbdu2Zd26dXz44YcZ6uvu7s53330HQEREBIMGDdI6vnr1avr27Uvjxo0ZMGAApqamXL58mUmTJuX2dmpZvnw558+fz5C+e/duWrZsiYuLC2PHjuXJkyfMnj2bevXqcerUKZycnPK0Hi9as2YNCQkJ9OnTh+LFi3Ps2DFmz57N3bt3WbNmjSbf4cOH+fTTT6lSpQo//vgj5ubmmd7Pl9HT02PFihVa5yxZsgQjIyMSExNzXK8vv/wSHx8fzTlffPEFH374IR999JEmrUSJEpr/X79+nfbt29O7d2+6dOnCkiVLaNeuHUFBQTRv3jzLemf1s8uJuLg4Fi5cSIcOHejZsyePHj1i0aJF+Pr6cuzYMby8vHJVvhBvLbUQIt8tWbJEDai3bt2qLlu2rBpQt2jRItO8ycnJ6pIlS6orVqyofvLkiSZ969atakA9evRoTVqXLl3UgHrAgAGaNJVKpW7VqpXawMBA/fDhQ7VarVbfvn1bDaiNjY3Vd+/e1eQ9evSoGlAPGjRIk9asWTN1pUqV1ImJiVpl1q1bV12uXLkM9a1Xr566SZMmmv2nr7VkyRJNWocOHdQWFhZa1/PPP/+oAfWaNWs0aY0aNVJXqFAhw2tMmTJFDahv376tSXt6T5+mJSYmqh0cHNQtW7bM8PpeXl7qkiVLqiMjIzVpZ8+eVSuVSnXnzp01aWPGjFEDmvv21PHjxzOU2aVLF7
WpqWmGuq5Zs0YNqP/55x9NWkJCQoZ8AQEBaoVCof7vv/80aSNHjlQD6tDQUE3a0/s5ZcqUDGU87+n96NChg7pSpUqa9Pj4eLWZmZm6Y8eOakB9/PjxHNfreYB6zJgxmR5zdHRUA+p169Zp0mJjY9V2dnbqqlWrZqhrdn52ObnPqamp6qSkJK180dHRahsbG3W3bt0yrbMQ7wLpxhXiDfL39+fOnTt07NiRnTt3arXqPHXixAkePHhA3759tcbxtWrVCnd3d7Zt25bhnP79+2v+r1Ao6N+/P8nJyezevVsrX9u2bSlVqpRmv2bNmtSqVYvt27cDEBUVxd9//82nn37Ko0ePiIiIICIigsjISHx9fbl+/XqGruTk5GQMDQ1fet2PHj3CxMQkX8clzpkzh8jISMaMGaOVHhoaypkzZ/D398fKykqTXrlyZZo3b6659udFRUVprj0iIoLY2NgsX/f5fBERETx69ChDHmNjY83/4+PjiYiIoG7duqjVak6fPq059ujRI5RKpVbLbU598cUXXLlyRdNdu27dOszNzWnWrNlr1ysn7O3ttVp/zczM6Ny5M6dPnyYsLCzTc7L62eWUnp4eBgYGQHq3cFRUFKmpqXh7e3Pq1KlclS3E20yCPSHeoKioKFasWMHSpUvx8vJi4MCBGQKJ//77D4Dy5ctnON/d3V1z/CmlUomLi4tWmpubG4DWmCiAcuXKZSjTzc1Nk+/GjRuo1Wq+//57SpQoobU9/UP84MEDrfNjYmIyHVP1vDp16nD//n3Gjh1LSEjIKwOonIqNjWXSpEkMHjxY0x391Mvup4eHBxEREcTHx2ully9fXuvan+/GfF58fHyG+9StW7cM+UJCQjTBZtGiRSlRogSNGjXS1P2pOnXqoFKpGDhwIDdv3iQiIoLo6Ogc3YsSJUrQqlUrFi9eDMDixYvp0qULSmXGx31265UTrq6uKBQKrbSs3o9PXyern93rWLp0KZUrV8bIyIjixYtTokQJtm3blqfvNyHeNjJmT4g3aMqUKbRr1w6A3377jdq1azNy5Ejmzp1bwDVLp1KpABgyZAi+vr6Z5nF1ddXaDwsLyzLvU4MGDeLq1atMmDCBcePG5U1lnzN58mSUSiVDhw4lMjIy1+WtW7cOMzMzzf61a9fo169fhnxGRkZs2bJFK+3AgQNaYyXT0tJo3rw5UVFRDB8+HHd3d0xNTbl37x7+/v6aew7w2WefcerUKWbPns1vv/322vXv1q0bnTt3ZsCAAezfv5+FCxdy4MABrTw5qVd+ysuf3YoVK/D396dt27YMHTqUkiVLoqenR0BAADdv3syjGgvx9pFgT4g3qGHDhpr/16hRg379+jFnzhw6d+5M7dq1AXB0dATg6tWrNG3aVOv8q1evao4/pVKpuHXrlqb1BNKDEyDDxIPr169nqNO1a9c0+Z62EOrr62fZmvW8u3fv8ujRIzw8PF6az9jYmAULFnD69GnMzc0ZM2YMZ8+eZciQIa98jVe5f/8+M2fOJCAggGLFimUIGJ6/ny+6cuUK1tbWmJqaaqU3bNgQa2trzX5W3ap6enoZ7lNMTIzW/vnz57l27RpLly6lc+fOmvRdu3ZlKE+pVDJ16lTOnz/P7du3mTt3LuHh4XTq1CnT189Ky5YtMTIy4rPPPqN+/fqULVs2Q7CXk3rlxNPW4edb97J6P77qZ5dTa9euxcXFhfXr12u9fm67h4V420k3rhAFaOLEidjZ2dGrVy9SU1MB8Pb2pmTJksyfP5+kpCRN3r/++ovLly/TqlWrDOX88ssvmv+r1Wp++eUX9PX1M4zT2rhxo9aYu2PHjnH06FFatmwJQMmSJWncuDG//voroaGhGV7nxSU9Vq1aBZAhKM3MyJEjCQkJYcWKFfj4+FC9evVXnpMd48aNw8bGht69e2d63M7ODi8vL5YuXaoViF24cIGdO3fy3nvv5Uk9svJ0oWz1c0vhqNVqZs6cmWn+2bNn8/fff7Ny5U
p8fHyoV69ejl+zSJEidO7cmXPnzmXarfw69cqu+/fvs2HDBs1+XFwcy5Ytw8vLC1tbW628r/rZ5VRm13T06FEOHz6
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from scipy import stats\n",
"\n",
"missing_val = df2.isnull().sum()\n",
"print(\"Количество пропущеных ячеек: \\n\", missing_val)\n",
"\n",
"df2['Population 2020'] = pd.to_numeric(df2['Population 2020'].astype(str).str.replace(',', ''), errors='coerce')\n",
"df2['Density (P/Km²)'] = pd.to_numeric(df2['Density (P/Km²)'].astype(str).str.replace(',', ''), errors='coerce')\n",
"df2['Fert. Rate'] = pd.to_numeric(df2['Fert. Rate'], errors='coerce')\n",
"df2['Med. Age'] = pd.to_numeric(df2['Med. Age'], errors='coerce')\n",
"df2['Urban Pop %'] = pd.to_numeric(df2['Urban Pop %'].astype(str).str.replace('%', ''), errors='coerce')\n",
"df2['World Share'] = pd.to_numeric(df2['World Share'].astype(str).str.replace('%', ''), errors='coerce')\n",
"\n",
"# Удаление пропусков для корректного анализа\n",
"data = df2.dropna()\n",
"\n",
"# 1. Визуализация распределения данных (помогает выявить выбросы)\n",
"plt.figure(figsize=(12, 6))\n",
"sns.boxplot(data=data[['Fert. Rate', 'Density (P/Km²)', 'Med. Age']])\n",
"plt.title('Поиск выбросов с помощью Boxplot')\n",
2024-10-10 15:58:16 +04:00
"plt.show()\n",
"\n",
2024-10-12 02:16:37 +04:00
"z_scrore = np.abs(stats.zscore(data[['Fert. Rate', 'Density (P/Km²)', 'Med. Age', 'Urban Pop %', 'World Share']]))\n",
"outliers = np.where(z_scrore > 3)\n",
"print(f\"Количество выбросов (по Z-оценке): {len(outliers[0])}\")\n",
2024-10-10 15:58:16 +04:00
"\n",
2024-10-12 02:16:37 +04:00
"# Построение корреляционной матрицы для поиска зашумленности\n",
"corr_matrix = data[['Population 2020', 'Density (P/Km²)', 'Fert. Rate', 'Med. Age', 'Urban Pop %', 'World Share']].corr()\n",
"sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')\n",
"plt.title('Корреляционная матрица')\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Из матрицы корреляции можно сделать выводы отностительно зависимости между атрибутами и выявит наиболее бесполезные их них. Признаки с низкими корреляциями, такие как Density (P/Km²), Population 2020, и World Share, могут содержать шум или не являться значимыми для текущей задачи демографического анализа. Это не обязательно означает, что эти переменные всегда шумны, но в контексте анализа они могут оказаться несущественными."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проведем анализа остаточных ошибок, чтобы убедиться в том, что выше упомянутые атрибуты действиельно бесполезны в данном контексе."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"По итогу вышло больше количество шумов"
2024-10-10 15:58:16 +04:00
]
},
{
"cell_type": "code",
2024-10-12 02:16:37 +04:00
"execution_count": 96,
2024-10-10 15:58:16 +04:00
"metadata": {},
2024-10-12 02:16:37 +04:00
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+0AAAIjCAYAAAB20vpjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABuZElEQVR4nO3deXhMZ//H8c9kj8giSCIEsYfalQatvai1tKpVPGhpS1V1UU+rWl0UfVqlSrcfXaiiqrpp7VoNtdcSat8TIpJIQrY5vz/STI0kZCLJTOT9uq65knOfc+7znTkz5DP3WUyGYRgCAAAAAAAOx8neBQAAAAAAgJwR2gEAAAAAcFCEdgAAAAAAHBShHQAAAAAAB0VoBwAAAADAQRHaAQAAAABwUIR2AAAAAAAcFKEdAAAAAAAHRWgHAAAAAMBBEdoBAAAAAHBQhHYAt5R58+bJZDJZHh4eHqpVq5ZGjRql6Ohoe5cHACVOcnKyZs2apbvvvlsVKlSQt7e3GjdurNmzZysjIyPb8mazWVOnTlVoaKg8PDzUoEEDffXVV9mWmTdvnnr27KmQkBB5eXnptttu0+uvv64rV67kWMenn36qsLAweXh4qGbNmpo5c2ahPF8AKGiEdgC3pEmTJumLL77Q+++/r5YtW2r27NkKDw9XcnKyvUsDgBLlyJEjevLJJ2UYhsaOHau3335boaGheuKJJzR06NBsy7/44osaN26cOnXqpJkzZ6py5cp66KGHtHDhQssyycnJGjJkiM6fP6/HHntM06dPV/PmzTVx4kR17dpVhmFY9fnhhx/qkUceUb169TRz5kyFh4dr9OjRmjJlSqE/fwC4WSbj2n/VAKAYmzdvnoYMGaItW7aoWbNmlvZnnnlG77zzjhYsWKAHH3zQjhUCQMkSExOj6Oho1atXz6p96NChmjt3rg4ePKgaNWpIkk6fPq3Q0FANHz5c77//viTJMAy1adNGR48e1bFjx+Ts7KzU1FRt3bpVLVu2tOpz0qRJmjhxolauXKmOHTtKki5fvqyQkBDdcccd+uGHHyzLPvzww1q2bJlOnjypMmXKFOZLAAA3hZF2ACVC+/btJUlHjx6VJMXGxurZZ59V/fr1Vbp0afn4+Khr167atWtXtnWvXLmiV155RbVq1ZKHh4cqVKigPn366PDhw5KkY8eOWR2Sf+2jbdu2lr7WrVsnk8mkr7/+Wv/9738VFBQkLy8v9ezZUydPnsy27c2bN6tLly7y9fVVqVKl1KZNG23cuDHH59i2bdsct//KK69kW/bLL79U06ZN5enpKX9/f/Xv3z/H7V/vuV3NbDZr+vTpqlevnjw8PBQYGKgRI0bo4sWLVstVrVpV3bt3z7adUaNGZeszp9qnTZuW7TWVpJSUFE2cOFE1atSQu7u7QkJC9PzzzyslJSXH1+pqbdu21W233Zat/e2335bJZNKxY8csbd999526deum4OBgubu7q3r16nrttddyPMR38+bNuueee1SmTBl5eXmpQYMGeu+996yW2b9/v+677z75+/vLw8NDzZo10/Lly3OtM6f9MG/evGzbvdF75pVXXpHJZFJMTIxV+9atW7P1+Z///EdVq1a1Wu7kyZPy9PTM9vqkp6fr9ddfV61ateTu7m5V59atW3N8Xldv53qfo3Xr1lktv3jxYst7uFy5cnr44Yd1+vTp624jS1xcnJ5++mlVrVpV7u7uqlSpkgYNGqSYmBjLZ/R6j6z3ZdbruH//fvXr108+Pj4qW7asnnrqqWyHaM+dO1ft27dXQECA3N3dVbduXc2ePTtbbbZ8Rmzp02QyacyYMdnmde7cWSaTKcdtXuvq18DZ2VkVK1bU8OHDFRcXd931ypUrly2wS9K9994rSYqMjLS0fffdd0pLS9MTTzxhtd3HH39cp06dUkREhCTJzc0tW2DPrc+1a9fqwoULVn1K0siRI5WUlKQff/zxuvVn7edrHy4uLlbL2fKezN
onuX2eDcNQ1apV1atXr2zrXrlyRb6+vhoxYsR16wZw63C58SIAUPxlBeyyZctKyjxcc9myZbr//vsVGhqq6Ohoffjhh2rTpo327dun4OBgSVJGRoa6d++u1atXq3///nrqqad06dIlrVy5Unv27FH16tUt23jwwQd1zz33WG13/PjxOdbzxhtvyGQyady4cTp37pymT5+ujh07aufOnfL09JQkrVmzRl27dlXTpk01ceJEOTk5Wf5I/+2339S8efNs/VaqVEmTJ0+WJCUmJurxxx/PcdsTJkxQv3799Mgjj+j8+fOaOXOm7rrrLu3YsUN+fn7Z1hk+fLjuvPNOSdLSpUv17bffWs0fMWKE5SiH0aNH6+jRo3r//fe1Y8cObdy4Ua6urjm+DraIi4uzPLermc1m9ezZU7///ruGDx+usLAw7d69W++++67+/vtvLVu27Ka3nWXevHkqXbq0xo4dq9KlS2vNmjV6+eWXlZCQoGnTplmWW7lypbp3764KFSroqaeeUlBQkCIjI/XDDz/oqaeekiTt3btXrVq1UsWKFfXCCy/Iy8tLixYtUu/evfXNN99YwsfV6tSpoxdffFFS5ujl008/bTU/P++Z/Hj55ZdzPG/4f//7nyZMmKB7771X48aNk7u7u3777Td99NFHeerX3d1dn3zyiVXbli1bNGPGDKu2rPfa7bffrsmTJys6OlrvvfeeNm7cmOt7OEtiYqLuvPNORUZGaujQoWrSpIliYmK0fPlynTp1SmFhYfriiy8sy3/00UeKjIzUu+++a2lr0KCBVZ/9+vVT1apVNXnyZG3atEkzZszQxYsX9fnnn1uWmT17turVq6eePXvKxcVF33//vZ544gmZzWaNHDkyT6/PtWzp08PDQ/Pnz9e0adMsn8dTp05p9erV8vDwyPM27733XvXp00fp6emKiIjQRx99pMuXL1u9ZnkVFRUlKTPUZ9mxY4e8vLwUFhZmtWzWe3fHjh1q3bq1zX1Ksjr6SpKaNm0qJycn7dixQw8//PAN6509e7ZKly5tmXZy+nfsKz/vyUaNGumZZ56RlPmF8ssvv2yZZzKZ9PDDD2vq1KmKjY2Vv7+/Zd7333+vhISEPNUM4BZhAMAtZO7cuYYkY9WqVcb58+eNkydPGgsXLjTKli1reHp6GqdOnTIMwzCuXLliZGRkWK179OhRw93d3Zg0aZKl7f/+7/8MScY777yTbVtms9myniRj2rRp2ZapV6+e0aZNG8v02rVrDUlGxYoVjYSEBEv7okWLDEnGe++9Z+m7Zs2aRufOnS3bMQzDSE5ONkJDQ41OnTpl21bLli2N2267zTJ9/vx5Q5IxceJES9uxY8cMZ2dn44033rBad/fu3YaLi0u29oMHDxqSjM8++8zSNnHiROPq/z5+++03Q5Ixf/58q3VXrFiRrb1KlSpGt27dstU+cuRI49r/kq6t/fnnnzcCAgKMpk2bWr2mX3zxheHk5GT89ttvVuvPmTPHkGRs3Lgx2/au1qZNG6NevXrZ2qdNm2ZIMo4ePWppS05OzrbciBEjjFKlShlXrlwxDMMw0tPTjdDQUKNKlSrGxYsXrZa9el926NDBqF+/vmW9rPktW7Y0atasmW07rVq1Mtq1a2eZznrfzZ0717JuXt8zWfvw/PnzVtvYsmWLVZ+GYRiDBw82qlSpYpnes2eP4eTkZHTt2jXb6xMeHm6EhYVZbT/rM7lly5Zsz+lqgwcPNry8vLK1L1682JBkrF271jAMw0hNTTUCAgKM2267zbh8+bJluR9++MGQZLz88svX3c7LL79sSDKWLl2abd7VdV9d19XP/2pZr2PPnj2t2p944glDkrFr1y5LW07vnc6dOxvVqlWzarPlM2JLn506dTLKlStnLFmyxNL+2muvGS1btsx1m9e69jNpGJn/7tStW/eG614rJSXFqFu3rhEaGmqkpaVZ2rt165atfsMwjKSkJEOS8cILL1y3344dOxo+Pj5Wn7
2RI0cazs7OOS5fvnx5o3///tftM7fPS5b8vCeDg4ON7t27W6Zz+uwdOHDAkGTMnj3bat2ePXsaVatWzfH9CuDWxOH
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Среднее значение населения: 39877695.56842105\n",
"Медиана населения: 8976711.0\n"
]
}
],
2024-10-10 15:58:16 +04:00
"source": [
2024-10-12 02:16:37 +04:00
"plt.figure(figsize=(12, 6))\n",
"sns.histplot(data=data['Population 2020'], kde=True, bins=30)\n",
"plt.title('Распределение населения по странам в 2020 году')\n",
"plt.xlabel('Население')\n",
"plt.ylabel('Количество стран')\n",
"plt.show()\n",
"\n",
"mean_population = data['Population 2020'].mean()\n",
"median_population = data['Population 2020'].median()\n",
"print(f\"Среднее значение населения: {mean_population}\")\n",
"print(f\"Медиана населения: {median_population}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определяем смещение. Если разница между средним и медианным значениями существенна, это может указывать на смещение. В данном случаем имеется смещение."
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Просачивания данных в столбце 'Net Change' не обнаружено.\n",
"Просачивания данных в столбце 'Migrants (net)' не обнаружено.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\2903715557.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data['Net Change'] = pd.to_numeric(data['Net Change'].astype(str).str.replace(',', ''), errors='coerce')\n",
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\2903715557.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data['Migrants (net)'] = pd.to_numeric(data['Migrants (net)'].astype(str).str.replace(',', ''), errors='coerce')\n",
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\2903715557.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data['Population 2020'] = pd.to_numeric(data['Population 2020'].astype(str).str.replace(',', ''), errors='coerce')\n"
]
}
],
"source": [
"# Приведение столбцов к числовым значениям\n",
"data['Net Change'] = pd.to_numeric(data['Net Change'].astype(str).str.replace(',', ''), errors='coerce')\n",
"data['Migrants (net)'] = pd.to_numeric(data['Migrants (net)'].astype(str).str.replace(',', ''), errors='coerce')\n",
"data['Population 2020'] = pd.to_numeric(data['Population 2020'].astype(str).str.replace(',', ''), errors='coerce')\n",
2024-10-10 15:58:16 +04:00
"\n",
2024-10-12 02:16:37 +04:00
"invalid_net_change = data[data['Net Change'] > data['Population 2020']]\n",
"invalid_migrants = data[data['Migrants (net)'] > data['Population 2020']]\n",
2024-10-10 15:58:16 +04:00
"\n",
2024-10-12 02:16:37 +04:00
"if not invalid_net_change.empty:\n",
" print(\"Просачивание данных: Прирост населения превышает текущее население в следующих строках:\")\n",
" print(invalid_net_change)\n",
"else:\n",
" print(\"Просачивания данных в столбце 'Net Change' не обнаружено.\")\n",
2024-10-10 15:58:16 +04:00
"\n",
2024-10-12 02:16:37 +04:00
"if not invalid_migrants.empty:\n",
" print(\"Просачивание данных: Мигранты превышают текущее население в следующих строках:\")\n",
" print(invalid_migrants)\n",
"else:\n",
" print(\"Просачивания данных в столбце 'Migrants (net)' не обнаружено.\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Тут я хочу выявить просачивание данных. Для этого хочу сравнить прирост населения и количество мигрантов с численности населения в 2020 году. Логично, если они окажутся больше общий численности населения это окажется странным. В данном случае все оказалось нормально и не какие данные не просачились."
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер данных после удаления выбросов: (183, 11)\n",
"Обрабатываем столбец: Fert. Rate, медианное значение: 2.3\n",
"Обрабатываем столбец: Density (P/Km²), медианное значение: 83.0\n",
"Обрабатываем столбец: Med. Age, медианное значение: 30.0\n",
"Обрабатываем столбец: Urban Pop %, медианное значение: 60.0\n",
"Обрабатываем столбец: World Share, медианное значение: 0.11499999999999999\n",
"Выбросы заменены медианными значениями.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\3365745342.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data[column] = np.where(col_z_score > 3, median_value, data[column])\n",
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\3365745342.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data[column] = np.where(col_z_score > 3, median_value, data[column])\n",
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\3365745342.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data[column] = np.where(col_z_score > 3, median_value, data[column])\n",
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\3365745342.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data[column] = np.where(col_z_score > 3, median_value, data[column])\n",
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\3365745342.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data[column] = np.where(col_z_score > 3, median_value, data[column])\n"
]
}
],
"source": [
"data_no_outliers = data[(z_scores < 3).all(axis=1)]\n",
"print(f\"Размер данных после удаления выбросов: {data_no_outliers.shape}\")\n",
"\n",
"columns_to_check = ['Fert. Rate', 'Density (P/Km²)', 'Med. Age', 'Urban Pop %', 'World Share']\n",
"\n",
"for column in columns_to_check:\n",
" col_z_score = np.abs(stats.zscore(data[column]))\n",
2024-10-10 15:58:16 +04:00
" \n",
2024-10-12 02:16:37 +04:00
" median_value = data[column].median()\n",
" \n",
" print(f\"Обрабатываем столбец: {column}, медианное значение: {median_value}\")\n",
" \n",
" data[column] = np.where(col_z_score > 3, median_value, data[column])\n",
2024-10-10 15:58:16 +04:00
"\n",
2024-10-12 02:16:37 +04:00
"print(\"Выбросы заменены медианными значениями.\")\n",
2024-10-10 15:58:16 +04:00
"\n",
"\n",
2024-10-12 02:16:37 +04:00
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Решили проблему с выбрасами "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Оценка качества набора данных:</h3>\n",
"\n",
"**Информативность:** Набор данных содержит информацию о населении стран мира на 2020 год, включая демографические показатели, такие как плотность населения, прирост населения, уровень урбанизации и другие. Колонки достаточно подробные и содержат ключевые метрики.\n",
"\n",
"**Степень покрытия:** В наборе данных представлены 235 стран и зависимых территорий, что охватывает практически весь мир. Это достаточно полный охват для анализа глобального населения.\n",
"\n",
"**Соответствие реальным данным:** Данные взяты из показателей 2020 года.\n",
"\n",
"**Согласованность меток:** Имена колонок не всегда очевидны (например, \"Density (P/Km²)\", \"World Share\"). Также данные представлены в разных форматах, что требует предобработки, так как числовые значения сохранены как строки."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Устранение проблемы пропущенных данных</h3>"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные после заполнения средним значением: 0 пропущенных значений осталось\n"
]
}
],
"source": [
"data_filled_mean = data.copy()\n",
"\n",
"data_filled_mean[\"Migrants (net)\"] = pd.to_numeric(data_filled_mean[\"Migrants (net)\"], errors='coerce')\n",
"\n",
"mean_value = data_filled_mean[\"Migrants (net)\"].mean()\n",
"data_filled_mean[\"Migrants (net)\"] = data_filled_mean[\"Migrants (net)\"].fillna(mean_value)\n",
"print(f\"Данные после заполнения средним значением: {data_filled_mean['Migrants (net)'].isnull().sum()} пропущенных значений осталось\")\n"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: 114 строк\n",
"Валидационная выборка: 38 строк\n",
"Тестовая выборка: 38 строк\n"
]
}
],
"source": [
"columns_to_drop = [\"Population 2020\", \"no\", \"Country (or dependency)\"]\n",
"columns_to_drop = [col for col in columns_to_drop if col in data_filled_mean.columns]\n",
"\n",
"X = data_filled_mean.drop(columns=columns_to_drop)\n",
"y = data_filled_mean[\"Population 2020\"]\n",
"\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
"\n",
"print(f\"Обучающая выборка: {X_train.shape[0]} строк\")\n",
"print(f\"Валидационная выборка: {X_val.shape[0]} строк\")\n",
"print(f\"Тестовая выборка: {X_test.shape[0]} строк\")\n"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение в обучающей выборке:\n",
" count 1.140000e+02\n",
"mean 4.809021e+07\n",
"std 1.880257e+08\n",
"min 9.792900e+04\n",
"25% 1.400362e+06\n",
"50% 9.272022e+06\n",
"75% 2.615868e+07\n",
"max 1.439324e+09\n",
"Name: Population 2020, dtype: float64\n",
"Распределение в валидационной выборке:\n",
" count 3.800000e+01\n",
"mean 2.704393e+07\n",
"std 4.201158e+07\n",
"min 1.640930e+05\n",
"25% 4.895855e+06\n",
"50% 1.052552e+07\n",
"75% 3.117359e+07\n",
"max 2.125594e+08\n",
"Name: Population 2020, dtype: float64\n",
"Распределение в тестовой выборке:\n",
" count 3.800000e+01\n",
"mean 2.807392e+07\n",
"std 5.746454e+07\n",
"min 1.067660e+05\n",
"25% 9.193338e+05\n",
"50% 5.568913e+06\n",
"75% 3.241794e+07\n",
"max 3.310027e+08\n",
"Name: Population 2020, dtype: float64\n"
]
}
],
"source": [
"# Проверка распределения целевой переменной в обучающей, валидационной и тестовой выборках\n",
"print(\"Распределение в обучающей выборке:\\n\", y_train.describe())\n",
"print(\"Распределение в валидационной выборке:\\n\", y_val.describe())\n",
"print(\"Распределение в тестовой выборке:\\n\", y_test.describe())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Дисбаланс данных:\n",
"\n",
"В обучающей выборке среднее значение численности населения составляет около 48 миллионов, однако максимальное значение достигает 1.44 миллиарда, что указывает на значительный разброс данных.\n",
"Стандартное отклонение в обучающей выборке очень велико (188 миллионов), что указывает на присутствие значительного числа стран с огромным населением (например, Китай и Индия), наряду с малонаселёнными странами.\n",
"В валидационной и тестовой выборках также наблюдается высокий разброс, хотя их максимальные значения намного ниже (212 и 331 миллионов соответственно)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Методы приращения данных могут быть полезный, поэтому я ими воспользуюсь"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки после Oversampling: 114 строк\n",
"Размер обучающей выборки после Undersampling: 114 строк\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"# Oversampling (увеличение выборки) для обучающих данных\n",
"ros = RandomOverSampler(random_state=42)\n",
"X_train_res, y_train_res = ros.fit_resample(X_train, y_train)\n",
"print(f\"Размер обучающей выборки после Oversampling: {X_train_res.shape[0]} строк\")\n",
"\n",
"# Undersampling (уменьшение выборки) для обучающих данных\n",
"rus = RandomUnderSampler(random_state=42)\n",
"X_train_res_under, y_train_res_under = rus.fit_resample(X_train, y_train)\n",
"print(f\"Размер обучающей выборки после Undersampling: {X_train_res_under.shape[0]} строк\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>3 Датасет - Данные о миллионерах</h1>"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object') \n",
"\n"
]
}
],
"source": [
"df3 = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\", index_col=\"Rank \")\n",
"\n",
"print(df3.columns, \"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Основные столбцы:</br>\n",
"**Rank:** Ранг миллиардера</br>\n",
"**Name:** Имя миллиардера</br>\n",
"**Networth:** Состояние миллиардера (в миллиардах долларов)</br>\n",
"**Age:** Возраст миллиардера</br>\n",
"**Country:** Страна проживания миллиардера</br>\n",
"**Source:** Основной источник дохода</br>\n",
"**Industry:** Отрасль, к которой относится основная деятельность миллиардера</br>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Проблемная область<h2>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Данный набор данных относится к анализу благосостояния, его распределению по возрастам, странам и отраслям. Проблемная область связана с миллионерами и с изучением неравенства богатства, выявлением тенденций в источниках доходов и влиянии различных факторов на накопление капитала."
2024-10-10 15:58:16 +04:00
]
2024-10-12 12:05:52 +04:00
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3> Анализ содержимого<h3>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- **Объекты наблюдения:** Миллиардеры\n",
"- **Атрибуты объектов:** Ранг, имя, состояние, возраст, страна, источник дохода, отрасль\n",
"- **Связи между объектами:** Можно выявить связи между возрастом и состоянием, страной проживания и источником дохода, а также отраслью и уровнем благосостояния."
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+EAAAJOCAYAAADYhAZpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd1hT1xvHv0lIwt7IqAsUBwjugShqndTZWvdAcbRuq7XWiVtbra3bn3W2zjrqrHvhwI3inrgFRfYMJOf3B+aaSxJIEEjA9/M8PJpzzz33e88d733PeI+AMcZAEARBEARBEARBEEShIzS0AIIgCIIgCIIgCIL4XCAnnCAIgiAIgiAIgiCKCHLCCYIgCIIgCIIgCKKIICecIAiCIAiCIAiCIIoIcsIJgiAIgiAIgiAIooggJ5wgCIIgCIIgCIIgighywgmCIAiCIAiCIAiiiCAnnCAIgiAIgiAIgiCKCHLCCYIgCIIgCIIgCKKIICecIAiCIAiCIAzAnDlzoFAoAAAKhQJz5841sCKCIIoCcsKLOY8fP8Z3330HDw8PmJqawtraGv7+/li0aBHS0tIMLY8gCIIgChSye0RJYsOGDViwYAFevnyJ3377DRs2bDC0JIIgigABY4wZWgSRPw4cOIAuXbpAKpWib9++qFatGmQyGc6ePYudO3eiX79+WLVqlaFlEgRBEESBQHaPKGls27YNffv2hUwmg1QqxcaNG/Htt98aWhZBEIUMOeHFlMjISPj6+qJ06dI4ceIEXF1dedsfPXqEAwcOYNSoUQZSSBAEQRAFB9k9oqTy9u1bPHr0CJ6ennBycjK0HIIgigAajl5M+fXXX5GcnIw1a9aofYgAQMWKFXkfIgKBgPsTiUT44osvMHjwYMTHx/P2e/LkCbp06QI3NzcIhUJun2rVqnF5zp49i0aNGsHR0RGmpqbw8PDA+PHjkZ6ezuVZv349BAIBrly5ove5KfeVSCR49+4db1tYWBinKWfZFy9eRJs2bWBjYwNzc3M0adIE586d4+WZMGECTE1NeemnTp2CQCDAqVOnuLRz587B1NQUEyZMUNPXtGlTXn0q/6ZNm8bladKkCapXr67x/CpXrozWrVvnWgfly5fnyhUKhXBxcUG3bt3w/PlzXr6UlBSMHTsWZcqUgVQqReXKlbFgwQKotq29f/8egYGBKF26NKRSKVxdXdGrVy88e/aMy/P06VMIBAIsWLAAv//+O8qVKwczMzM0adIEt27d4h0zIiIC/fr144aCuri4IDg4GO/fv1c7j1evXmHAgAFwc3ODVCqFu7s7hgwZAplMxl3n3P7Wr1/PlXXixAk0btwYFhYWsLW1RceOHXH37l3e8aZNmwaBQIBSpUohMzOTt23Lli1cuTExMbnWf79+/VC+fHle2osXL2BmZgaBQICnT5/muj8A3Lt3D127doWTkxPMzMxQuXJlTJo0iZcnPDwcgYGBsLa2hqWlJZo3b44LFy6olRUfH48ffvgB5cuXh1QqRenSpdG3b1/ExMRw929uf6r3pq7HDA8PR5s2beDk5MQrq127dlwebc95TEyM2nGV10aV5ORkuLi4qD1/gG7Pc0Fdb6J4QHavZNs9ADh48CCaNGkCKysrWFtbo27duti8eXOuGlT/lAgEAgwfPhybNm1C5cqVYWpqitq1ayM0NJR3vGfPnmHo0KGoXLkyzMzM4ODggC5duqi94/V51wG6vWeVZT59+hSlSpVCw4YN4eDgAF9fXzX7p4m8bKim96/SLllbW8PBwQGjRo3i3cMAkJWVhZkzZ6JChQqQSqUoX748Jk6ciIyMDC5PfHw8qlatinr16vGmgGiyncOHD4elpSWuXbvGpZUvXx79+vXj5du+fTsEAgFvf+W3ifJPLBajfPnyGDduHGQyGZcvNjYWP/74I3x8fGBpaQlra2sEBgbixo0bvGMo7/sdO3ao1aelpSVPk+r1UaJQKLRenx07dqBOnT
qwsrLiaV6wYIHasVTR9ziayO0bQcnbt28xYMAAODs7w9TUFNWrV9c49UGhUGDRokXw8fGBqakpnJyc0KZNG+7ez+sZbNq0qd7HVOYrW7YsRCIRV5alpSWXR/U7NSfVqlXjHVfT+w0A2rZtq/F5ffXqFYKDg+Hs7AypVApvb2+sXbuWl0f1W+v69etq+yt1a7q3tGGic07CqNi3bx88PDzQsGFDnff5+uuv8c033yArKwthYWFYtWoV0tLS8PfffwMA5HI5OnTogGfPnmH06NGoVKkSBAIBZs+ezSsnKSkJVatWRdeuXWFubo6wsDD8+uuvSE1NxZIlSwrsHEUiETZu3IgffviBS1u3bh1MTU3VjMaJEycQGBiI2rVrIyQkBEKhEOvWrcOXX36JM2fOoF69egCyA6A8fPgQX3/9NS5evAh3d3e140ZGRqJTp05o164d5syZo1Fb6dKlueApycnJGDJkCG97nz59MGjQINy6dYv3IXf58mU8ePAAkydPzvP8GzdujMGDB0OhUODWrVv4448/8Pr1a5w5cwYAwBhDhw4dcPLkSQwYMAA1atTA4cOHMW7cOLx69Qq///47AEAmk8HKygqjRo2Cg4MDHj9+jCVLliAiIgI3b97kHfOvv/5CUlIShg0bhvT0dCxatAhffvklbt68CWdnZwDA0aNH8eTJE/Tv3x8uLi64ffs2Vq1ahdu3b+PChQvch9Dr169Rr149xMfHY/DgwahSpQpevXqFHTt2IDU1FQEBAdy9B4C7z1QdVeX9fezYMQQGBsLDwwPTpk1DWloalixZAn9/f1y7dk3N6CclJWH//v34+uuvuTRt946uTJ06Ved9IyIi0LhxY4jFYgwePBjly5fH48ePsW/fPu48b9++jcaNG8Pa2ho//fQTxGIx/ve//6Fp06Y4ffo06tevDyD7/mrcuDHu3r2L4OBg1KpVCzExMdi7dy9evnyJqlWr8upx1apVuHv3Lnf9AcDX11evYyYkJCAwMBCMMYwZMwZlypQBAN6zWBD89ttviI6OVkvX9XlWUhjXmzA+yO6VbLu3fv16BAcHw9vbGxMmTICtrS3Cw8Nx6NAh9OzZE5MmTcLAgQMBZDu/P/zwAwYPHozGjRtrLO/06dPYtm0bRo4cCalUiuXLl6NNmza4dOkSp+/y5cs4f/48unfvjtKlS+Pp06dYsWIFmjZtijt37sDc3DxXzZrQ9T2rib///lvNLufFjBkzeNdU07VR0rVrV5QvXx5z587FhQsXsHjxYsTFxeGvv/7i8gwcOBAbNmzAt99+i7Fjx+LixYuYO3cu7t69i3///RcAYGtri/3796NBgwYICgrCtm3b1BpZAWDJkiVYsWIFdu3ahVq1amk9h6ysLLVGalWU1zkjIwOHDx/GggULYGpqipkzZwLIbkjbvXs3unTpAnd3d0RHR+N///sfmjRpgjt37sDNzS33StQRbdcnLCwMXbt2RfXq1TFv3jzY2Nhw92hBHkcTeX0jODo6Ii0tDU2bNsWjR48wfPhwuLu7Y/v27ejXrx/i4+N5jZcDBgzA+vXrERgYiIEDByIrKwtnzpzBhQsXUKdOHd73xpkzZ7Bq1Sr8/vvvcHR0BADuW1GfYwYFBeHYsWMYMWIEqlevDpFIhFWrVvEabj6V0NBQ/Pfff2rp0dHRaNCgAddw5+TkhIMHD2LAgAFITEzE6NGjeflNTU2xbt06LFq0iEvbsGEDJBKJ/t8bjCh2JCQkMACsY8eOOu8DgIWEhPDSGjZsyLy8vLjf9+/fZwDY3LlzefmaNGnCvL29cy3/q6++YtWqVeN+r1u3jgFgly9f1lljzn179OjBfHx8uPSUlBRmbW3NevbsyStboVAwT09P1rp1a6ZQKLj8qampzN3dnbVs2ZJXfkpKCqtTpw7z9vZmCQkJ7OTJkwwAO3nyJIuPj2deXl6sbt26LDU1VaO+hg0b8s713bt3avUbHx/PTE1N2fjx43n7jhw5kllYWL
Dk5ORc66BcuXIsKCiIl9azZ09mbm7O/d69ezcDwGbNmsXL9+233zKBQMAePXqktfxff/2VAWAxMTGMMcYiIyMZAGZ
"text/plain": [
"<Figure size 1000x600 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"\n",
"# Связь между возрастом и состоянием\n",
"plt.subplot(2, 2, 1)\n",
"sns.scatterplot(data=df3, x='Age', y='Networth')\n",
"plt.title('Связь между возрастом и состоянием')\n",
"plt.xlabel('Возраст')\n",
"plt.ylabel('Состояние (млрд)')\n",
"\n",
"# Связь между страной проживания и состоянием (топ-10 стран)\n",
"plt.subplot(2, 2, 2)\n",
"top_countries = df3['Country'].value_counts().index[:10]\n",
"sns.boxplot(data=df3[df3['Country'].isin(top_countries)], x='Country', y='Networth')\n",
"plt.title('Связь между страной проживания и состоянием')\n",
"plt.xticks(rotation=90)\n",
"plt.xlabel('Страна')\n",
"plt.ylabel('Состояние (млрд)')\n",
"\n",
"# Связь между источником дохода и состоянием (топ-10 источников дохода)\n",
"plt.subplot(2, 2, 3)\n",
"top_sources = df3['Source'].value_counts().index[:10]\n",
"sns.boxplot(data=df3[df3['Source'].isin(top_sources)], x='Source', y='Networth')\n",
"plt.title('Связь между источником дохода и состоянием')\n",
"plt.xticks(rotation=90)\n",
"plt.xlabel('Источник дохода')\n",
"plt.ylabel('Состояние (млрд)')\n",
"\n",
"# Связь между отраслью и состоянием (топ-10 отраслей)\n",
"plt.subplot(2, 2, 4)\n",
"top_industries = df3['Industry'].value_counts().index[:10]\n",
"sns.boxplot(data=df3[df3['Industry'].isin(top_industries)], x='Industry', y='Networth')\n",
"plt.title('Связь между отраслью и состоянием')\n",
"plt.xticks(rotation=90)\n",
"plt.xlabel('Отрасль')\n",
"plt.ylabel('Состояние (млрд)')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Тут для наглядности вывел графики зависимостей"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Примеры бизнес-целей<h3>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" 1. Бизнес может использовать эти данные для анализа того, какие отрасли являются наиболее прибыльными и перспективными.\n",
" 2. Компании могут использовать эти данные для анализа возрастных моделей накопления капитала, что помогает лучше планировать инвестиции и маркетинг.\n",
" 3. Бизнес может понять, какие страны являются наиболее подходящими для инвестиций и предпринимательства."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Цели технического проекта<h3>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Вход: Данные по состоянию и источникам доходов.<br>\n",
"Целевой признак: Отрасль, состояние.\n",
"\n",
"2. Вход: Данные по возрасту и состоянию.<br>\n",
"Целевой признак: Возраст, состояние.\n",
"\n",
"3. Вход: Данные по странам и состоянию.<br>\n",
"Целевой признак: Страна, состояние."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Выявляем проблемы(и решаем их)<h3>"
]
},
{
"cell_type": "code",
"execution_count": 225,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKMAAAHWCAYAAACrLUrEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABKhklEQVR4nO3dd5RV5b0//s/MwAwdRLpIFUWUpihBr4KRKIgJMf5sQSMxRqIY201uTDBgzDVYYhI7eMUSMXZjIRYwBlADiCixYwErAiK9Dszs3x+uOV8OQxkQ9rTXa61Zi9n72c9+yjnDc95nn31ykiRJAgAAAABSkFveDQAAAACg+hBGAQAAAJAaYRQAAAAAqRFGAQAAAJAaYRQAAAAAqRFGAQAAAJAaYRQAAAAAqRFGAQAAAJAaYRR8A8XFxbF48eKYO3dueTcFAIAUWQcC7DxhFOygBQsWxEUXXRRt27aN/Pz8aNq0aXTp0iVWrFhR3k0DAGA3sg4E2DVqlHcDYFe566674sc//nHWtqZNm8YBBxwQ//M//xMDBw78xuf44IMP4qijjooNGzbEBRdcEAcddFDUqFEjateuHXXr1v3G9QMAsOOsAwEqF2EUVc4VV1wR7du3jyRJYuHChXHXXXfFcccdF08++WQcf/zx36juYcOGRX5+fkyfPj322muvXdRiAAB2BetAgMpBGEWVM3DgwOjVq1fm95/85CfRvHnzuO+++77RImTWrFnx/PPPx8SJEy1AAAAqIOtAgMrBPaOo8ho1ahS1a9eOGjX+X/b60UcfRU5OTvzxj3/c6nGXX3555OTkZH6fPn161KpVKz788MM44IADoqCgIFq0aBHDhg2LJUuWZB3br1+/OPDAA2PWrFlx2GGHRe3ataN9+/YxZsyYUudZtGhRZqFUq1at6N69e9x9992lyhUXF8f1118fXbt2jVq1akXTpk1jwIAB8corr2TK5OTkxPnnn7/VPt11112Rk5MTH3300VbLREQMHTo0cnJytvozefLkrPIPPfRQHHzwwVG7du1o0qRJnH766fH5559v8xwlli1bFhdffHG0a9cuCgoKonXr1vGjH/0oFi9evMvHaFt9ysnJiX79+u3wOUvKtWnTJvLy8jJ11atXL1NmW4+3Aw88MOu8kydP3uIYDxo0KHJycuLyyy/PbNvSfD777LNx2GGHRZ06daJhw4Zx/PHHx5tvvrm14QeAKs068P+paOvAkvaU/NSpUye6du0at99+e6myzz//fBxxxBFRt27daNSoUQwePDjeeeedrDK33nprdO/ePRo2bBh169aN7t27x7hx40r1rV69ejF37tw49thjo27dutGqVau44oorIkmSrLJ//OMf47DDDos999wzateuHQcffHA8/PDDW+zL+PHj49BDD406derEHnvsEUceeWRMnDgxIiLatWu3zfFs165dJEkS7dq1i8GDB5eqe926ddGwYcMYNmzYdscUKhNXRlHlLF++PBYvXhxJksSiRYvixhtvjFWrVsXpp5/+jer96quvYt26dXHuuefGt7/97fjZz34WH374Ydx8880xY8aMmDFjRhQUFGTKL126NI477rg4+eST47TTTosHH3wwzj333MjPz4+zzjorIiLWrl0b/fr1iw8++CDOP//8aN++fTz00EMxdOjQWLZsWVx44YWZ+n7yk5/EXXfdFQMHDoyzzz47Nm7cGC+88EJMnz496x3AXaWgoKDUYmDmzJlxww03ZG0ruUfDIYccEqNHj46FCxfG9ddfHy+99FK89tpr0ahRo62eY9WqVXHEEUfEO++8E2eddVYcdNBBsXjx4njiiSfis88+iyZNmuzSMbrnnnsyZV944YW47bbb4s9//nM0adIkIiKaN28eETs2L2eeeWY899xz8fOf/zy6d+8eeXl5cdttt8Wrr76602O/ualTp8ZTTz213XIvvPBCHHfccdG2bdsYNWpUbNiwIW655ZY4/P
DDY+bMmbHvvvvusjYBQEVkHbhrpLEOLFGyFluxYkXccccd8dOf/jTatWsX/fv3j4iI5557LgYOHBgdOnSIyy+/PNauXRs33nhjHH744fHqq69Gu3btIiJi5cqVccwxx0THjh0jSZJ48MEH4+yzz45GjRrFiSeemDlfUVFRDBgwIL71rW/FNddcE88880yMGjUqNm7cGFdccUWm3PXXXx/f+973YsiQIVFYWBj3339/nHTSSTFhwoQYNGhQptzvfve7uPzyy+Owww6LK664IvLz82PGjBnx/PPPxzHHHBN/+ctfYtWqVRER8c4778Qf/vCH+M1vfhP7779/RETUq1cvcnJy4vTTT49rrrkmlixZEo0bN87U/+STT8aKFSu+8WMYKpwEqog777wziYhSPwUFBcldd92VVXbevHlJRCTXXnvtVusbNWpUsulTpOT3o48+Otm4cWOp8954442ZbX379k0iIrnuuusy29avX5/06NEjadasWVJYWJgkSZL85S9/SSIiGT9+fKZcYWFh0qdPn6RevXrJihUrkiRJkueffz6JiOSCCy4o1c7i4uLMvyMiGT58+HbHaN68eVstkyRJcuaZZyZ169Yttf2hhx5KIiL517/+lWlrs2bNkgMPPDBZu3ZtptyECROSiEhGjhy5zfOMHDkyiYjk0Ucf3Wq/dvUYldjWWJT1nGvXrk1yc3OTYcOGZR2/+fht6/F2wAEHJH379s38/q9//StrjJMkSXr37p0MHDgwiYhk1KhRW+3DwQcfnDRs2DBZsGBBpsx7772X1KxZMznxxBNLnRsAqgrrwMq3DtxSe957770kIpJrrrkms61k3L766qvMtv/85z9Jbm5u8qMf/Wir9W/cuDFp0KBBcv7552f1LSKSn//855ltxcXFyaBBg5L8/Pzkyy+/zGxfs2ZNVn2FhYXJgQcemHz729/ObHv//feT3Nzc5IQTTkiKioqyym9p/bmldV6JOXPmJBGR3HrrrVnbv/e97yXt2rXbYn1QmfmYHlXOzTffHJMmTYpJkybF+PHj46ijjoqzzz47Hn300VJl16xZE4sXL46lS5eWujR3ay655JLIy8vL/H7GGWdE8+bN4x//+EdWuRo1amRdTpufnx/Dhg2LRYsWxaxZsyIi4qmnnooWLVrEaaedlilXs2bNuOCCC2LVqlUxZcqUiIh45JFHIicnJ0aNGlWqPZteQh7x9aW8ixcvjq+++iqKi4vL1Ked9corr8SiRYvivPPOi1q1amW2Dxo0KDp37lxqTDb3yCOPRPfu3eOEE04ota+kX7tjjLanrOdcvXp1FBcXx5577lmmekseb5v+FBUVbfOYRx99NGbOnBlXXXXVVsssXbo03nvvvZg1a1YMGTIkc4VXRESnTp3ie9/7XjzzzDPbPRcAVHbWgZVnHVhi6dKlsXjx4pg7d278+c9/jry8vOjbt29ERHzxxRcxe/bsGDp0aNbVQt26dYvvfOc7pa4cLyoqisWLF8fHH38cf/7zn2PFihVxxBFHlDrnph9nLPl4Y2FhYTz33HOZ7bVr185q4/Lly+OII47Iuvr9sccei+Li4hg5cmTk5ma/tN7R9ee+++4bvXv3jnvvvTezbcmSJfH000/HkCFDdrg+qOiEUVQ5hx56aPTv3z/69+8fQ4YMiX/84x/RpUuXzH8ymxo1alQ0bdo0GjduHHXq1IlBgwbF+++/v8V6S/4D6Ny5c9b2vLy86NSpU6nP37dq1arU1/yWfEyqpOzHH38cnTp1KvWfV8llux9//HFERHz44YfRqlWrrP+Et2bcuHHRtGnTaNKkSdSuXTuOPPLIrPsJ7Eol7dtvv/1K7evcuXNm/9Z8+OGHceCBB273HLt6jLanrOfcc889o1OnTnH77bfHxIkTY9GiRbF48eJYv379Fustebxt+vPuu+9utR1FRUXxm9/8JoYMGRLdunXbarmDDjooMwdbmov9998/Vq9enXUfLgCoiqwDK886sM
RBBx0UTZs2jY4dO8Ydd9wRN910Uxx66KHbPcf+++8fixcvjtWrV2e2vf/++9G0adNo165djBgxIm655ZY4+eSTs47
"text/plain": [
"<Figure size 1500x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер данных до удаления выбросов: (2600, 6)\n",
"Размер данных после удаления выбросов: (2366, 6)\n"
]
}
],
"source": [
"fig, axs = plt.subplots(1, 2, figsize=(15, 5))\n",
"\n",
"sns.boxplot(data=df3, x='Networth', ax=axs[0])\n",
"axs[0].set_title(\"Выбросы по состоянию\")\n",
"\n",
"sns.boxplot(data=df3, x='Age', ax=axs[1])\n",
"axs[1].set_title(\"Выбросы по возрасту\")\n",
"\n",
"plt.show()\n",
"print(\"Размер данных до удаления выбросов: \", df3.shape)\n",
"\n",
"# Функция для удаления выбросов с помощью IQR\n",
"def remove_outliers(df, column):\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]\n",
"\n",
"df3_cleaned = remove_outliers(df3, 'Networth')\n",
"df3_cleaned = remove_outliers(df3_cleaned, 'Age')\n",
"\n",
"print(\"Размер данных после удаления выбросов: \", df3_cleaned.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 241,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\2893838167.py:3: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.barplot(x=country_dist.values, y=country_dist.index, palette='coolwarm')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA5YAAAIQCAYAAAD6lfEPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABhNElEQVR4nO3deVhU5f//8dcoOCAIKC6gIS7gjnv6UXOH3EutbDE/oLmUmln6yczcSyu3NDWzRSwt00wtK/d9yR2X3E2SDDU3wA0Rzu+PfszXkUXgoAP6fFzXXBdzzn3u8z7nnkFe3mfOWAzDMAQAAAAAQBblcXQBAAAAAIDcjWAJAAAAADCFYAkAAAAAMIVgCQAAAAAwhWAJAAAAADCFYAkAAAAAMIVgCQAAAAAwhWAJAAAAADCFYAkAAAAAMIVgCQAAgByhdevW6tGjh6PLuO8uXLggNzc3/fLLL44uBcgygiWAHC08PFwWi8X2cHFxUbly5dS3b1+dPXvW0eUBALLJ5s2btWLFCg0aNEiSVKpUKbvf/2k9wsPD73utn3zyiZ555hmVLFlSFotFYWFhaba9fPmyevbsqSJFisjNzU1NmzbV7t277dp4e3ure/fuGjp06D2uHLh3nBxdAABkxKhRo1S6dGnduHFDmzZt0ieffKJffvlFBw4cUP78+R1dHgDApHHjxql58+YKCAiQJH300Ue6cuWKbf0vv/yib7/9VpMmTVLhwoVty+vXr3/fa/3ggw8UFxenOnXqKDo6Os12SUlJatOmjfbu3av//e9/Kly4sKZPn64mTZpo165dCgwMtLV9+eWXNWXKFK1Zs0bNmjW7H4cBZCuCJYBcoVWrVqpdu7YkqXv37vL29tbEiRO1ZMkSPf/88w6uDgBgxrlz5/Tzzz9rxowZtmXt27e3a3PmzBl9++23at++vUqVKnV/C7zD+vXrbbOV7u7uabb7/vvvtWXLFi1YsEBPP/20JKlTp04qV66chg8frm+++cbWtmLFiqpSpYrCw8MJlsiVuBQWQK6U/I/uyZMnJUkXL17UwIEDFRQUJHd3d3l4eKhVq1bau3dvim1v3LihESNGqFy5cnJxcZGvr686duyoEydOSJIiIyPTveyqSZMmtr7WrVsni8Wi7777Tm+//bZ8fHzk5uamJ554QlFRUSn2vW3bNrVs2VKenp7Knz+/GjdurM2bN6d6jE2aNEl1/yNGjEjRds6cOapVq5ZcXV1VqFAhPffcc6nuP71ju11SUpI++ugjVa5cWS4uLipWrJh69eqlS5cu2bUrVaqU2rZtm2I/ffv2TdFnarWPGzcuxTmVpPj4eA0fPlwBAQGyWq3y8/PTm2++qfj4+FTP1e2Sz9udf5RKUq9evWSxWFSlSpUU5+TOy+n69OmT5iVuaY3NnX1k5HxnZv8ZfZ1n9nUpKc06161bZ9cuo2NjsVjUt29fzZ07V+XLl5eLi4tq1aqlDRs2pNj3nj171KpVK3l4eMjd3V3NmzfXb7/9Ztfmzsvi8+fPr6CgIH3++eepHk96297tPZWRetKSlJSkyZMnKygoSC4uLipSpIhatmypnTt32s5LRn6/ZGYMN27caLssM3lMXn/9dV2/ft2uXVhYWKoh6Pvvv08x1pnp02KxqHr16in6HTt27F2DV7Kff/5Zt27dUnBw8F3bpuXq1asaMGCA/Pz8ZLVaVb58eY0fP16GYdi1S+/8ZzSw+vv7p/gdl5rvv/9exYoVU8eOHW3LihQpok6dOmnJkiUp3jchISH66aefUtQM5AbMWALIlZJDoLe3tyTpjz/+0OLFi/XMM8+odOnSOnv2rD799FM1btxYBw8eVPHixSVJiYmJatu2rVavXq3nnntOr732muLi4rRy5UodOHBAZcuWte3j+eefV+vWre32O3jw4FTree+992SxWDRo0CCdO3dOH330kYKDgxURESFXV1dJ0po1a9SqVSvVqlVLw4cPV548eTRr1i
w1a9ZMGzduVJ06dVL0+8gjj2js2LGSpCtXruiVV15Jdd9Dhw5Vp06d1L17d/3zzz/6+OOP1ahRI+3Zs0deXl4ptunZs6caNmwoSfrhhx+0aNEiu/W9evVSeHi4unbtqn79+unkyZOaOnWq9uzZo82bN8vZ2TnV85AZly9fth3b7ZKSkvTEE09o06ZN6tmzpypWrKj9+/dr0qRJOnr0qBYvXnzXvl1cXPTzzz/r3LlzKlq0qCTp+vXr+u677+Ti4nLX7Y8fP67PPvss3TYVKlTQkCFDJEnnz5/X66+/nmbbu53vjO4/o6/zZBl5Xd4uJCRE//3vfyVJO3bs0JQpU+zWZ3Zs1q9fr++++079+vWT1WrV9OnT1bJlS23fvt0W7n///Xc1bNhQHh4eevPNN+Xs7KxPP/1UTZo00fr161W3bl27PpMvg4yNjdWXX36pHj16qFSpUhkKJMmX1CdL7T2V2Xru9NJLLyk8PFytWrVS9+7ddevWLW3cuFG//fabateura+//trWduPGjZo5c6bdpZ3FihWz6y8jY7hgwQJdu3ZNr7zyiry9vbV9+3Z9/PHH+uuvv7RgwYK7npfUZKZPJycn/f7779qzZ49q1KhhWx4eHp6h95skbdmyRd7e3vL3989SvYZh6IknntDatWv10ksvqXr16lq+fLn+97//6fTp05o0aZJd+9tf68kmTJiQ4j/PzNqzZ49q1qypPHns53Lq1KmjmTNn6ujRowoKCrItr1WrliZNmqTff//d7j/AgFzBAIAcbNasWYYkY9WqVcY///xjREVFGfPmzTO8vb0NV1dX46+//jIMwzBu3LhhJCYm2m178uRJw2q1GqNGjbIt+/LLLw1JxsSJE1PsKykpybadJGPcuHEp2lSuXNlo3Lix7fnatWsNSUaJEiWM2NhY2/L58+cbkozJkyfb+g4MDDRatGhh249hGMa1a9eM0qVLGyEhISn2Vb9+faNKlSq25//8848hyRg+fLhtWWRkpJE3b17jvffes9t2//79hpOTU4rlx44dMyQZs2fPti0bPny4cfs/Bxs3bjQkGXPnzrXbdtmyZSmW+/v7G23atElRe58+fYw7/4m5s/Y333zTKFq0qFGrVi27c/r1118befLkMTZu3Gi3/YwZMwxJxubNm1Ps73aNGzc2KleubFStWtUYP368Xb+PPPKI0bBhQ6Ny5cq25cnjPWvWLNuyTp06GVWqVDH8/PyM0NDQFPto0KCB0bRp03T7MIyMne/M7D+jr/OMvi6T3bx505Bk9O3b17ZswYIFhiRj7dq1tmWZGRtJhiRj586dtmV//vmn4eLiYnTo0MG2rH379ka+fPmMEydO2Jb9/fffRoECBYxGjRrZliX/Ljh58qRt2dGjRw1JxocffmikJ3nbHTt22C1P7T2V0XpSs2bNGkOS0a9fvxTrbn/fp3dMyTIzhteuXUux/dixYw2LxWL8+eeftmWhoaGGm5tbirapjXVm+2zXrp3d62fjxo2Gq6ur0b59+1T3eafHHnvMqFWrVrptxo0bl+b5Wrx4sSHJePfdd+2WP/3004bFYjGOHz9uWybJ6NOnT4o+2rRpY/j7+9+11ju5ubml+nsieV23bt1SLP/5558NScayZcvslm/ZssWQZHz33XeZrgNwNC6FBZArBAcHq0iRIvLz89Nzzz0nd3d3LVq0SCVKlJAkWa1W2/8IJyYm6sKFC3J3d1f58uXt7r63cOFCFS5cWK+++mqKfWTksqa0/Pe//1WBAgVsz59++mn5+vrabh0fERGhY8eO6YUXXtCFCxd0/vx5nT9/XlevXlXz5s21YcMGJSUl2fV548aNu/5v/w8//KCkpCR16tTJ1uf58+fl4+OjwMBArV271q79zZs3Jf17vtKyYMECeXp6KiQkxK7PWrVqyd3dPUWfCQkJdu3Onz+vGzdupFv36dOn9fHHH2vo0KEpLpNbsGCBKlasqAoVKtj1mXz58537T0vXrl01a9
Ys2/NZs2YpNDQ0xczBnXbt2qUFCxZo7Nixaba9efNmuufw9nZS+uc7M/vP6Os82d1el8mSx+tur7fMjk29evVUq1Y
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\2893838167.py:9: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.barplot(x=industry_dist.values, y=industry_dist.index, palette='coolwarm')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+EAAAIQCAYAAADuAG/uAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACD5klEQVR4nOzdeXwN5////+cRsi+EEDSRkEWoXbXW0NKEova1JXYtVUssqbcllJRau6JLYi21FEXV8kFJtVQFLbXErqmlrUSkgmR+f/jlfB0JkuCk9HG/3eZ2c2auueY1cw7t81zXzDEZhmEIAAAAAAA8cvnyugAAAAAAAP4rCOEAAAAAAFgJIRwAAAAAACshhAMAAAAAYCWEcAAAAAAArIQQDgAAAACAlRDCAQAAAACwEkI4AAAAAABWQggHAAAAAMBKCOEAAADAYyg5OVlFixbVwoUL87oUq1u/fr2cnZ118eLFvC4FyDFCOAA8JmJiYmQymcyLvb29AgIC1L9/f50/fz6vywMAWNnMmTPl4uKiDh066OTJkxb/jbjXcvLkSavWmZycrDFjxig0NFTu7u4ymUyKiYm5a/tDhw4pNDRUzs7Ocnd316uvvpopbIeGhsrPz09RUVGPuHrg4cuf1wUAAHJm3Lhx8vX11bVr17Rjxw59/PHHWrdunX755Rc5OjrmdXkAACu4ceOGZs6cqUGDBsnGxkYeHh6aP3++RZupU6fq7Nmzmj59usV6Dw8Pa5aqS5cuady4cfL29lalSpW0devWu7Y9e/as6tWrJzc3N02cOFHJycmaMmWKDhw4oF27dsnW1tbctk+fPgoPD1dkZKRcXFyscCbAw0EIB4DHTOPGjVW9enVJUs+ePVW4cGFNmzZNq1atUseOHfO4OgCANaxZs0YXL15Uu3btJElOTk565ZVXLNosXrxYf//9d6b11la8eHElJCTI09NTP/30k5555pm7tp04caKuXr2qPXv2yNvbW5JUo0YNNWrUSDExMerdu7e5bevWrfXGG29o6dKl6t69+yM/D+BhYTo6ADzmnn/+eUnSiRMnJEl//fWXwsPDVaFCBTk7O8vV1VWNGzfWvn37Mu177do1jR07VgEBAbK3t1fx4sXVqlUrxcfHS9J9pzfWr1/f3NfWrVtlMpm0ZMkSvfXWW/L09JSTk5OaN2+uM2fOZDr2jz/+qNDQULm5ucnR0VHBwcGKjY3N8hzr16+f5fHHjh2bqe2CBQtUrVo1OTg4yN3dXR06dMjy+Pc6t9ulp6drxowZKl++vOzt7VWsWDH16dNHf//9t0U7Hx8fNW3aNNNx+vfvn6nPrGp/9913M11TSUpNTdWYMWPk5+cnOzs7eXl5adiwYUpNTc3yWt0u47q1aNEi07Y+ffrIZDLp6aefznRN7pwm2q9fP5lMJoWFhd31GHcud/aRneudk+Nn93Oe08+lpLvWeefoXXbfG5PJpP79+2vhwoUKDAyUvb29qlWrpu+++y7Tsffu3avGjRvL1dVVzs7OeuGFF/TDDz9YtLnz1hRHR0dVqFBBn376aZbnc6997/d3Kjv13M3Vq1c1ZMgQeXl5yc7OToGBgZoyZYoMw7C4Ntn5NyYn7+P27dvVtm1beXt7m9+XQYMG6Z9//slU42+//aZ27drJw8NDDg4OCgwM1MiRIzO18/Hxue9n4ubNm3r77bcVEBAgOzs7i3Y//fRTpr4GDhyY6TghISEymUxZ/ltyp5UrV8rHx0dlypS5b9u7OX78uNq2bSt3d3c5Ojrqueee09q1ay3aZFz7uy1Z/btwJzs7O3l6emarpuXLl6tp06bmAC5JDRs2VEBAgL788kuLtkWLFlXFihW1atWqbPUN/FswEg4Aj7mMwFy4cGFJt/6nauXKlWrbtq18fX11/vx5zZ49W8HBwTp48KBKlCghSUpLS1PTpk21efNmdejQQW+++aauXLmijRs36pdffrH4H7uOHTuqSZMmFseNiI
jIsp4JEybIZDJp+PDhunDhgmbMmKGGDRsqLi5ODg4OkqT/+7//U+PGjVWtWjWNGTNG+fLlU3R0tJ5//nlt375dNWrUyNTvU089Zb73Lzk5Wa+99lqWxx41apTatWunnj176uLFi3r//fdVr1497d27VwULFsy0T+/evVW3bl1J0ooVK/TVV19ZbO/Tp49iYmLUrVs3DRgwQCdOnNAHH3ygvXv3KjY2VgUKFMjyOuTE5cuXs7yvMT09Xc2bN9eOHTvUu3dvBQUF6cCBA5o+fbqOHDmilStX3rdve3t7rV27VhcuXFDRokUlSf/884+WLFkie3v7++5/7NgxffLJJ/dsU7ZsWXNwuXTpkgYNGnTXtve73tk9fnY/5xmy87m8XaNGjdSlSxdJ0u7du/Xee+9ZbM/pe7Nt2zYtWbJEAwYMkJ2dnT766COFhoZq165d5i9Cfv31V9WtW1eurq4aNmyYChQooNmzZ6t+/fratm2bnn32WYs+p0+friJFiigpKUmff/65evXqJR8fHzVs2PCe11T6f7e1ZMjq71RO67mdYRhq3ry5tmzZoh49eqhy5cr69ttvNXToUJ07d848Pfr26dPbt2/XnDlzzOclScWKFbPoNzvv49KlS5WSkqLXXntNhQsX1q5du/T+++/r7NmzWrp0qbmv/fv3q27duipQoIB69+4tHx8fxcfH6+uvv9aECRMynVPdunXNo7CHDh3SxIkTLbZPnTpVo0aNUsuWLTV8+HDZ2dmZz+lO9vb2Wrhwod59913zvyFnz57V5s2bs/X3UpK+//57Va1aNVtts3L+/HnVqlVLKSkpGjBggAoXLqy5c+eqefPmWrZsmVq2bGnRfsCAAZlGsHv27Jnr42fl3LlzunDhgnm21+1q1KihdevWZVpfrVq1bP1bCPyrGACAx0J0dLQhydi0aZNx8eJF48yZM8bixYuNwoULGw4ODsbZs2cNwzCMa9euGWlpaRb7njhxwrCzszPGjRtnXvf5558bkoxp06ZlOlZ6erp5P0nGu+++m6lN+fLljeDgYPPrLVu2GJKMkiVLGklJSeb1X375pSHJmDlzprlvf39/IyQkxHwcwzCMlJQUw9fX12jUqFGmY9WqVct4+umnza8vXrxoSDLGjBljXnfy5EnDxsbGmDBhgsW+Bw4cMPLnz59p/dGjRw1Jxty5c83rxowZY9z+n8bt27cbkoyFCxda7Lt+/fpM60uVKmW89NJLmWrv16+fced/bu+sfdiwYUbRokWNatWqWVzT+fPnG/ny5TO2b99usf+sWbMMSUZsbGym490uODjYKF++vFGxYkVjypQpFv0+9dRTRt26dY3y5cub12e839HR0eZ17dq1M55++mnDy8vL6Nq1a6Zj1K5d22jQoME9+zCM7F3vnBw/u5/z7H4uM1y/ft2QZPTv39+8bunSpYYkY8uWLeZ1OXlvJBmSjJ9++sm87tSpU4a9vb3RsmVL87oWLVoYtra2Rnx8vHnd77//bri4uBj16tUzr8v4t+DEiRPmdUeOHDEkGZMnTzbuJWPf3bt3W6zP6u9UduvJysqVKw1Jxttvv22xvk2bNobJZDKOHTt219puP68MOXkfU1JSMu0fFRVlmEwm49SpU+Z19erVM1xcXCzWGYZh8e9ShpIlSxrdunXLVM/tn4maNWsaQUFBFvtndb1LlSplNGrUyChSpIixbNky8/rx48cbtWrVuuu/Jbe7ceOGYTKZjCFDhtyz3UsvvWSUKlUqy20DBw40JFl8hq9cuWL4+voaPj4+5r9fGee6dOnSTH04OTll+e/CvezevTvLfyNu3zZv3rxM24YOHWpIMq5du2axfuLEiYYk4/z58zmqA8hLTEcHgMdMw4YN5eHhIS8vL3Xo0EHOzs766quvVLJkSUm3pv3ly3frn/e0tDT9+eefcnZ2VmBgoH7++WdzP8uXL1eRIkX0xhtvZDrGndOnc6JLly4WD8hp06aNihcvbh7BiIuL09
GjR9WpUyf9+eefunTpki5duqSrV6/qhRde0Hfffaf09HSLPq9du3bf0aEVK1YoPT1d7dq1M/d56dIleXp6yt/fX1u
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(Country\n",
" United States 27.653846\n",
" China 19.807692\n",
" India 6.192308\n",
" Germany 5.000000\n",
" Russia 3.115385\n",
" Hong Kong 2.576923\n",
" Canada 2.461538\n",
" Brazil 2.307692\n",
" Italy 2.000000\n",
" Taiwan 1.961538\n",
" Name: proportion, dtype: float64,\n",
" Industry\n",
" Finance & Investments 14.846154\n",
" Technology 12.653846\n",
" Manufacturing 12.384615\n",
" Fashion & Retail 9.461538\n",
" Healthcare 8.153846\n",
" Food & Beverage 7.730769\n",
" Real Estate 7.269231\n",
" diversified 6.846154\n",
" Media & Entertainment 3.653846\n",
" Energy 3.576923\n",
" Name: proportion, dtype: float64)"
]
},
"execution_count": 241,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"country_dist = df3['Country'].value_counts().head(10) # Топ-10 стран\n",
"sns.barplot(x=country_dist.values, y=country_dist.index, palette='coolwarm')\n",
"plt.title('Распределение миллиардеров по странам (Топ-10)')\n",
"plt.show()\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"industry_dist = df3['Industry'].value_counts().head(10) # Топ-10 отраслей\n",
"sns.barplot(x=industry_dist.values, y=industry_dist.index, palette='coolwarm')\n",
"plt.title('Распределение миллиардеров по отраслям (Топ-10)')\n",
"plt.show()\n",
"\n",
"# Процентное распределение по странам и отраслям(для наглядности)\n",
"country_percentage = df3['Country'].value_counts(normalize=True) * 100\n",
"industry_percentage = df3['Industry'].value_counts(normalize=True) * 100\n",
"\n",
"country_percentage.head(10), industry_percentage.head(10)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Хотел проверить смещение. В данном случае имеется смещение по регионам или отраслям. Но думаю это особенность данного датасета, а не самих данных."
]
},
{
"cell_type": "code",
"execution_count": 246,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2598, 75, 18, np.int64(0))"
]
},
"execution_count": 246,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 1. Проверим уникальность по ключевым столбцам\n",
"unique_names = df3['Name'].nunique()\n",
"unique_countries = df3['Country'].nunique()\n",
"unique_industries = df3['Industry'].nunique()\n",
"\n",
"# 2. Проверка дубликатов\n",
"duplicates_count = df3.duplicated().sum()\n",
"\n",
"unique_names, unique_countries, unique_industries, duplicates_count\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка оценки информативности. Тут все нормально"
]
},
{
"cell_type": "code",
"execution_count": 248,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\1549126591.py:7: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.countplot(data=df3, y='Industry', order=df3['Industry'].value_counts().index, palette='coolwarm')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/4AAAHWCAYAAADO/FK/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACwX0lEQVR4nOzdfVzO9////9tRdH7iLApRKHKeOZlzhpXzc0ZGzg0zLCfNWDHCsDNzso1yOgzDG8vwwWg2J1vYWGjCNuebElOp4/eHn+PrEKlE0+7Xy+V1ufR6vZ4nj9dLNo/jeXIYjEajERERERERERHJkyxyOwAREREREREReXqU+IuIiIiIiIjkYUr8RURERERERPIwJf4iIiIiIiIieZgSfxEREREREZE8TIm/iIiIiIiISB6mxF9EREREREQkD1PiLyIiIiIiIpKHKfEXERERERERycOU+IuIiIiIiIjkYUr8RUREnmMREREYDAbTYWNjg7e3N8OHD+fSpUu5HZ6IiIj8C+TL7QBERETkyU2ePBlPT09u377Nvn37mD9/Plu3buXnn3/Gzs4ut8MTERGRXKTEX0REJA9o2bIlNWvWBGDAgAEULlyYOXPmsHHjRnr06JHL0YmIiEhu0lR/ERGRPOill14C4MyZMwD89ddfBAUFUaVKFRwcHHBycqJly5YcOXIkXd3bt28TEhKCt7c3NjY2uLm50alTJ2JjYwGIi4szW17w4NGkSRNTW7t378ZgMLB69WreeustXF1dsbe3p127dpw/fz5d3z/88AP+/v44OztjZ2dH48aNiYqKeugzNmnS5KH9h4SEpCu7fPlyXnjhBWxtbSlUqBCvvPLKQ/vP6Nnul5aWxgcffEClSpWwsbGhWLFiDB48mL///tusnIeHB23atEnXz/Dhw9O1+bDY33vvvXTvFCApKYl33nmHcuXKYW1tjbu7O2PHjiUpKemh7+p+j3pv9464uDiz8vPmzaNSpUpYW1tTvHhxhg0bxvXr1x/bD8BPP/1Ey5YtcXJywsHBgWbNmvH999+b7j+4VOVhR0REBACBgYE4ODjw22+/4efnh729PcWLF2fy5MkYjUazfmfNmkW9evUoXLgwtra2vPDCC6xdu/ahMS5fvpzatWtjZ2dHwYIFadSoEd98841ZmXu/xw8eHh4eZuV+++03unbtSvHixbGwsDCVq1y58kPbio6ONqv/xx9/YGlpicFgeGS8IiLZoRF/ERGRPOhekl64cGHgbkKyYcMGunbtiqenJ5cuXWLhwoU0btyY48ePU7x4cQBSU1Np06YNO3fu5JVXXuGNN97gxo0bbN++nZ9//pmyZcua+ujRowetWrUy6zc4OPih8UydOhWDwcC4ceO4fPkyH3zwAc2bNyc6OhpbW1sA/u///o+WLVvywgsv8M4772BhYUF4eDgvvfQSe/fupXbt2unaLVmyJGFhYQAkJiby2muvPbTviRMn0q1bNwYMGMCVK1f4+OOPadSoET/99BMFChRIV2fQoEE0bNgQgPXr1/PVV1+Z3R88eDARERH07duXESNGcObMGebOnctPP/1EVFQU+fPnf+h7yIrr16+bnu1+aWlptGvXjn379jFo0CB8fHw4duwY77//PidPnmTDhg2Pbfv+93bP1q1b+eKLL8yuhYSEEBoaSvPmzXnttdeIiYlh/vz5HDx48LHP+csvv9CwYUOcnJwYO3Ys+fPnZ+HChTRp0oQ9e/ZQp04dGjVqxLJly0x1pk6dCsCECRNM1+rVq2f6OTU1FX9/f1588UVmzpxJZGQk77zzDnfu3GHy5Mmmch9++CHt2rUjICCA5ORkVq1aRdeuXdm8eTOtW7c2lQsNDSUkJIR69eoxefJkrKys+OGHH/i///s/Xn755XTP9NZbb+Hj4wPAp59+yrlz58xia9euHWfPnmXkyJF4e3tjMBhMz/QgGxsbwsPD+fDDD03XlixZgpWVFbdv337kexURyR
ajiIiIPLfCw8ONgHHHjh3GK1euGM+fP29ctWqVsXDhwkZbW1vj77//bjQajcbbt28bU1NTzeqeOXPGaG1tbZw8ebLp2uLFi42Acc6cOen6SktLM9UDjO+99166MpUqVTI2btzYdL5r1y4jYCxRooQxISHBdH3NmjVGwPjhhx+a2vby8jL6+fmZ+jEajcZbt24ZPT09jS1atEjXV7169YyVK1c2nV+5csUIGN955x3Ttbi4OKOlpaVx6tSpZnWPHTtmzJcvX7rrp06dMgLGJUuWmK698847xvv/ybR3714jYFyxYoVZ3cjIyHTXS5cubWzdunW62IcNG2Z88J9hD8Y+duxYY9GiRY0vvPCC2TtdtmyZ0cLCwrh3716z+gsWLDACxqioqHT93a9x48bGSpUqpbv+3nvvGQHjmTNnjEaj0Xj58mWjlZWV8eWXXzb73Zk7d64RMC5evDjDfjp06GC0srIyxsbGmq79+eefRkdHR2OjRo0eGdv9z3q/Pn36GAHj66+/brqWlpZmbN26tdHKysp45coV0/Vbt26Z1U1OTjZWrlzZ+NJLL5munTp1ymhhYWHs2LFjur8b9/8OGo1G4/bt242Acc+ePWbxlC5d2nQeExNjBIxhYWHpnun+933v70SPHj2MhQsXNiYlJZnueXl5GXv27GkEjF9++eVD34OISHZoqr+IiEge0Lx5c1xcXHB3d+eVV17BwcGBr776ihIlSgBgbW2NhcXd/+2npqZy7do1HBwcKF++PD/++KOpnXXr1lGkSBFef/31dH08ODU9K3r37o2jo6PpvEuXLri5ubF161YAoqOjOXXqFD179uTatWtcvXqVq1evcvPmTZo1a8a3335LWlqaWZu3b9/GxsYmw37Xr19PWloa3bp1M7V59epVXF1d8fLyYteuXWblk5OTgbvv61G+/PJLnJ2dadGihVmbL7zwAg4ODunaTElJMSt39erVx47o/vHHH3z88cdMnDgRBweHdP37+PhQoUIFszbvLe94sP/s2rFjB8nJyYwcOdL0uwMwcOBAnJyc2LJlyyPrpqam8s0339ChQwfKlCljuu7m5kbPnj3Zt28fCQkJ2Ypr+PDhpp8NBgPDhw8nOTmZHTt2mK7fm0UC8PfffxMfH0/Dhg3Nftc3bNhAWloakyZNMnu+e+3eLzO/Fzdu3AD+3yybx2nbti0Gg4FNmzYBsHfvXn7//Xe6d++eqfoiIlmhqf4iIiJ5wCeffIK3tzf58uWjWLFilC9f3iyZSUtL48MPP2TevHmcOXOG1NRU0737E5XY2FjKly9Pvnw5+08ELy8vs3ODwUC5cuVM68lPnToFQJ8+fR7ZRnx8PAULFjSdX716NV27Dzp16hRGo/GR5R6cqn5v7fqDyfaDbcbHx1O0aNGH3r98+bLZ+TfffIOLi0uGcT7onXfeoXjx4gwePDjdWu9Tp05x4sSJR7b5YP/ZdfbsWQDKly9vdt3KyooyZcqY7j/MlStXuHXrVrq6AD4+PqSlpXH+/HkqVaqUpZgsLCzMPkgA8Pb2BjDbm2Dz5s28++67REdHm+17cH9CHxsbi4WFBRUrVnxsv5n5vShfvjwFCxZk9uzZVKxY0TTVPyUl5aHl8+fPT69evVi8eDFdunRh8eLFdO7cGScnp8fGIyKSVUr8RURE8oDatWubdvV/mGnTpjFx4kT69evHlClTKFSoEBYWFowcOTLdSHpuuBfDe++9R/Xq1R9a5v6kKzk5mQsXLtCiRYvHtmswGPj666+xtLTMsE2AixcvAuDq6pphm0WLFmXFihUPvf9gQl6nTh3effdds2tz585l48aND61/4sQJIiIiWL58+UPX0KelpVGlShXmzJnz0Pru7u6PjP2/YO/evbRr145GjRoxb9483NzcyJ8/P+Hh4axcuTJbbWbm98LBwYHVq1fTr18/GjRoYHbvUR9w9OvXD19fX2JiYvjyyy9No/8iIjlNib+IiMh/wNq1a2natCmLFi
0yu379+nWKFCliOi9btiw//PADKSkpObJB3T33RvTvMRqNnD59mqpVq5r6BXBycqJ58+aPbe/IkSOkpKRk+GHHvXa
"text/plain": [
"<Figure size 1000x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Проверка на согласованность категорий\n",
"unique_sources = df3['Source'].unique()\n",
"unique_industries = df3['Industry'].unique()\n",
"\n",
"# Пример для визуального анализа\n",
"plt.figure(figsize=(10, 5))\n",
"sns.countplot(data=df3, y='Industry', order=df3['Industry'].value_counts().index, palette='coolwarm')\n",
"plt.title('Распределение по отраслям')\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для оценки покрытия мы смотрим на то, насколько разнообразны данные по странам, отраслям и возрастам."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Устранение проблемы пропущенных данных"
]
},
{
"cell_type": "code",
"execution_count": 249,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\salih\\AppData\\Local\\Temp\\ipykernel_13360\\207427758.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" df_fillna_mean[column].fillna(df_fillna_mean[column].mean(), inplace=True)\n"
]
},
{
"data": {
"text/plain": [
"(Name 0\n",
" Networth 0\n",
" Age 0\n",
" Country 0\n",
" Source 0\n",
" Industry 0\n",
" dtype: int64,\n",
" (2600, 6),\n",
" (2600, 6),\n",
" (2600, 6))"
]
},
"execution_count": 249,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"missing_values = df3.isnull().sum()\n",
"\n",
"df_dropna = df3.dropna()\n",
"\n",
"df_fillna_const = df3.fillna(0)\n",
"\n",
"df_fillna_mean = df3.copy()\n",
"for column in df_fillna_mean.select_dtypes(include=['float64', 'int64']):\n",
" df_fillna_mean[column].fillna(df_fillna_mean[column].mean(), inplace=True)\n",
"\n",
"missing_values, df_dropna.shape, df_fillna_const.shape, df_fillna_mean.shape\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 251,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((1560, 5), (520, 5), (520, 5))"
]
},
"execution_count": 251,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Разделим набор данных на признаки (X) и целевой признак (y)\n",
"X = df3.drop(columns=['Networth'])\n",
"y = df3['Networth']\n",
"\n",
"# Разделение на обучающую, контрольную и тестовую выборки\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
"\n",
"# Проверка размера выборок\n",
"(X_train.shape, X_val.shape, X_test.shape)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка сбалансированности выборок\n"
]
},
{
"cell_type": "code",
"execution_count": 252,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(count 1560.000000\n",
" mean 5.208173\n",
" std 12.653032\n",
" min 1.000000\n",
" 25% 1.500000\n",
" 50% 2.400000\n",
" 75% 4.300000\n",
" max 219.000000\n",
" Name: Networth, dtype: float64,\n",
" count 520.000000\n",
" mean 4.443654\n",
" std 7.267615\n",
" min 1.000000\n",
" 25% 1.500000\n",
" 50% 2.400000\n",
" 75% 4.825000\n",
" max 91.400000\n",
" Name: Networth, dtype: float64,\n",
" count 520.000000\n",
" mean 4.235577\n",
" std 5.861496\n",
" min 1.000000\n",
" 25% 1.600000\n",
" 50% 2.500000\n",
" 75% 4.500000\n",
" max 60.000000\n",
" Name: Networth, dtype: float64)"
]
},
"execution_count": 252,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Проверка распределения целевого признака по выборкам\n",
"train_dist = y_train.describe()\n",
"val_dist = y_val.describe()\n",
"test_dist = y_test.describe()\n",
"\n",
"train_dist, val_dist, test_dist\n"
]
},
{
"cell_type": "code",
"execution_count": 273,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размеры после oversampling: (15270, 19) (15270,)\n",
"Размеры после undersampling: (14965, 19) (14965,)\n"
]
}
],
"source": [
"oversampler = RandomOverSampler(random_state=12)\n",
"X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)\n",
"\n",
"undersampler = RandomUnderSampler(random_state=12)\n",
"X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)\n",
"\n",
"print(\"Размеры после oversampling:\", X_train_over.shape, y_train_over.shape)\n",
"print(\"Размеры после undersampling:\", X_train_under.shape, y_train_under.shape)"
]
  }
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}