AIM-PIbd-31-Razubaev-S-M/Lab2/lab2.ipynb

2443 lines
1.2 MiB
Plaintext
Raw Permalink Normal View History

2024-10-18 19:14:48 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Лабораторная 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Информация об экономике стран"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 111,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
" 'tradebalance', 'USTreasury'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"df = pd.read_csv(\".//static//scv//Economic Data - 9 Countries (1980-2020).csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы на русском:\n",
"'stock index' - индекс акций\n",
"'country' - страна\n",
"'year'- год\n",
"'index price' - индекс стоимости\n",
"'log_indexprice' - индексная цена журнала\n",
"'inflationrate' - ставка инфляции\n",
"'oil prices' - цена на нефть\n",
"'exchange_rate' - ставка обмена\n",
"'gdppercent' - процент ВВП\n",
"'percapitaincome' - доход на душу населения\n",
"'unemploymentrate' - уровень безработицы\n",
"'manufacturingoutput' - объем производства\n",
"'tradebalance' - торговый баланс\n",
"'USTreasury' - UST казначейство"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 112,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 369 entries, 0 to 368\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 stock index 369 non-null object \n",
" 1 country 369 non-null object \n",
" 2 year 369 non-null float64\n",
" 3 index price 317 non-null float64\n",
" 4 log_indexprice 369 non-null float64\n",
" 5 inflationrate 326 non-null float64\n",
" 6 oil prices 369 non-null float64\n",
" 7 exchange_rate 367 non-null float64\n",
" 8 gdppercent 350 non-null float64\n",
" 9 percapitaincome 368 non-null float64\n",
" 10 unemploymentrate 348 non-null float64\n",
" 11 manufacturingoutput 278 non-null float64\n",
" 12 tradebalance 365 non-null float64\n",
" 13 USTreasury 369 non-null float64\n",
"dtypes: float64(12), object(2)\n",
"memory usage: 40.5+ KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>stock index</th>\n",
" <th>country</th>\n",
" <th>year</th>\n",
" <th>index price</th>\n",
" <th>log_indexprice</th>\n",
" <th>inflationrate</th>\n",
" <th>oil prices</th>\n",
" <th>exchange_rate</th>\n",
" <th>gdppercent</th>\n",
" <th>percapitaincome</th>\n",
" <th>unemploymentrate</th>\n",
" <th>manufacturingoutput</th>\n",
" <th>tradebalance</th>\n",
" <th>USTreasury</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NASDAQ</td>\n",
" <td>United States of America</td>\n",
" <td>1980.0</td>\n",
" <td>168.61</td>\n",
" <td>2.23</td>\n",
" <td>0.14</td>\n",
" <td>21.59</td>\n",
" <td>1.0</td>\n",
" <td>0.09</td>\n",
" <td>12575.0</td>\n",
" <td>0.07</td>\n",
" <td>NaN</td>\n",
" <td>-13.06</td>\n",
" <td>0.11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NASDAQ</td>\n",
" <td>United States of America</td>\n",
" <td>1981.0</td>\n",
" <td>203.15</td>\n",
" <td>2.31</td>\n",
" <td>0.10</td>\n",
" <td>31.77</td>\n",
" <td>1.0</td>\n",
" <td>0.12</td>\n",
" <td>13976.0</td>\n",
" <td>0.08</td>\n",
" <td>NaN</td>\n",
" <td>-12.52</td>\n",
" <td>0.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NASDAQ</td>\n",
" <td>United States of America</td>\n",
" <td>1982.0</td>\n",
" <td>188.98</td>\n",
" <td>2.28</td>\n",
" <td>0.06</td>\n",
" <td>28.52</td>\n",
" <td>1.0</td>\n",
" <td>0.04</td>\n",
" <td>14434.0</td>\n",
" <td>0.10</td>\n",
" <td>NaN</td>\n",
" <td>-19.97</td>\n",
" <td>0.13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NASDAQ</td>\n",
" <td>United States of America</td>\n",
" <td>1983.0</td>\n",
" <td>285.43</td>\n",
" <td>2.46</td>\n",
" <td>0.03</td>\n",
" <td>26.19</td>\n",
" <td>1.0</td>\n",
" <td>0.09</td>\n",
" <td>15544.0</td>\n",
" <td>0.10</td>\n",
" <td>NaN</td>\n",
" <td>-51.64</td>\n",
" <td>0.11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NASDAQ</td>\n",
" <td>United States of America</td>\n",
" <td>1984.0</td>\n",
" <td>248.89</td>\n",
" <td>2.40</td>\n",
" <td>0.04</td>\n",
" <td>25.88</td>\n",
" <td>1.0</td>\n",
" <td>0.11</td>\n",
" <td>17121.0</td>\n",
" <td>0.08</td>\n",
" <td>NaN</td>\n",
" <td>-102.73</td>\n",
" <td>0.12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stock index country year index price log_indexprice \\\n",
"0 NASDAQ United States of America 1980.0 168.61 2.23 \n",
"1 NASDAQ United States of America 1981.0 203.15 2.31 \n",
"2 NASDAQ United States of America 1982.0 188.98 2.28 \n",
"3 NASDAQ United States of America 1983.0 285.43 2.46 \n",
"4 NASDAQ United States of America 1984.0 248.89 2.40 \n",
"\n",
" inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n",
"0 0.14 21.59 1.0 0.09 12575.0 \n",
"1 0.10 31.77 1.0 0.12 13976.0 \n",
"2 0.06 28.52 1.0 0.04 14434.0 \n",
"3 0.03 26.19 1.0 0.09 15544.0 \n",
"4 0.04 25.88 1.0 0.11 17121.0 \n",
"\n",
" unemploymentrate manufacturingoutput tradebalance USTreasury \n",
"0 0.07 NaN -13.06 0.11 \n",
"1 0.08 NaN -12.52 0.14 \n",
"2 0.10 NaN -19.97 0.13 \n",
"3 0.10 NaN -51.64 0.11 \n",
"4 0.08 NaN -102.73 0.12 "
]
},
2024-10-18 21:22:08 +04:00
"execution_count": 112,
2024-10-18 19:14:48 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.info()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Объект наблюдения - экономика\n",
"Атрибуты - содержит набор информации об обучении, такие как:\n",
"Фондовый рынок, ВВП, страна, год, стоимость топлива, уровень инфлции,уровень безработицы и так далее"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 113,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-10-18 20:20:52 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2kAAAIjCAYAAACZPFMYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd5xdVb3//9fe+/Qzc6aX1EmDVEIgKAmhRFpALCAWEDWUyxUuqBB/Fu7Xi4JXvXpFUAGxAEERFbhYKAIxEAQJLRAICemTOpnJ9DPltF1+f0wyMGRS5pxJZiZ5P32MZvZes9dnn52J53PWWp9leJ7nISIiIiIiIoOCOdABiIiIiIiIyLuUpImIiIiIiAwiStJEREREREQGESVpIiIiIiIig4iSNBERERERkUFESZqIiIiIiMggoiRNRERERERkEFGSJiIiIiIiMogoSRMRERERERlElKSJiIiIiIgMIkrSRERkQDz88MMYhtHr17Rp0wY6vCNWe3s73/72tznnnHMoLi7GMAwWLlw40GGJiBxRfAMdgIiIHNn+8z//k8mTJ3d//73vfW8Ao5GGhgZuvvlmRo8ezbHHHsuSJUsGOiQRkSOOkjQRERlQZ511FnPnzu3+/je/+Q0NDQ0DF9ARbtiwYezYsYPKykpee+01PvCBDwx0SCIiRxxNdxQRkQGRTqcBMM39/1/RwoULMQyDTZs2dR9zXZfp06fvMR3vrbfe4tJLL2XcuHGEQiEqKyu5/PLLaWxs7HHN73znO71OtfT53v38cu7cuUybNo1ly5Zx0kknEQ6HGTt2LHfdddce93LjjTcyc+ZMCgoKiEajnHLKKTz77LM92m3atKm7n7/85S89ziWTSYqKijAMgx//+Md7xFleXk4mk+nxM3/4wx+6r/fexPavf/0r5513HsOHDycYDDJ+/Hi++93v4jjOfl/rYDBIZWXlftuJiMjBo5E0EREZELuTtGAwmNXP/+53v2PFihV7HF+0aBEbN27ksssuo7KykpUrV/KrX/2KlStX8tJLL2EYRo/2v/jFL8jLy+v+/v1JY3NzMx/+8If59Kc/zcUXX8yDDz7I1VdfTSAQ4PLLLwcgHo/zm9/8hosvvpgrr7yStrY27r77bubNm8crr7zCjBkzelwzFApx7733cv7553cfe+SRR0gmk3u937a2Nh577DEuuOCC7mP33nsvoVBoj59buHAheXl5LFiwgLy8PJ555hluvPFG4vE4//u//7vXPkREZHBQkiYiIgOitbUVgHA43OefTaVS3HjjjZx77rn8/e9/73HuP/7jP/jqV7/a49isWbO4+OKLeeGFFzjllFN6nPvkJz9JaWnpXvuqqanhlltuYcGCBQB88Ytf5MQTT+SGG27g85//PH6/n6KiIjZt2kQgEOj+uSuvvJJJkybx85//nLvvvrvHNS+44AIeeugh6urqqKioAOCee+7hE5/4BA888ECvcVxwwQXcc8893Unali1bWLx4MZ/5zGf4wx/+0KPtAw880ON1veqqq7jqqqu48847+e///u+sE2MRETk0NN1RREQGxO7ph2VlZX3+2TvuuIPGxka+/e1v73HuvclJMpmkoaGBWbNmAfD666/3uS+fz8cXv/jF7u8DgQBf/OIX2blzJ8uWLQPAsqzuBM11XZqamrBtmxNOOKHXPo8//nimTp3K7373OwA2b97Ms88+y6WXXrrXOC6//HKefPJJamtrAbjvvvuYPXs2Rx999B5t3/satLW10dDQwCmnnEJnZyerV6/u82sgIiKHlpI0EREZEJs3b8bn8/U5SWttbeX73/8+CxYs6B6Feq+mpia+8pWvUFFRQTgcpqysjLFjx3b/bF8NHz6caDTa49juxOi9a+Tuu+8+pk+fTigUoqSkhLKyMh5//PG99nnZZZdx7733Al3TE0866SSOOuqovcYxY8YMpk2bxm9/+1s8z2PhwoVcdtllvbZduXIlF1xwAQ
UFBcRiMcrKyvjc5z4HZPcaiIjIoaUkTUREBsSaNWsYN25cj0IdB+KHP/whpmnyta99rdfzn/70p/n1r3/NVVddxSOPPMLTTz/Nk08+CXSNch0M999/P5deeinjx4/n7rvv5sknn2TRokWcfvrpe+3zc5/7HOvXr+ell17ivvvu22vC9V6XX3459957L8899xy1tbV8+tOf3qNNS0sLp512Gm+++SY333wzjz76KIsWLeKHP/whcPBeAxER6T9akyYiIodcKpVi+fLlPQpnHIiamhp++tOf8oMf/ID8/Pw9KjY2NzezePFibrrpJm688cbu4+vWrcs61pqaGjo6OnqMpq1duxaAMWPGAF0bc48bN45HHnmkR2GS3qZj7lZSUsLHPvax7qmTn/70p/e79cAll1zC1772Nb7yla/wyU9+kvz8/D3aLFmyhMbGRh555BFOPfXU7uPV1dUHdL8iIjLwNJImIiKH3AMPPEAqleKMM87o08/ddNNNVFRUcNVVV/V63rIsADzP63H8tttuyypOANu2+eUvf9n9fTqd5pe//CVlZWXMnDlzr/2+/PLLLF26dJ/Xvvzyy3nrrbf41Kc+1aPC5N4UFxfz8Y9/nLfeequ7suT79RZLOp3mzjvv3O/1RURkcNBImoiIHDIdHR38/Oc/5+abb8ayLDzP4/777+/Rpq6ujvb2du6//37OOuusHuvOnn76aX7/+9/3qKL4XrFYjFNPPZUf/ehHZDIZRowYwdNPP53TKNLw4cP54Q9/yKZNmzj66KP505/+xPLly/nVr36F3+8H4CMf+QiPPPIIF1xwAeeddx7V1dXcddddTJkyhfb29r1e+5xzzqG+vv6AErTdFi5cyB133LHXipQnnXQSRUVFzJ8/ny9/+csYhsHvfve7PRLXfbn99ttpaWmhpqYGgEcffZRt27YB8KUvfYmCgoIDvpaIiPSdkjQRETlk6uvrueGGG7q/f2/VxPf7/Oc/z7PPPtsjSZsxYwYXX3zxPvt44IEH+NKXvsQdd9yB53mcffbZ/P3vf2f48OFZxVxUVMR9993Hl770JX79619TUVHB7bffzpVXXtnd5tJLL6W2tpZf/vKXPPXUU0yZMoX777+fhx56iCVLluz12oZh7LP8f2/C4fA+ty0oKSnhscce46tf/Srf+ta3KCoq4nOf+xxnnHEG8+bNO6A+fvzjH7N58+bu7x955BEeeeQRoGstnZI0EZGDy/D68tGaiIhIDjZt2sTYsWN59tlnmTt3bs7tDra5c+fS0NDA22+/PWAxiIjIkUdr0kRERERERAYRJWkiInLI5OXlcckll/S6v1k27URERA5Hmu4oIiKyF5ruKCIiA0FJmoiIiIiIyCCi6Y4iIiIiIiKDiJI0ERERERGRQUT7pPUT13WpqakhPz8fwzAGOhwRERERERkgnufR1tbG8OHDMc2+j4spSesnNTU1jBo1aqDDEBERERGRQWLr1q2MHDmyzz+nJK2f5OfnA10PIhaL7bVdJpPh6aef5uyzz8bv9x+q8GSA6bkfmfTcj0x67kcePfMjk577kelAn3s8HmfUqFHdOUJfKUnrJ7unOMZisf0maZFIhFgspl/oI4ie+5FJz/3IpOd+5NEzPzLpuR+Z+vrcs10GpcIhIiIiIiIig4iSNBERERERkUFESZqIiIiIiMggoiRNRERERERkEFGSJiIiIiIiMogoSRMRERERERlElKSJiIiIiIgMIkrSREREREREBhElaSIiIiIiIoOIkjQREREREZFBREmaiIiIiIjIIKIkTUREREREZBBRkiYiB8zzEnhuM56XGehQRERERA5bvoEOQEQGP8/egpdaDOnXwLPBjEHwZAiegWHmDXR4IiIiIocVJWkisk9eZi1e+x3g1oJRBEYI3Ea8zj9A5m3I+xKGGRvoMEVEREQOG5ruKCJ75Xk2Xuf94O4EawJYZV2jaNYIsEZD5i285FMDHaaIiIjIYUVJmojsnb0anE
1dSZnxvn8ujCAYMUj9C8/tHJDwRERERA5HStJEZO+cevAyYIR7P2/kg9cGXvOhjUtERETkMKYkTUT2zgh0/a/n7KV
2024-10-18 19:14:48 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"\n",
"plt.scatter(df['inflationrate'], df['percapitaincome'], c=df['percapitaincome'], alpha=0.6)\n",
"\n",
2024-10-18 20:20:52 +04:00
"plt.title(\"Диаграмма 1\")\n",
2024-10-18 19:14:48 +04:00
"plt.ylabel(\"Доход на душу населения\")\n",
"plt.xlabel(\"Уровень инфляции\")\n",
"plt.grid(visible='true')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 114,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-10-18 20:20:52 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/IAAAIjCAYAAACgdyAGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACpxElEQVR4nOzdd3xTdfcH8M/N7l7pgFJoyxDK3lNARUEFBFTQx4H4/HAiKMgjOBAnoMCDAgou9HEiCgoqCDJUliDIKHu00N2me4/k/v5Ib0qhQFOS3uTm8369+pKmNzenoNiTc77nCKIoiiAiIiIiIiIit6CSOwAiIiIiIiIiqj8m8kRERERERERuhIk8ERERERERkRthIk9ERERERETkRpjIExEREREREbkRJvJEREREREREboSJPBEREREREZEbYSJPRERERERE5EaYyBMRERERERG5ESbyRERERERERG6EiTwREZFMvvvuOwiCUOdHhw4d5A7PY+3duxeTJk1C+/bt4ePjg+bNm2Ps2LE4efKk3KEREREBADRyB0BEROTpnn/+ebRr1872+RtvvCFjNDRv3jzs2LEDd999Nzp16oT09HQsWbIE3bp1w+7du/kmCxERyU4QRVGUOwgiIiJP9N133+Huu+/G1q1bMXjwYNvjgwcPhslkQnx8vHzBebCdO3eiR48e0Ol0tsdOnTqFjh074q677sIXX3whY3RERERsrSciIpJNRUUFAECluvr/jj/99FMIgoDExETbYxaLBZ06dYIgCPj0009tjx86dAgPPfQQYmNjYTAYEBERgYcffhjZ2dm17jl79uw62/o1mpqGvcGDB6NDhw7Yt28f+vXrBy8vL8TExGDZsmWXfC+zZs1C9+7dERAQAB8fH1x//fXYunVrresSExNtr/PDDz/U+lpZWRmCgoIgCALmz59/SZxhYWGorKys9Zyvv/7adj+TyWR7/Mcff8Ttt9+Opk2bQq/Xo2XLlnjttddgNpuv+nvdr1+/Wkk8ALRu3Rrt27fHsWPHrvp8IiIiZ2NrPRERkUykRF6v1zfo+Z9//jkOHz58yeObNm3C2bNnMWHCBERERODIkSP44IMPcOTIEezevRuCINS6/v3334evr6/t84vfWMjNzcVtt92GsWPH4t5778W3336Lxx9/HDqdDg8//DAAoKCgAB999BHuvfdeTJw4EYWFhfj4448xdOhQ7NmzB126dKl1T4PBgBUrVmDUqFG2x1avXo2ysrLLfr+FhYX46aefMHr0aNtjK1asgMFguOR5n376KXx9fTF16lT4+vpiy5YtmDVrFgoKCvD2229f9jUuRxRFZGRkoH379nY/l4iIyNGYyBMREckkPz8fAODl5WX3c8vLyzFr1izceuutWL9+fa2vPfHEE5g2bVqtx/r06YN7770X27dvx/XXX1/ra3fddReMRuNlXys1NRULFizA1KlTAQCPPvooevfujZkzZ+KBBx6AVqtFUFAQEhMTa1WyJ06ciLZt22Lx4sX4+OOPa91z9OjRWLVqFTIyMhAeHg4A+OSTTzBmzBh89dVXdcYxevRofPLJJ7ZE/vz589i8eTPGjRuHr7/+uta1X331Va3f18ceewyPPfYY3nvvPbz++ut2v3ny5ZdfIiUlBa+++qpdzyMiInIGttYTERHJRGp1Dw0Ntfu5S5cuRXZ2Nl5++eVLvnZhAltWVgaTyYQ+ffoAAPbv32/3a2k0Gjz66KO2z3U6HR599FFkZmZi3759AAC1Wm1L4i0WC3JyclBVVYUePXrU+ZrdunVD+/bt8fnnnwMAzp07h61bt+Khhx66bBwPP/wwNmzYgPT0dADAZ599hr59+6JNmzaXXHvh70FhYSFMJhOuv/56lJSU4Pjx43Z9/8ePH8eTTz6Jvn37Yvz48XY9l4iIyBmYyBMREcnk3Llz0Gg0dify+fn5ePPNNzF16lRbNftCOTk5mDJlCsLDw+Hl5YXQ0FDExM
TYnmuvpk2bwsfHp9ZjUvJ84Zn9zz77DJ06dYLBYEBISAhCQ0Px888/X/Y1J0yYgBUrVgCwtsL369cPrVu3vmwcXbp0QYcOHfC///0Poiji008/xYQJE+q89siRIxg9ejQCAgLg7++P0NBQ3H///QDs+z1IT0/H7bffjoCAAHz33XdQq9X1fi4REZGzMJEnIiKSyYkTJxAbG1truFx9zJs3DyqVCtOnT6/z62PHjsWHH36Ixx57DKtXr8bGjRuxYcMGANZquTN88cUXeOihh9CyZUt8/PHH2LBhAzZt2oQbb7zxsq95//334/Tp09i9ezc+++yzyyblF3r44YexYsUK/P7770hPT8fYsWMvuSYvLw+DBg3CwYMH8eqrr2LdunXYtGkT5s2bB6D+vwf5+fm49dZbkZeXhw0bNqBp06b1eh4REZGz8Yw8ERGRDMrLy3HgwIFaw97qIzU1Fe+88w7mzJkDPz+/SybR5+bmYvPmzXjllVcwa9Ys2+OnTp1qcKypqakoLi6uVZU/efIkACA6OhqAdZVebGwsVq9eXWuYXl2t/5KQkBCMHDnS1qY/duzYWpPn63Lfffdh+vTpmDJlCu666y74+fldcs22bduQnZ2N1atXY+DAgbbHExIS6vX9AtYjCSNGjMDJkyfx22+/IS4urt7PJSIicjZW5ImIiGTw1Vdfoby8HDfddJNdz3vllVcQHh6Oxx57rM6vS63foijWenzRokUNihMAqqqqsHz5ctvnFRUVWL58OUJDQ9G9e/fLvu5ff/2FXbt2XfHeDz/8MA4dOoS777671uT8ywkODsYdd9yBQ4cO2SbmX6yuWCoqKvDee+9d9f4AYDabMW7cOOzatQurVq1C37596/U8IiKixsKKPBERUSMqLi7G4sWL8eqrr0KtVkMURXzxxRe1rsnIyEBRURG++OIL3HzzzbXOwW/cuBFffvnlJXvOJf7+/hg4cCDeeustVFZWIjIyEhs3brSrGn2xpk2bYt68eUhMTESbNm2wcuVKHDhwAB988AG0Wi0AYPjw4Vi9ejVGjx6N22+/HQkJCVi2bBni4uJQVFR02XsPGzYMWVlZ9UriJZ9++imWLl162Un7/fr1Q1BQEMaPH4/JkydDEAR8/vnnl7y5cTnTpk3D2rVrMWLECOTk5Fzy5yOdtSciIpILE3kiIqJGlJWVhZkzZ9o+v3Aa/MUeeOABbN26tVYi36VLF9x7771XfI2vvvoKTz31FJYuXQpRFHHLLbdg/fr1DT7jHRQUhM8++wxPPfUUPvzwQ4SHh2PJkiWYOHGi7ZqHHnoI6enpWL58OX799VfExcXhiy++wKpVq7Bt27bL3lsQhCuuvquLl5fXFVf2hYSE4KeffsK0adPw4osvIigoCPfffz9uuukmDB069Kr3P3DgAABg3bp1WLdu3SVfZyJPRERyE8T6vj1NRERE1ywxMRExMTHYunUrBg8efM3XOdvgwYNhMpkQHx8vWwxERERUG8/IExEREREREbkRJvJERESNyNfXF/fdd1+d+98bch0RERF5HrbWExER0WWxtZ6IiMj1MJEnIiIiIiIiciNsrSciIiIiIiJyI0zkiYiIiIiIiNwI98jXwWKxIDU1FX5+fhAEQe5wiIiIiIiISOFEUURhYSGaNm0KlerKNXcm8nVITU1FVFSU3GEQERERERGRh0lKSkKzZs2ueA0T+Tr4+fkBsP4G+vv7yxwNERERERERKV1BQQGioqJs+eiVMJGvg9RO7+/vz0SeiIiIiIiIGk19jndz2B0RERERERGRG2EiT0RERERERORGmMgTERERERERuREm8kRERERERERuhIk8ERERERERkRthIk9ERERERETkRpjIExEREREREbkRJvJEREREREREboSJPBEREREREZEbYSJPRERERERE5EaYyBMRERERERG5ESbyRERERERERG6EiTwRERERERGRG2EiT0RERERERORGXCKRX7p0KaKjo2EwGN
C7d2/s2bPnstceOXIEd955J6KjoyEIAhYtWlTndSkpKbj//vsREhICLy8vdOzYEX///beTvgMiIiIiIiKixiF7Ir9
2024-10-18 19:14:48 +04:00
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
2024-10-18 21:22:08 +04:00
"df_dependence = df.groupby('gdppercent')['unemploymentrate'].mean().reset_index()\n",
2024-10-18 19:14:48 +04:00
"\n",
"plt.figure(figsize=(12, 6))\n",
"\n",
2024-10-18 21:22:08 +04:00
"plt.plot(df_dependence['gdppercent'], df_dependence['unemploymentrate'], marker='.')\n",
2024-10-18 19:14:48 +04:00
"\n",
"plt.title(\"Диаграмма 2\")\n",
2024-10-18 20:20:52 +04:00
"plt.xlabel(\"Уровень ВВП\")\n",
"plt.ylabel(\"Уровень безработицы\")\n",
2024-10-18 19:14:48 +04:00
"\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Присутствует связь между атрибутами, уровень инфляции влияет и зависит от многих атрибутов.\n",
"Для примера на графике приведена связь между инфляцией и доходом на душу населения. На втором графике показана связь уровня ВВП и безработицы\n",
"Примеры бизнес целей\n",
"\n",
" 1.Прогнозирование уровня инфляции на основе уровня ВВП.\n",
" 2.Наблюдение за изменениями уровня безработицы с уровнем ВВП.\n",
" \n",
"Эффект для бизнеса: влияние на инвестиции индекса акций и цен на нефть, исследование влияния фондового индекса на инвестиции, исследования инфляции и покупательской способности.\n",
"Цели технического проекта\n",
"\n",
"Для первой цели:\n",
"\n",
"Вход: Доход на душу населения\n",
"Целевой признак: Уровень инфляции.\n",
"\n",
"Для второй цели:\n",
"\n",
"Вход: Уровень безработицы\n",
"Целевой признак: Уровень ВВП"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка на выбросы"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 115,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пропущенные значения по столбцам:\n",
"stock index 0\n",
"country 0\n",
"year 0\n",
"index price 52\n",
"log_indexprice 0\n",
"inflationrate 43\n",
"oil prices 0\n",
"exchange_rate 2\n",
"gdppercent 19\n",
"percapitaincome 1\n",
"unemploymentrate 21\n",
"manufacturingoutput 91\n",
"tradebalance 4\n",
"USTreasury 0\n",
"dtype: int64\n",
"\n",
"Статистический обзор данных:\n",
" year index price log_indexprice inflationrate oil prices \\\n",
"count 369.000000 317.000000 369.000000 326.000000 369.000000 \n",
"mean 2000.000000 7898.648297 3.610542 0.041748 39.743171 \n",
"std 11.848225 7811.336862 0.482481 0.039579 25.452654 \n",
"min 1980.000000 168.610000 2.230000 -0.040000 11.350000 \n",
"25% 1990.000000 2407.100000 3.320000 0.020000 19.410000 \n",
"50% 2000.000000 5160.100000 3.600000 0.030000 28.520000 \n",
"75% 2010.000000 10279.500000 3.980000 0.057500 57.880000 \n",
"max 2020.000000 47751.330000 4.680000 0.240000 98.560000 \n",
"\n",
" exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
"count 367.000000 350.000000 368.000000 348.000000 \n",
"mean 27.897548 0.037114 20719.964674 0.068908 \n",
"std 49.620521 0.037850 17435.037783 0.043207 \n",
"min 0.900000 -0.110000 27.000000 0.020000 \n",
"25% 1.330000 0.020000 2090.250000 0.040000 \n",
"50% 5.440000 0.030000 19969.500000 0.060000 \n",
"75% 15.055000 0.060000 36384.000000 0.090000 \n",
"max 249.050000 0.150000 65280.000000 0.260000 \n",
"\n",
" manufacturingoutput tradebalance USTreasury \n",
"count 278.000000 365.000000 369.000000 \n",
"mean 328.084820 -15.996384 0.059024 \n",
"std 622.395923 154.557170 0.033086 \n",
"min 0.590000 -770.930000 0.010000 \n",
"25% 80.380000 -25.370000 0.030000 \n",
"50% 188.160000 -0.140000 0.050000 \n",
"75% 271.977500 19.080000 0.080000 \n",
"max 3868.460000 366.140000 0.140000 \n"
]
}
],
"source": [
"null_values = df.isnull().sum()\n",
"print(\"Пропущенные значения по столбцам:\")\n",
"print(null_values)\n",
"\n",
"stat_summary = df.describe()\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"print(stat_summary)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"На основе данных выше можно выделить большое количество столбцов с пропущенными значениями\n",
"Также проверим данные на выбросы и дубликаты:"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 116,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Коэффициент асимметрии для столбца 'year': 0.0\n",
"\n",
"Коэффициент асимметрии для столбца 'index price': 1.7605604508668822\n",
"\n",
"Коэффициент асимметрии для столбца 'log_indexprice': -0.23716751168770417\n",
"\n",
"Коэффициент асимметрии для столбца 'inflationrate': 1.5616085380027898\n",
"\n",
"Коэффициент асимметрии для столбца 'oil prices': 0.9915046764713877\n",
"\n",
"Коэффициент асимметрии для столбца 'exchange_rate': 2.1575952097650455\n",
"\n",
"Коэффициент асимметрии для столбца 'gdppercent': -0.038272329611460466\n",
"\n",
"Коэффициент асимметрии для столбца 'percapitaincome': 0.3051430219264069\n",
"\n",
"Коэффициент асимметрии для столбца 'unemploymentrate': 1.8092896369785585\n",
"\n",
"Коэффициент асимметрии для столбца 'manufacturingoutput': 4.195480293406057\n",
"\n",
"Коэффициент асимметрии для столбца 'tradebalance': -2.266183907194849\n",
"\n",
"Коэффициент асимметрии для столбца 'USTreasury': 0.6687596580836408\n",
"\n",
"Количество дубликатов: 0\n"
]
}
],
"source": [
"for column in df.select_dtypes(include=[np.number]).columns:\n",
" skewness = df[column].skew()\n",
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")\n",
"\n",
"duplicates = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"На основе данных выше можно сказать, что для столбца объем производства присутствует выброс.\n",
"Удаляем все найденные пустые значения."
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 117,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"В наборе данных 'Economic' было удалено 150 строк с пустыми значениями.\n"
]
}
],
"source": [
"def drop_missing_values(dataframe, name):\n",
" before_shape = dataframe.shape \n",
" cleaned_dataframe = dataframe.dropna() \n",
" after_shape = cleaned_dataframe.shape \n",
" print(f\"В наборе данных '{name}' было удалено {before_shape[0] - after_shape[0]} строк с пустыми значениями.\")\n",
" return cleaned_dataframe\n",
"\n",
"cleaned_df = drop_missing_values(df, \"Economic\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Очистка данных от шумов:"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 118,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2gAAAIjCAYAAAB2/jgmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAByTElEQVR4nO3deXxU1f3/8fdMQhKWZJIAyQQaICyKkc2AQFCECpaIgtTWhYIiWlBcqnUFW42xtWipuxaU/gQV91b9itooRVGxwSiLGgJUMAGUhEACSViSQOb8/kgzZcgkmaxzJ3k9H495wNx77rmfc+bO5H7m3jnHZowxAgAAAAD4nd3fAQAAAAAAqpCgAQAAAIBFkKABAAAAgEWQoAEAAACARZCgAQAAAIBFkKABAAAAgEWQoAEAAACARZCgAQAAAIBFkKABAAAAgEWQoAEAALRhP/zwg5YvX+5+npubq5deesl/AQGoEwkaAEv7+9//LpvN5vUxaNAgf4cHAJZns9l0ww036IMPPlBubq7uvPNOffbZZ/4OC0Atgv0dAAD44u6779Zpp53mfv7AAw/4MRoACBw9e/bUnDlzlJKSIkmKi4vTmjVr/BsUgFrZjDHG30EAQG3+/ve/65JLLtHHH3+s8ePHu5ePHz9e+/fvV1ZWlv+CA4AAsmPHDu3fv1+DBg1S586d/R0OgFpwiyMAS6uoqJAk2e31f1wtX75cNptNubm57mUul0tDhgyRzWbz+A3GN998o6uuukp9+/ZVWFiYnE6nrr76ahUWFnrUed9993m9vTI4+H83IIwfP16DBg3S+vXrNWbMGHXs2FEJCQlasmRJjbbce++9Gj58uBwOhzp37qyxY8fq448/9iiXm5vr3s/bb7/tsa6srExRUVGy2Wz6y1/+UiPOmJgYHTt2zGObV155xV3f/v373cv/7//+TxdccIF69Oih0NBQ9evXT3/4wx9UWVlZb19X72/r1q269NJLFRERoa5du+rmm29WWVmZR9lly5bp3HPPVUxMjEJDQ5WYmKjFixd7rfef//ynxo0bp/DwcEVEROjMM8/Uyy+/7FHmiy++0OTJkxUVFaXOnTtryJAhevzxxz3KbN26Vb/85S8VHR2tsLAwjRgxQu+8845HmYYcL1dddZXH6x8VFaXx48fXuE3M1z6tPmZO9pe//KVGTH369NFVV13lUe6NN96QzWZTnz59PJYXFBTommuuUa9evRQUFOSOt0uXLjX2dbI+ffrUejuxzWarUX7FihUaPny4OnbsqOjoaF1++eXavXu313bW996QpPLycqWmpqp///4KDQ1VfHy87rzzTpWXl9cou2bNGp/jPFn1seut/Sf2c0OOD0nu90L37t3VsWNHnXrqqfrd737nsc+6HtVXtMaPH+/xZZRUdceA3W6v8V5444033K9Bt27dNHPmTP34448eZa666ir3cdKvXz+NGjVKRUVF6tixY432AbAGbnEEYGnVCVpoaGijtn/xxRf17bff1li+atUqff/995o9e7acTqc2b96sZ599Vps3b9a6detqnMAtXrzY4yT35ITxwIEDmjx5si699FJNnz5dr7/+uubNm6eQkBBdffXVkqSSkhL97W9/0/Tp0zVnzhyVlpbq//2//6dJkyYpMzNTw4YN86gzLCxMy5Yt07Rp09zL3nzzzRoJ0IlKS0v17rvv6uc//7l72bJlyxQWFlZju+XLl6tLly669dZb1aVLF3300Ue69957VVJSokWLFtW6jxNdeuml6tOnjxYuXKh169bpiSee0IEDB/TCCy949N3pp5+uqVOnKjg4WCtXrtT1118vl8ulG264wSOeq6++WqeffroWLFigyMhIbdy4Uenp6frVr34lqep1u/DCCxUXF6ebb75ZTqdTW7Zs0bvvvqubb75ZkrR582adddZZ6tmzp+bPn6/OnTvr9ddf17Rp0/SPf/zDo29OVtvxIkndunXTo48+Kqlq0I
XHH39ckydP1u7duxUZGdlsfVqf48ePu0/8TzZr1iz961//0k033aShQ4cqKChIzz77rDZs2OBT3cOGDdNtt93mseyFF17QqlWrPJY98MADuueee3TppZfq17/+tfbt26cnn3xS55xzjjZu3OjuD8m394bL5dLUqVO1du1azZ07V6eddpq+/fZbPfroo/rPf/5T44uKar/5zW905pln1hpnc6vt+Pjmm280duxYdejQQXPnzlWfPn20Y8cOrVy5Ug888IAuvvhi9e/f313+t7/9rU477TTNnTvXvezEW7hPtGzZMv3+97/Xww8/7H4fSFXH2uzZs3XmmWdq4cKF2rt3rx5//HF9/vnnNV6Dk9177711fo4A8DMDABb22GOPGUnm66+/9lg+btw4c/rpp3ssW7ZsmZFkcnJyjDHGlJWVmV69epnzzz/fSDLLli1zlz1y5EiNfb3yyitGkvn000/dy1JTU40ks2/fvlpjHDdunJFkHn74Yfey8vJyM2zYMBMTE2MqKiqMMcYcP37clJeXe2x74MABExsba66++mr3spycHCPJTJ8+3QQHB5v8/Hz3ugkTJphf/epXRpJZtGhRjTinT59uLrzwQvfynTt3GrvdbqZPn16jHd764NprrzWdOnUyZWVltbb3xP1NnTrVY/n1119f4/Xytp9JkyaZvn37up8fPHjQhIeHm1GjRpmjR496lHW5XMaYqv5LSEgwvXv3NgcOHPBaxpiqPho8eLBHG1wulxkzZowZMGCAe1lDjpdZs2aZ3r17e+zz2WefNZJMZmZmnW311qfejl9jjFm0aJFHTMYY07t3bzNr1iz387/+9a8mNDTU/PSnP/WI6ejRo8Zut5trr73Wo85Zs2aZzp0719jXyXr37m0uuOCCGstvuOEGc+LpQm5urgkKCjIPPPCAR7lvv/3WBAcHeyz39b3x4osvGrvdbj777DOPOpcsWWIkmc8//9xj+Ycffmgkmb///e+1xlmbtLQ0I8njmKlu/4n93JDj45xzzjHh4eFm586dHnWevI/a9nWicePGmXHjxhljjHnvvfdMcHCwue222zzKVFRUmJiYGDNo0CCP98u7775rJJl7773XvezkYzcrK8vY7XZ3O0481gBYA7c4ArC06lsOu3fv3uBtn376aRUWFio1NbXGuo4dO7r/X1ZWpv3792v06NGS5PPVhhMFBwfr2muvdT8PCQnRtddeq4KCAq1fv16SFBQUpJCQEElVVwyKiop0/PhxjRgxwus+k5KSdPrpp+vFF1+UJO3cuVMff/xxjdvdTnT11VcrPT1d+fn5kqTnn39eycnJOuWUU2qUPbEPSktLtX//fo0dO1ZHjhzR1q1bfWr3iVfAJOmmm26SJL3//vte91NcXKz9+/dr3Lhx+v7771VcXCyp6spYaWmp5s+fr7CwMI86q69mbty4UTk5ObrllltqXB2oLlNUVKSPPvpIl156qbtN+/fvV2FhoSZNmqTvvvuuxi1g1eo6XqSq16y6vk2bNumFF15QXFycx5WPhvRpZWWlu77qx5EjR7zuu9qRI0d0//3368Ybb1SvXr081h0+fFgul0tdu3ats46mevPNN+VyuXTppZd6xO50OjVgwIAat+z68t544403dNppp2ngwIEedZ577rmSVKPO6qs/Jx8rvoiJiZFUdRW0IWo7Pvbt26dPP/1UV199dY3XxJdbLmuTmZmpSy+9VL/4xS9qXH396quvVFBQoOuvv96jDy644AINHDhQ7733Xq31LliwQElJSbrkkksaHRuAlsUtjgAsbefOnQoODm5wglZcXKw//elPuvXWWxUbG1tjfVFRkdLS0vTqq6+qoKCgxrYN1aNHjxo/uq9OinJzc93J3/PPP6+HH35YW7du9fitWEJCgtd6Z8+erWeffVa33367li9frjFjxmjAgAG1xjFs2DANGjRIL7zwgu644w4tX75cd999d43fBklVtwL+/ve/10cffaSSkhKPdb
72wcmx9OvXT3a73eN3LZ9//rlSU1OVkZFRIwEpLi6Ww+HQjh07JKnOqRN8KbN9+3YZY3TPPffonnvu8VqmoKBAPXv
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы в датасете:\n",
" stock index country year index price log_indexprice inflationrate \\\n",
"229 SZCOMP China 2004.0 1467.57 3.17 0.04 \n",
"230 SZCOMP China 2005.0 1144.54 3.06 0.02 \n",
"231 SZCOMP China 2006.0 1687.14 3.23 0.02 \n",
"232 SZCOMP China 2007.0 4329.44 3.64 0.05 \n",
"233 SZCOMP China 2008.0 2912.90 3.46 0.06 \n",
"234 SZCOMP China 2009.0 2737.01 3.44 -0.01 \n",
"235 SZCOMP China 2010.0 2795.88 3.45 0.03 \n",
"236 SZCOMP China 2011.0 2639.19 3.42 0.06 \n",
"237 SZCOMP China 2012.0 2211.11 3.34 0.03 \n",
"238 SZCOMP China 2013.0 2182.52 3.34 0.03 \n",
"239 SZCOMP China 2014.0 2279.75 3.36 0.02 \n",
"240 SZCOMP China 2015.0 3657.40 3.56 0.01 \n",
"241 SZCOMP China 2016.0 2978.14 3.47 0.02 \n",
"242 SZCOMP China 2017.0 3257.35 3.51 0.02 \n",
"243 SZCOMP China 2018.0 2920.18 3.47 0.02 \n",
"244 SZCOMP China 2019.0 2928.94 3.47 0.03 \n",
"245 SZCOMP China 2020.0 3109.78 3.49 0.02 \n",
"271 DAX 30 Germany 2005.0 5408.25 3.73 0.02 \n",
"272 DAX 30 Germany 2006.0 6596.91 3.82 0.02 \n",
"273 DAX 30 Germany 2007.0 8067.31 3.91 0.02 \n",
"274 DAX 30 Germany 2008.0 4810.20 3.68 0.03 \n",
"276 DAX 30 Germany 2010.0 6914.19 3.84 0.01 \n",
"277 DAX 30 Germany 2011.0 5898.35 3.77 0.02 \n",
"280 DAX 30 Germany 2014.0 9805.55 3.99 0.01 \n",
"281 DAX 30 Germany 2015.0 10743.01 4.03 0.01 \n",
"283 DAX 30 Germany 2017.0 12917.64 4.11 0.02 \n",
"284 DAX 30 Germany 2018.0 10558.96 4.02 0.02 \n",
"285 DAX 30 Germany 2019.0 13249.01 4.12 0.01 \n",
"286 DAX 30 Germany 2020.0 13718.78 4.14 0.01 \n",
"\n",
" oil prices exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
"229 43.15 8.28 0.10 1509.0 0.04 \n",
"230 59.41 8.19 0.11 1753.0 0.04 \n",
"231 61.96 7.97 0.13 2099.0 0.04 \n",
"232 91.69 7.61 0.14 2694.0 0.04 \n",
"233 41.12 6.95 0.10 3468.0 0.04 \n",
"234 74.47 6.83 0.09 3832.0 0.04 \n",
"235 89.15 6.77 0.11 4550.0 0.04 \n",
"236 98.56 6.46 0.10 5618.0 0.04 \n",
"237 87.86 6.31 0.08 6317.0 0.04 \n",
"238 97.63 6.15 0.08 7051.0 0.05 \n",
"239 59.29 6.16 0.07 7679.0 0.05 \n",
"240 37.19 6.28 0.07 8067.0 0.05 \n",
"241 51.97 6.64 0.07 8148.0 0.05 \n",
"242 57.88 6.76 0.07 8879.0 0.04 \n",
"243 49.52 6.61 0.07 9977.0 0.04 \n",
"244 59.88 6.91 0.06 10217.0 0.05 \n",
"245 47.02 6.90 0.02 10500.0 0.05 \n",
"271 59.41 1.24 0.01 34520.0 0.12 \n",
"272 61.96 1.26 0.04 36354.0 0.11 \n",
"273 91.69 1.37 0.03 41640.0 0.09 \n",
"274 41.12 1.47 0.01 45613.0 0.08 \n",
"276 89.15 1.33 0.04 41572.0 0.08 \n",
"277 98.56 1.39 0.04 46706.0 0.07 \n",
"280 59.29 1.33 0.02 48024.0 0.07 \n",
"281 37.19 1.11 0.01 41103.0 0.06 \n",
"283 57.88 1.13 0.03 44553.0 0.06 \n",
"284 49.52 1.18 0.01 47811.0 0.05 \n",
"285 59.88 1.12 0.01 46468.0 0.05 \n",
"286 47.02 1.14 -0.05 45724.0 0.06 \n",
"\n",
" manufacturingoutput tradebalance USTreasury \n",
"229 625.22 51.17 0.04 \n",
"230 733.66 124.63 0.04 \n",
"231 893.13 208.92 0.05 \n",
"232 1149.72 308.04 0.05 \n",
"233 1475.66 348.83 0.04 \n",
"234 1611.95 220.13 0.03 \n",
"235 1924.32 222.40 0.03 \n",
"236 2421.37 180.89 0.03 \n",
"237 2690.09 231.87 0.02 \n",
"238 2935.34 234.87 0.02 \n",
"239 3184.24 221.55 0.03 \n",
"240 3202.50 358.84 0.02 \n",
"241 3153.12 255.48 0.02 \n",
"242 3460.33 215.70 0.02 \n",
"243 3868.46 106.71 0.03 \n",
"244 3823.41 164.99 0.02 \n",
"245 3853.81 366.14 0.01 \n",
"271 571.36 148.05 0.04 \n",
"272 618.70 162.20 0.05 \n",
"273 714.38 231.95 0.05 \n",
"274 750.91 227.47 0.04 \n",
"276 669.57 178.90 0.03 \n",
"277 758.60 184.02 0.03 \n",
"280 786.55 257.40 0.03 \n",
"281 683.20 255.02 0.02 \n",
"283 752.02 257.66 0.02 \n",
"284 795.96 243.72 0.03 \n",
"285 737.94 223.82 0.02 \n",
"286 678.29 221.53 0.01 \n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2IAAAIjCAYAAABh3KjvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB0Y0lEQVR4nO3deXxU1f3/8fdMQhICWYGQYDEJoNWIgKBAVMQqCi4o1q1UFJcixaVuVcFWI1rFre4WFH8Ciitt9SvWpiIqKgViWVSI+EUNiJIQIWRhCYHM+f2R70wzyUwyM5m5s+T1fDzygNw5997PWe5kPrk359iMMUYAAAAAAMvYwx0AAAAAAHQ2JGIAAAAAYDESMQAAAACwGIkYAAAAAFiMRAwAAAAALEYiBgAAAAAWIxEDAAAAAIuRiAEAAACAxUjEAAAAAMBiJGIAAAAAYDESMQBR669//atsNpvHr4EDB4Y7PAAAAK/iwx0AAHTUHXfcoSOPPNL1/X333RfGaAAAANpHIgYg6p122mk6+eSTXd8///zz2rFjR/gCAgAAaAePJgKIWg0NDZIku739t7L58+fLZrNp8+bNrm0Oh0ODBg2SzWbT/PnzXdu/+OILXX755erXr5+SkpKUnZ2tK6+8Ujt37nQ75t133+3xscj4+P/+juvkk0/WwIEDtXr1ah1//PHq2rWr8vPzNWfOnFZ1ueuuuzRs2DClpaWpW7duGjVqlD788EO3cps3b3ad56233nJ7rb6+XhkZGbLZbHrkkUdaxZmVlaUDBw647fPqq6+6jtc8ef2f//kfnXXWWerTp48SExPVv39/3XvvvWpsbGy3rZ3n27hxoy666CKlpqaqR48euuGGG1RfX+9Wdt68eTrllFOUlZWlxMREFRQUaPbs2R6P+89//lOjR49WSkqKUlNTddxxx+mVV15xK7Nq1SqdeeaZysjIULdu3TRo0CA98cQTbmU2btyoCy64QJmZmUpKStKxxx6rt99+262MP+Pl8ssvd+v/jIwMnXzyyfrkk0/cjulrmzrHTEuPPPJIq5jy8vJ0+eWXu5VbtGiRbDab8vLy3LZXVlbqqquu0qGHHqq4uDhXvN27d291rpby8vK8PgZss9ncyh48eFD33nuv+vfvr8TEROXl5emOO+7Q/v37Wx3Xlz5tPubbOq/D4dDjjz+uo446SklJSerdu7emTp2qXbt2+VS/lu340UcfyWaz6aOPPnJtO/nkk91+6SNJn332mcd4JGnhwoUaPny4kpOTlZGRoZNOOknvvfee65xttamz/5z1bz7m6urqNGzYMOXn56u8vNxrOUm69tprZbPZWtUPQPhxRwxA1HImYomJiQHt/9JLL+nLL79stX3JkiX67rvvdMUVVyg7O1sbNmzQc889pw0bNmjlypWtPnDNnj3b7cNsy8Rw165dOvPMM3XRRRdp4sSJeuONNzRt2jQlJCToyiuvlCTV1tbq+eef18SJEzVlyhTV1dXp//2//6exY8eqpKREQ4YMcTtmUlKS5s2bpwkTJri2/f3vf2+V6DRXV1end955R+edd55r27x585SUlNRqv/nz56t79+66+eab1b17d33wwQe66667VFtbq4cfftjrOZq76KKLlJeXp1mzZmnlypV68skntWvXLr344otubXfUUUfpnHPOUXx8vBYvXqxrrrlGDodD1157rVs8V155pY466ijNmDFD6enpWrt2rYqLi/XrX/9aUlO/nX322crJydENN9yg7OxsffXVV3rnnXd0ww03SJI2bNigE044QYcccoimT5+ubt266Y033tCECRP0t7/9za1tWvI2XiSpZ8+eeuyxxyRJP/zwg5544gmdeeaZ2rp1q9LT04PWpu05ePCg/vCHP3h8bfLkyXr//fd1/fXXa/DgwYqLi9Nzzz2nNWvW+HTsIUOG6JZbbnHb9uKLL2rJkiVu237zm99owYIFuuCCC3TLLbdo1apVmjVrlr
766iu9+eabrnK+9GlzV199tUaNGiWpaaw3P5YkTZ06VfPnz9cVV1yh3/3udyorK9PTTz+ttWvXavny5erSpYtP9fTX7bff7nH7zJkzdffdd+v444/XPffco4SEBK1atUoffPCBTj/9dD3++OPavXu3JOmrr77S/fff7/aYtbcE+cCBAzr//PP1/fffa/ny5crJyfEa2zfffKO5c+d2sIYAQsYAQJR6/PHHjSTz+eefu20fPXq0Oeqoo9y2zZs3z0gyZWVlxhhj6uvrzaGHHmrOOOMMI8nMmzfPVXbv3r2tzvXqq68aSebjjz92bSsqKjKSzE8//eQ1xtGjRxtJ5s9//rNr2/79+82QIUNMVlaWaWhoMMYYc/DgQbN//363fXft2mV69+5trrzySte2srIyI8lMnDjRxMfHm4qKCtdrp556qvn1r39tJJmHH364VZwTJ040Z599tmv7li1bjN1uNxMnTmxVD09tMHXqVJOcnGzq6+u91rf5+c455xy37ddcc02r/vJ0nrFjx5p+/fq5vq+urjYpKSlmxIgRZt++fW5lHQ6HMaap/fLz801ubq7ZtWuXxzLGNLXR0Ucf7VYHh8Nhjj/+eHPYYYe5tvkzXiZPnmxyc3Pdzvncc88ZSaakpKTNunpqU0/j1xhjHn74YbeYjDEmNzfXTJ482fX9X/7yF5OYmGh+8YtfuMW0b98+Y7fbzdSpU92OOXnyZNOtW7dW52opNzfXnHXWWa22X3vttab5R4l169YZSeY3v/mNW7nf//73RpL54IMPjDG+9anTpk2bjCSzYMEC1zbnGHP65JNPjCTz8ssvu+1bXFzscXtL+fn55rLLLnPb9uGHHxpJ5sMPP3RtGz16tBk9erTr+3fffddIMuPGjXOLZ9OmTcZut5vzzjvPNDY2tlk/b+dycl7z8+bNMw6Hw1xyySUmOTnZrFq1yms5p4suusgMHDjQ9O3b122cAIgMPJoIIGo5HxXs1auX3/s+88wz2rlzp4qKilq91rVrV9f/6+vrtWPHDo0cOVKSfL570Fx8fLymTp3q+j4hIUFTp05VZWWlVq9eLUmKi4tTQkKCpKZHrKqqqnTw4EEde+yxHs85dOhQHXXUUXrppZckSVu2bNGHH37Y5uNHV155pYqLi1VRUSFJWrBggQoLC3X44Ye3Ktu8Derq6rRjxw6NGjVKe/fu1caNG32qd/M7WpJ0/fXXS5Leffddj+epqanRjh07NHr0aH333XeqqamR1HSnq66uTtOnT1dSUpLbMZ13J9euXauysjLdeOONrjtQLctUVVXpgw8+0EUXXeSq044dO7Rz506NHTtWmzZt0o8//uixLm2NF6mpz5zHW7dunV588UXl5OS4TSLjT5s2Nja6juf82rt3r8dzO+3du1f33HOPrrvuOh166KFur+3Zs0cOh0M9evRo8xgd5ezbm2++2W27807aP/7xD0m+9amTL3e+Fy1apLS0NJ122mlubTZs2DB179691SO+LWVlZemHH37woYb/ZYzRjBkzdP7552vEiBFur7311ltyOBy66667Wt0h9/QIo69uvfVWvfzyy3rjjTc0fPjwNsuuXr1aixYt0qxZs3x6fBuA9bgyAUStLVu2KD4+3u9ErKamRvfff79uvvlm9e7du9XrVVVVuuGGG9S7d2917dpVvXr1Un5+vmtff/Xp00fdunVz2+ZMfpr/vc+CBQs0aNAgJSUlqUePHurVq5f+8Y9/eD3nFVdcoXnz5klqeszr+OOP12GHHeY1jiFDhmjgwIF68cUXZYxxPcblyYYNG3TeeecpLS1Nqamp6tWrlyZNmiTJ9zZoGUv//v1lt9vd6rx8+XKNGTNG3bp1U3p6unr16qU77rjD7TzffvutJLW5JIEvZb755hsZY3TnnXeqV69ebl/OBKuysrLVfu2NF0naunWr61jHHHOMvv32W/3tb39ze7zMnzbduHGj1xi9efTRR1VfX+9qv+Z69Oihww47TM
8//7zee+89VVZWaseOHR7/bqsjtmzZIrvdrgEDBrhtz87OVnp6urZs2SLJt/5yqq6uluT9UT1J2rRpk2pqapSVldW
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(cleaned_df['manufacturingoutput'], cleaned_df['gdppercent'])\n",
"plt.xlabel('Объем производства')\n",
"plt.ylabel('ВВП')\n",
"plt.title('Диаграмма рассеивания перед чисткой')\n",
"plt.show()\n",
"\n",
"Q1 = cleaned_df[\"manufacturingoutput\"].quantile(0.25)\n",
"Q3 = cleaned_df[\"manufacturingoutput\"].quantile(0.75)\n",
"\n",
"IQR = Q3 - Q1\n",
"\n",
"threshold = 1.5 * IQR\n",
"lower_bound = Q1 - threshold\n",
"upper_bound = Q3 + threshold\n",
"\n",
"outliers = (cleaned_df[\"manufacturingoutput\"] < lower_bound) | (cleaned_df[\"manufacturingoutput\"] > upper_bound)\n",
"\n",
"# Вывод выбросов\n",
"print(\"Выбросы в датасете:\")\n",
"print(cleaned_df[outliers])\n",
"\n",
"# Заменяем выбросы на медианные значения\n",
"median_score = cleaned_df[\"manufacturingoutput\"].median()\n",
"cleaned_df.loc[outliers, \"manufacturingoutput\"] = median_score\n",
"\n",
"# Визуализация данных после обработки\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(cleaned_df['manufacturingoutput'], cleaned_df['gdppercent'])\n",
"plt.xlabel('Объем производства')\n",
"plt.ylabel('ВВП')\n",
"plt.title('Диаграмма рассеивания после чистки')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 119,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 131\n",
"Размер контрольной выборки: 44\n",
"Размер тестовой выборки: 44\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)\n",
"\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Видим недостаток баланса"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 120,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение ВВП в обучающей выборке:\n",
"gdppercent\n",
" 0.02 30\n",
" 0.04 25\n",
" 0.03 21\n",
" 0.01 13\n",
" 0.07 8\n",
" 0.08 8\n",
" 0.05 7\n",
"-0.01 5\n",
" 0.11 2\n",
" 0.09 2\n",
"-0.02 2\n",
" 0.10 2\n",
"-0.03 1\n",
" 0.14 1\n",
"-0.10 1\n",
" 0.06 1\n",
"-0.05 1\n",
"-0.04 1\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение ВВП в контрольной выборке:\n",
"gdppercent\n",
" 0.02 9\n",
" 0.03 7\n",
" 0.01 6\n",
" 0.07 4\n",
" 0.04 4\n",
" 0.05 4\n",
" 0.08 3\n",
" 0.06 3\n",
"-0.01 2\n",
" 0.10 1\n",
"-0.08 1\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение ВВП в тестовой выборке:\n",
"gdppercent\n",
" 0.02 12\n",
" 0.03 8\n",
" 0.01 7\n",
" 0.05 5\n",
" 0.04 3\n",
" 0.08 3\n",
"-0.01 2\n",
"-0.05 1\n",
" 0.06 1\n",
" 0.13 1\n",
" 0.07 1\n",
"Name: count, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name):\n",
" counts = df['gdppercent'].value_counts()\n",
" print(f\"Распределение ВВП в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"также используем oversampling и undersampling"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 121,
2024-10-18 19:14:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оверсэмплинг:\n",
"Распределение ВВП в обучающей выборке:\n",
"gdppercent\n",
2024-10-18 20:20:52 +04:00
" 0.04 36\n",
2024-10-18 19:14:48 +04:00
" 0.02 30\n",
" 0.03 21\n",
" 0.01 13\n",
2024-10-18 20:20:52 +04:00
" 0.07 10\n",
" 0.08 10\n",
2024-10-18 19:14:48 +04:00
" 0.05 7\n",
"-0.01 5\n",
2024-10-18 20:20:52 +04:00
" 0.09 4\n",
" 0.11 3\n",
" 0.14 2\n",
" 0.10 2\n",
2024-10-18 19:14:48 +04:00
"-0.02 2\n",
2024-10-18 20:20:52 +04:00
"-0.03 1\n",
"-0.10 1\n",
2024-10-18 19:14:48 +04:00
" 0.06 1\n",
2024-10-18 20:20:52 +04:00
"-0.05 1\n",
"-0.04 1\n",
2024-10-18 19:14:48 +04:00
"Name: count, dtype: int64\n",
"\n",
"Распределение ВВП в контрольной выборке:\n",
"gdppercent\n",
2024-10-18 20:20:52 +04:00
" 0.02 9\n",
" 0.07 7\n",
" 0.03 7\n",
" 0.01 6\n",
" 0.05 5\n",
" 0.06 5\n",
" 0.04 4\n",
" 0.08 3\n",
"-0.01 2\n",
" 0.10 1\n",
"-0.08 1\n",
2024-10-18 19:14:48 +04:00
"Name: count, dtype: int64\n",
"\n",
"Распределение ВВП в тестовой выборке:\n",
"gdppercent\n",
" 0.02 12\n",
" 0.03 8\n",
" 0.01 7\n",
" 0.05 5\n",
" 0.04 3\n",
2024-10-18 20:20:52 +04:00
" 0.08 3\n",
"-0.01 2\n",
"-0.05 1\n",
2024-10-18 19:14:48 +04:00
" 0.06 1\n",
2024-10-18 20:20:52 +04:00
" 0.13 1\n",
" 0.07 1\n",
2024-10-18 19:14:48 +04:00
"Name: count, dtype: int64\n",
"\n",
"Андерсэмплинг:\n",
"Распределение ВВП в обучающей выборке:\n",
"gdppercent\n",
2024-10-18 20:20:52 +04:00
" 0.02 25\n",
" 0.04 25\n",
" 0.03 17\n",
" 0.07 8\n",
" 0.08 8\n",
" 0.01 7\n",
" 0.05 7\n",
"-0.01 3\n",
" 0.11 2\n",
" 0.09 2\n",
" 0.10 2\n",
"-0.04 1\n",
"-0.10 1\n",
"-0.02 1\n",
"-0.03 1\n",
" 0.14 1\n",
" 0.06 1\n",
2024-10-18 19:14:48 +04:00
"Name: count, dtype: int64\n",
"\n",
"Распределение ВВП в контрольной выборке:\n",
"gdppercent\n",
2024-10-18 20:20:52 +04:00
" 0.02 7\n",
" 0.03 6\n",
" 0.01 4\n",
" 0.07 4\n",
" 0.04 4\n",
" 0.05 4\n",
" 0.08 3\n",
" 0.06 3\n",
2024-10-18 19:14:48 +04:00
"-0.08 1\n",
2024-10-18 20:20:52 +04:00
"-0.01 1\n",
" 0.10 1\n",
2024-10-18 19:14:48 +04:00
"Name: count, dtype: int64\n",
"\n",
"Распределение ВВП в тестовой выборке:\n",
"gdppercent\n",
2024-10-18 20:20:52 +04:00
" 0.02 12\n",
" 0.03 8\n",
" 0.01 7\n",
" 0.05 5\n",
" 0.08 3\n",
" 0.04 3\n",
"-0.01 2\n",
" 0.06 1\n",
"-0.05 1\n",
" 0.13 1\n",
" 0.07 1\n",
2024-10-18 19:14:48 +04:00
"Name: count, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"def binning(target, bins):\n",
2024-10-18 20:20:52 +04:00
" return pd.qcut(target, q=bins, labels=False)\n",
2024-10-18 19:14:48 +04:00
"\n",
2024-10-18 20:20:52 +04:00
"train_df['gdppercent_binned'] = binning(train_df['gdppercent'], bins=2)\n",
"val_df['gdppercent_binned'] = binning(val_df['gdppercent'], bins=2)\n",
"test_df['gdppercent_binned'] = binning(test_df['gdppercent'], bins=2)\n",
2024-10-18 19:14:48 +04:00
"\n",
"def oversample(df, target_column):\n",
" X = df.drop(target_column, axis=1)\n",
" y = df[target_column]\n",
" \n",
" oversampler = RandomOverSampler(random_state=42)\n",
" x_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
" \n",
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1) \n",
" return resampled_df\n",
"\n",
"def undersample(df, target_column):\n",
" X = df.drop(target_column, axis=1)\n",
" y = df[target_column]\n",
" \n",
" undersampler = RandomUnderSampler(random_state=42)\n",
" x_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n",
" \n",
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_oversampled = oversample(train_df, 'gdppercent_binned')\n",
"val_df_oversampled = oversample(val_df, 'gdppercent_binned')\n",
"test_df_oversampled = oversample(test_df, 'gdppercent_binned')\n",
"\n",
"train_df_undersampled = undersample(train_df, 'gdppercent_binned')\n",
"val_df_undersampled = undersample(val_df, 'gdppercent_binned')\n",
"test_df_undersampled = undersample(test_df, 'gdppercent_binned')\n",
"\n",
"print(\"Оверсэмплинг:\")\n",
2024-10-18 20:20:52 +04:00
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
"\n",
"print(\"Андерсэмплинг:\")\n",
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
"check_balance(test_df_undersampled, \"тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://www.kaggle.com/datasets/surajjha101/stores-area-and-sales-data - данный датасет содержит информацию о магазинах"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 122,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Store ID ', 'Store_Area', 'Items_Available', 'Daily_Customer_Count',\n",
" 'Store_Sales'],\n",
" dtype='object')\n"
]
}
],
"source": [
"df_stores = pd.read_csv(\".//static//scv//Stores.csv\")\n",
"print(df_stores.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы на русском:\n",
"Store ID - Айди магазина\n",
"Store_Area - Доступное пространство магазина\n",
"Items_Available - Доступные предметы\n",
"Daily_Customer_Count - Ежедневное количество клиентов\n",
"Stores_Sales - Продажи в магазинах"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 123,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 896 entries, 0 to 895\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 Store ID 896 non-null int64\n",
" 1 Store_Area 896 non-null int64\n",
" 2 Items_Available 896 non-null int64\n",
" 3 Daily_Customer_Count 896 non-null int64\n",
" 4 Store_Sales 896 non-null int64\n",
"dtypes: int64(5)\n",
"memory usage: 35.1 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store ID</th>\n",
" <th>Store_Area</th>\n",
" <th>Items_Available</th>\n",
" <th>Daily_Customer_Count</th>\n",
" <th>Store_Sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1659</td>\n",
" <td>1961</td>\n",
" <td>530</td>\n",
" <td>66490</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1461</td>\n",
" <td>1752</td>\n",
" <td>210</td>\n",
" <td>39820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1340</td>\n",
" <td>1609</td>\n",
" <td>720</td>\n",
" <td>54010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1451</td>\n",
" <td>1748</td>\n",
" <td>620</td>\n",
" <td>53730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>1770</td>\n",
" <td>2111</td>\n",
" <td>450</td>\n",
" <td>46620</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales\n",
"0 1 1659 1961 530 66490\n",
"1 2 1461 1752 210 39820\n",
"2 3 1340 1609 720 54010\n",
"3 4 1451 1748 620 53730\n",
"4 5 1770 2111 450 46620"
]
},
2024-10-18 21:22:08 +04:00
"execution_count": 123,
2024-10-18 20:20:52 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_stores.info()\n",
"df_stores.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-10-18 21:22:08 +04:00
"Объект наблюдения - Розничная торговля\n",
2024-10-18 20:20:52 +04:00
"Атрибуты - содержит набор информации о магазин, такие как:\n",
"Айди, размер, количество доступных предметов, количество покупателей и выручка"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 124,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA20AAAIjCAYAAACQ1/NiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOz9d5xdZ3Xo/392Ob3Omd6bRr0XS3I37qbZgA2mYy6EJBByyY/cH3kRAiT3ciEkN7SEjmkmQGIM2MZIuBdZXVav03s5M6fXvff3j5FGHs/I6nNG0nrzAl5z9j5nr/Noyl7neZ61FMuyLIQQQgghhBBCzEpqoQMQQgghhBBCCHFqkrQJIYQQQgghxCwmSZsQQgghhBBCzGKStAkhhBBCCCHELCZJmxBCCCGEEELMYpK0CSGEEEIIIcQsJkmbEEIIIYQQQsxikrQJIYQQQgghxCwmSZsQQgghhBBCzGKStAkhhBBCCCHELCZJmxBCiFnrv/7rv1AUZdr/Ll68uNDhXbHi8Tj/8A//wB133EEoFEJRFB588MFChyWEEJctvdABCCGEEKfzd3/3dyxYsGDi6//9v/93AaMRw8PDfPGLX6Suro5ly5bxzDPPFDokIYS4rEnSJoQQYta79dZbufHGGye+/v73v8/w8HDhArrCVVZW0tfXR0VFBdu2bWPNmjWFDkkIIS5rsjxSCCHErJXNZgFQ1dP/uXrwwQdRFIX29vaJx0zTZOnSpVOW7+3evZsPfvCDNDU14XQ6qaio4IEHHmBkZGTSa37+85+fdmmmrp/8zPPGG29k8eLFbN++nauvvhqXy0VjYyPf/va3p7yXz33uc6xatYpAIIDH4+G6667j6aefnnRee3v7xHUeeeSRScfS6TRFRUUoisJXv/rVKXGWlZWRy+UmPecXv/jFxOu9OtH97W9/yxvf+EaqqqpwOBw0Nzfzj//4jxiGcdqxdjgcVFRUnPY8IYQQF4bMtAkhhJi1TiRtDofjnJ7/05/+lD179kx5fOPGjbS2tvKhD32IiooK9u3bx3e/+1327dvHyy+/jKIok87/j//4D7xe78TXr00iR0dHueuuu7jvvvu4//77+dWvfsWf//mfY7fbeeCBBwCIRqN8//vf5/777+cjH/kIsViMH/zgB9x+++1s2bKF5cuXT3pNp9PJj370I+6+++6Jxx5++GHS6fQp328sFuPRRx/lnnvumXjsRz/6EU6nc8rzHnzwQbxeL5/61Kfwer089dRTfO5znyMajfLP//zPp7yGEEKImSdJmxBCiFkrEokA4HK5zvq5mUyGz33uc9x555384Q9/mHTsL/7iL/ibv/mbSY+tW7eO+++/nxdeeIHrrrtu0rF3vOMdlJSUnPJavb29/Mu//Auf+tSnAPizP/sz1q5dy2c+8xne9773YbPZKCoqor29HbvdPvG8j3zkI8yfP59vfOMb/OAHP5j0mvfccw+//vWvGRgYoLy8HIAf/vCHvO1tb+Ohhx6aNo577rmHH/7whxNJW2dnJ08++STvfOc7+cUvfjHp3IceemjSuH7sYx/jYx/7GP/+7//OP/3TP51zoiyEEOLCk+WRQgghZq0TyxVLS0vP+rnf+ta3GBkZ4R/+4R+mHHt1spJOpxkeHmbdunUA7Nix46yvpes6f/Znfzbxtd1u58/+7M8YHBxk+/btAGiaNpGwmaZJOBwmn8+zevXqaa+5cuVKFi1axE9/+lMAOjo6ePrpp/ngBz94yjgeeOABnnjiCfr7+wH48Y9/zPr165k7d+6Uc189BrFYjOHhYa677jqSySQHDx486zEQQghx8UjSJoQQYtbq6OhA1/WzTtoikQj/5//8Hz71qU9NzFK9Wjgc5pOf/CTl5eW4XC5KS0tpbGyceO7ZqqqqwuPxTHrsRKL06j12P/7xj1m6dClOp5Pi4mJKS0t57LHHTnnND33oQ/zoRz8CxpczXn311bS0tJwyjuXLl7N48W
J+8pOfYFkWDz74IB/60IemPXffvn3cc889BAIB/H4/paWlvPe97wXObQyEEEJcPJK0CSGEmLUOHTpEU1PTpMIfZ+LLX/4yqqry6U9/etrj9913H9/73vf42Mc+xsMPP8yGDRt44okngPFZsIvhZz/7GR/84Adpbm7mBz/4AU888QQbN27kDW94wymv+d73vpejR4/y8ssv8+Mf//iUCdirPfDAA/zoRz/i2Wefpb+/n/vuu2/KOWNjY9xwww288sorfPGLX+T3v/89Gzdu5Mtf/jJw8cZACCHEuZE9bUIIIWalTCbDrl27JhXiOBO9vb187Wtf40tf+hI+n29KRcjR0VGefPJJvvCFL/C5z31u4vEjR46cc6y9vb0kEolJs22HDx8GoKGhARhvFN7U1MTDDz88qdDJdMs3TyguLuYtb3nLxFLL++6777StDt7znvfw6U9/mk9+8pO84x3vwOfzTTnnmWeeYWRkhIcffpjrr79+4vG2trYzer9CCCFmlsy0CSGEmJUeeughMpkMN99881k97wtf+ALl5eV87GMfm/a4pmkAWJY16fF/+7d/O6c4AfL5PN/5zncmvs5ms3znO9+htLSUVatWnfK6mzdvZtOmTa/72g888AC7d+/m3nvvnVTB8lRCoRBvfetb2b1790TlyteaLpZsNsu///u/n/b1hRBCzDyZaRNCCDGrJBIJvvGNb/DFL34RTdOwLIuf/exnk84ZGBggHo/zs5/9jFtvvXXSvrUNGzbw85//fFKVxlfz+/1cf/31fOUrXyGXy1FdXc2GDRvOa5apqqqKL3/5y7S3tzN37lx++ctfsmvXLr773e9is9kAeNOb3sTDDz/MPffcwxvf+Eba2tr49re/zcKFC4nH46d87TvuuIOhoaEzSthOePDBB/nWt751yoqXV199NUVFRXzgAx/gr/7qr1AUhZ/+9KdTEtnX881vfpOxsTF6e3sB+P3vf093dzcAn/jEJwgEAmf8WkIIIV6fJG1CCCFmlaGhIT7zmc9MfP3qqoyv9b73vY+nn356UtK2fPly7r///te9xkMPPcQnPvEJvvWtb2FZFrfddht/+MMfqKqqOqeYi4qK+PGPf8wnPvEJvve971FeXs43v/lNPvKRj0yc88EPfpD+/n6+853v8Mc//pGFCxfys5/9jF//+tc888wzp3xtRVFet93AdFwu1+u2SSguLubRRx/lb/7mb/jsZz9LUVER733ve7n55pu5/fbbz+gaX/3qV+no6Jj4+uGHH+bhhx8GxvfiSdImhBAXjmKdzcdqQgghxEXW3t5OY2MjTz/9NDfeeON5n3ex3XjjjQwPD7N3796CxSCEEOLyJnvahBBCCCGEEGIWk6RNCCHErOL1ennPe94zbX+1czlPCCGEuNTJ8kghhBDiPMjySCGEEBebJG1CCCGEEEIIMYvJ8kghhBBCCCGEmMUkaRNCCCGEEEKIWUz6tM0g0zTp7e3F5/OhKEqhwxFCCCGEEEIUiGVZxGIxqqqqUNXXn0uTpG0G9fb2UltbW+gwhBBCCCGEELNEV1cXNTU1r3uOJG0zyOfzAeP/MH6/f0avncvl2LBhA7fddhs2m21Gr30lkXGeOTLWM0fGeubIWM8cGeuZI2M9c2SsZ86FGOtoNEptbe1EjvB6Cpq0Pffcc/zzP/8z27dvp6+vj9/85jfcfffdwPhAfPazn+Xxxx+ntbWVQCDALbfcwv/9v/+XqqqqidcIh8N84hOf4Pe//z2qqvL2t7+dr33ta3i93olzdu/ezV/+5V+ydetWSktL+cQnPsHf/u3fTorl17/+NX//939Pe3s7LS0tfPnLX+auu+6aOG5ZFv/wD//A9773PcbGxrjmmmv4j//4D1paWs74/Z5YEun3+wuStLndbvx+v/wQX0QyzjNHxnrmyFjPHBnrmSNjPXNkrGeOjPXMuZBjfSbbpgpaiCSRSLBs2TK+9a1vTTmWTCbZsWMHf//3f8+OHTt4+O
GHOXToEG95y1smnfee97yHffv2sXHjRh599FGee+45PvrRj04cj0aj3HbbbdTX17N9+3b++Z//mc9//vN897vfnTj
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"\n",
"plt.scatter(df_stores['Daily_Customer_Count'], df_stores['Store_Sales'], c=df_stores['Daily_Customer_Count'], alpha=0.6)\n",
"\n",
"plt.title(\"Диаграмма 1\")\n",
"plt.ylabel(\"Доход магазина\")\n",
"plt.xlabel(\"Количество посетителей\")\n",
"plt.grid(visible='true')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Присутствует связь между атрибутами, прибыль магазина зависит от количества товаров и количества покупателей. Для примера на графике приведена связь между прибылью и количеством покупателей. \n",
"Примеры бизнес целей\n",
"\n",
" 1.Прогнозирование уровня продаж на основе количества покупателей.\n",
" 2.Улучшение маркетинговых стратегий.\n",
" \n",
"Эффект для бизнеса: увеличение объема продаж и снижение затрат на хранение непродаваемых товаров, увеличение клиентской базы и повышение лояльности клиентов.\n",
"Цели технического проекта\n",
"\n",
"Для первой цели:\n",
"\n",
"Вход: площадь магазина\n",
"Целевой признак: сумма продаж.\n",
"\n",
"Для второй цели:\n",
"\n",
"Вход: количество клиентов за день\n",
"Целевой признак: сумма продаж"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка на выбросы"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 125,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пропущенные значения по столбцам:\n",
"Store ID 0\n",
"Store_Area 0\n",
"Items_Available 0\n",
"Daily_Customer_Count 0\n",
"Store_Sales 0\n",
"dtype: int64\n",
"\n",
"Статистический обзор данных:\n",
" Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
"count 896.000000 896.000000 896.000000 896.000000 \n",
"mean 448.500000 1485.409598 1782.035714 786.350446 \n",
"std 258.797218 250.237011 299.872053 265.389281 \n",
"min 1.000000 775.000000 932.000000 10.000000 \n",
"25% 224.750000 1316.750000 1575.500000 600.000000 \n",
"50% 448.500000 1477.000000 1773.500000 780.000000 \n",
"75% 672.250000 1653.500000 1982.750000 970.000000 \n",
"max 896.000000 2229.000000 2667.000000 1560.000000 \n",
"\n",
" Store_Sales \n",
"count 896.000000 \n",
"mean 59351.305804 \n",
"std 17190.741895 \n",
"min 14920.000000 \n",
"25% 46530.000000 \n",
"50% 58605.000000 \n",
"75% 71872.500000 \n",
"max 116320.000000 \n"
]
}
],
"source": [
"null_values = df_stores.isnull().sum()\n",
"print(\"Пропущенные значения по столбцам:\")\n",
"print(null_values)\n",
"\n",
"stat_summary = df_stores.describe()\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"print(stat_summary)\n"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 126,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Коэффициент асимметрии для столбца 'Store ID ': 0.0\n",
"\n",
"Коэффициент асимметрии для столбца 'Store_Area': 0.030366862979232103\n",
"\n",
"Коэффициент асимметрии для столбца 'Items_Available': 0.03443939172681748\n",
"\n",
"Коэффициент асимметрии для столбца 'Daily_Customer_Count': 0.07463278790524851\n",
"\n",
"Коэффициент асимметрии для столбца 'Store_Sales': 0.14879364291055253\n",
"\n",
"Количество дубликатов: 0\n"
]
}
],
"source": [
"for column in df_stores.select_dtypes(include=[np.number]).columns:\n",
" skewness = df_stores[column].skew()\n",
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")\n",
"\n",
"duplicates = df_stores.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"На основе данных выше можно сказать, что пустых значений и дубликатов нет, выбросы минимальны\n",
"Очистиим данные от шумов"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 127,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA20AAAIjCAYAAACQ1/NiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeXQUVfbHv90hO2QjkA4ISdgJQSKoENkUgyAoyzg6ICoiAy7gIG6IgoDIIDoKODggqKAguIwLq4wg+EMwgAIBQkAgJqiYoCQhwQBJSNfvj1hNd6eW96qruqs793MO55Du11WvXr16de+7m0UQBAEEQRAEQRAEQRCEKbH6ugMEQRAEQRAEQRCEPKS0EQRBEARBEARBmBhS2giCIAiCIAiCIEwMKW0EQRAEQRAEQRAmhpQ2giAIgiAIgiAIE0NKG0EQBEEQBEEQhIkhpY0gCIIgCIIgCMLEkNJGEARBEARBEARhYkhpIwiCIAiCIAiCMDGktBEEQRAEQRAAgF9++QUrVqxw/F1QUID333/fdx0iCAIAKW0EQQQQ//3vf2GxWCT/paWl+bp7BEEQpsdisWDChAn43//+h4KCAjz99NP45ptvfN0tgqj3NPB1BwiCIPTm2WefRceOHR1/z5kzx4e9IQiC8B+aN2+OcePGYeDAgQCAxMREfP31177tFEEQsAiCIPi6EwRBEHrw3//+F3feeSe2b9+OG2+80fH5jTfeiLNnzyInJ8d3nSMIgvAj8vLycPbsWaSlpSEyMtLX3SGIeg+5RxIEETBUVVUBAKxW9aVtxYoVsFgsKCgocHxmt9tx9dVXw2KxuMR0HDp0CPfffz9atWqFsLAw2Gw2PPDAAyguLnY55syZMyVdMxs0uOLUcOONNyItLQ379u3DDTfcgPDwcKSkpGDJkiV1ruX5559Ht27dEB0djcjISPTu3Rvbt293aVdQUOA4z+eff+7y3aVLlxAbGwuLxYJ//etfdfrZtGlTVFdXu/xmzZo1juOdPXvW8fnatWsxePBgNGvWDKGhoWjdujVmz56Nmpoa1bEWz3fs2DHcddddiIqKQuPGjTFp0iRcunTJpe3y5cvRr18/NG3aFKGhoUhNTcXixYslj/vFF1+gb9++aNSoEaKionDddddh9erVLm327NmDQYMGITY2FpGRkbj66quxcOFClzbHjh3DX//6V8TFxSEsLAzXXnst1q1b59KGZ77cf//9Lvc/NjYWN954Yx0XM9YxFeeMO//617/q9Ck5ORn333+/S7uPP/4YFosFycnJLp//9ttvGDt2LFq2bImgoCBHfxs2bFjnXO4kJyfLuiJbLJY67VetWoVu3bohPDwccXFxGDFiBH7++WfJ61R7NgCgsrISM2bMQJs2bRAaGooWLVrg6aefRmVlZZ22X3/9NXM/3RHnrtT1O48zz/wA4HgWmjRpgvDwcLRv3x7PPfecyzmV/omWrxtvvNFlgwqo9SywWq11noWPP/7YcQ/i4+Nxzz334PTp0y5t7r//fsc8ad26Nbp3746SkhKEh4fXuT6CILwLuUcSBBEwiEpbaGiopt+vXLkShw8frvP5li1b8OOPP2LMmDGw2Ww4cuQIli5diiNHjmD37t11hLrFixe7CL7uSmRpaSkGDRqEu+66CyNHjsRHH32Ehx9+GCEhIXjggQcAAOXl5XjrrbcwcuRIjBs3DufPn8fbb7+NAQMGYO/evUhPT3c5ZlhYGJYvX45hw4Y5Pvv000/rKEXOnD9/Hhs2bMDw4cMdny1fvhxhYWF1frdixQo0bNgQjz/+OBo2bIht27bh+eefR3l5OV555RXZczhz1113ITk5GXPnzsXu3bvx+uuvo7S0FO+9957L2HXq1AlDhgxBgwYNsH79ejzyyCOw2+2YMGGCS38eeOABdOrUCVOnTkVMTAwOHDiAzZs34+677wZQe99uu+02JCYmYtKkSbDZbDh69Cg2bNiASZMmAQCOHD
mCnj17onnz5njmmWcQGRmJjz76CMOGDcMnn3ziMjbuyM0XAIiPj8f8+fMB1CZ2WLhwIQYNGoSff/4ZMTExuo2pGpcvX3YoA+6MHj0aW7duxaOPPoouXbogKCgIS5cuxf79+5mOnZ6ejieeeMLls/feew9btmxx+WzOnDmYPn067rrrLvz973/H77//jn//+9/o06cPDhw44BgPgO3ZsNvtGDJkCHbu3Inx48ejY8eOOHz4MObPn4/jx4/X2bwQ+cc//oHrrrtOtp96Izc/Dh06hN69eyM4OBjjx49HcnIy8vLysH79esyZMwd/+ctf0KZNG0f7yZMno2PHjhg/frzjM2f3b2eWL1+OadOm4dVXX3U8B0DtXBszZgyuu+46zJ07F2fOnMHChQuxa9euOvfAneeff15xHSEIwksIBEEQAcKCBQsEAMLBgwddPu/bt6/QqVMnl8+WL18uABDy8/MFQRCES5cuCS1bthRuvfVWAYCwfPlyR9sLFy7UOdeaNWsEAMKOHTscn82YMUMAIPz++++yfezbt68AQHj11Vcdn1VWVgrp6elC06ZNhaqqKkEQBOHy5ctCZWWly29LS0uFhIQE4YEHHnB8lp+fLwAQRo4cKTRo0EAoKipyfHfzzTcLd999twBAeOWVV+r0c+TIkcJtt93m+PzUqVOC1WoVRo4cWec6pMbgwQcfFCIiIoRLly7JXq/z+YYMGeLy+SOPPFLnfkmdZ8CAAUKrVq0cf587d05o1KiR0L17d+HixYsube12uyAIteOXkpIiJCUlCaWlpZJtBKF2jDp37uxyDXa7XbjhhhuEtm3bOj7jmS+jR48WkpKSXM65dOlSAYCwd+9exWuVGlOp+SsIgvDKK6+49EkQBCEpKUkYPXq04+///Oc/QmhoqHDTTTe59OnixYuC1WoVHnzwQZdjjh49WoiMjKxzLneSkpKEwYMH1/l8woQJgrNoUVBQIAQFBQlz5sxxaXf48GGhQYMGLp+zPhsrV64UrFar8M0337gcc8mSJQIAYdeuXS6ff/nllwIA4b///a9sP+WYNWuWAMBlzojX7zzOPPOjT58+QqNGjYRTp065HNP9HHLncqZv375C3759BUEQhI0bNwoNGjQQnnjiCZc2VVVVQtOmTYW0tDSX52XDhg0CAOH55593fOY+d3NycgSr1eq4Due5RhCEdyH3SIIgAgbRXbFJkybcv33jjTdQXFyMGTNm1PkuPDzc8f9Lly7h7Nmz6NGjBwAwWyWcadCgAR588EHH3yEhIXjwwQfx22+/Yd++fQCAoKAghISEAKi1LJSUlODy5cu49tprJc/ZtWtXdOrUCStXrgQAnDp1Ctu3b6/jKufMAw88gM2bN6OoqAgA8O677yIjIwPt2rWr09Z5DM6fP4+zZ8+id+/euHDhAo4dO8Z03c6WMgB49NFHAQCbNm2SPE9ZWRnOnj2Lvn374scff0RZWRmAWgva+fPn8cwzzyAsLMzlmKLV88CBA8jPz8djjz1Wx4ogtikpKcG2bdtw1113Oa7p7NmzKC4uxoABA3DixIk67mMiSvMFqL1n4vGys7Px3nvvITEx0cVCwjOmNTU1juOJ/y5cuCB5bpELFy7ghRdewMSJE9GyZUuX7yoqKmC329G4cWPFY3jKp59+Crvdjrvuusul7zabDW3btq3j7svybHz88cfo2LEjOnTo4HLMfv36AUCdY4pWIve5wkLTpk0B1FpLeZCbH7///jt27NiBBx54oM49YXHXlGPv3r246667cMcdd9Sx0n7//ff47bff8Mgjj7iMweDBg9GhQwds3LhR9rhTp05F165dceedd2ruG0EQ+kDukQRBBAynTp1CgwYNuJW2srIy/POf/8Tjjz+OhISEOt+XlJRg1qxZ+OCDD/Dbb7/V+S0vzZo1qxPYLypKBQUFDoXw3Xffxauvvopjx465xJ6lpKRIHnfMmDFYunQpnnzySaxYsQI33HAD2rZtK9uP9PR0pK
Wl4b333sNTTz2FFStW4Nlnn60TawTUuhFOmzYN27ZtQ3l5uct3rGPg3pfWrVvDarW6xMns2rULM2bMQFZWVh2lpKy
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы в датасете:\n",
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales\n",
"39 40 1270 1516 10 45480\n",
"349 350 1886 2228 1530 40350\n",
"848 849 919 1099 1560 73810\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2wAAAIjCAYAAAB/FZhcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAD6pElEQVR4nOydeXQUVfbHvx3IDtkIZFFIwqKySQQXIpswQRAQcf2BG6MMbuAgboiCoqgIOiKKgqCjCIKOMy6ImBEEh8WACAYMAYUQokICZiGBhCyk6/dHqKaXWt6rpbu6+37O4RzS/brqVdWr9+59d7MJgiCAIAiCIAiCIAiCsBwhvu4AQRAEQRAEQRAEIQ0pbARBEARBEARBEBaFFDaCIAiCIAiCIAiLQgobQRAEQRAEQRCERSGFjSAIgiAIgiAIwqKQwkYQBEEQBEEQBGFRSGEjCIIgCIIgCIKwKKSwEQRBEARBEARBWBRS2AiCIAiCIAiCICwKKWwEQRAEQRAEQRAWhRQ2giACnn//+9+w2WyS/3r06OHr7hEEQRAEQcjS0tcdIAiC8BZPPvkkunbt6vj7hRde8GFvCIIgCIIg1CGFjSCIoGHo0KG46qqrHH+/8847KCsr812HCIIgCIIgVCCXSIIgAp6GhgYAQEiI+pT3/vvvw2az4fDhw47P7HY7Lr74YthsNrz//vuOz/fs2YO//vWv6NixIyIiIpCcnIy7774b5eXlLsecNWuWpDtmy5bn9syuuuoq9OjRAzt37sSVV16JyMhIZGRkYPHixR7X8vTTT6NPnz6IjY1FdHQ0BgwYgI0bN7q0O3z4sOM8n3/+uct3dXV1iI+Ph81mwyuvvOLRz3bt2qGxsdHlN6tWrXIcz1nJ/eKLLzBy5EikpqYiPDwcnTp1wuzZs9HU1KR6r8Xz7d+/H7fccgtiYmLQpk0bTJkyBXV1dS5t33vvPQwZMgTt2rVDeHg4unXrhkWLFkke9+uvv8agQYPQunVrxMTE4LLLLsPKlStd2mzfvh0jRoxAfHw8oqOjcfHFF2PBggUubfbv34+bbroJCQkJiIiIwKWXXorVq1e7tOEZL3/9619dnn98fDyuuuoqbN682eWYrPdUHDPuvPLKKx59Sk9Px1//+leXdp988glsNhvS09NdPj9+/DgmTJiADh06oEWLFo7+tmrVyuNc7qSnp8u6H9tsNpe2Z86cwezZs9GpUyeEh4cjPT0dTz75JOrr6z2Oy/JMnce80nntdjtee+01dO/eHREREUhKSsK9996LyspKputzv4/fffcdbDYbvvvuO8dnV111lcvmEADs2LFDsj8AsGLFClx++eWIiopCfHw8Bg4ciG+++cZxTqV7Kj4/8fqdx9zJkyfRp08fZGRkoKSkRLYdAEyaNAk2m83j+giC8D1kYSMIIuARFbbw8HBNv1++fDl+/vlnj8/XrVuHQ4cO4a677kJycjL27t2LJUuWYO/evdi2bZuHYLZo0SIXodddgaysrMSIESNwyy23YNy4cfjXv/6F+++/H2FhYbj77rsBANXV1XjnnXcwbtw4TJw4ESdPnsS7776LYcOG4YcffkBmZqbLMSMiIvDee+9hzJgxjs8+/fRTD4XImZMnT2LNmjW4/vrrHZ+99957iIiI8Pjd+++/j1atWuHhhx9Gq1atsGHDBjz99NOorq7Gyy+/LHsOZ2655Rakp6djzpw52LZtG15//XVUVlbigw8+cLl33bt3x+jRo9GyZUt8+eWXeOCBB2C32zFp0iSX/tx9993o3r07pk+fjri4OPz000/IycnBrbfeCqD5uY0aNQopKSmYMmUKkpOTsW/fPqxZswZTpkwBAOzduxf9+vXDeeedhyeeeALR0dH417/+hTFjxuA///mPy71xR268AEBiYiLmz58PAPjjjz+wYMECjBgxAr///jvi4uIMu6dqnDlzBk899ZTkd+PHj8f69evx4IMPolevXmjRogWWLF
mCXbt2MR07MzMTjzzyiMtnH3zwAdatW+fy2d/+9jcsW7YMN910Ex555BFs374dc+bMwb59+/DZZ5852rE8U2fuueceDBgwAEDzWHc+FgDce++9eP/993HXXXfh73//O4qKirBw4UL89NNP2Lp1K0JDQ5muk5dp06ZJfv7ss89i1qxZuPLKK/Hcc88hLCwM27dvx4YNG3D11Vfjtddew6lTpwAA+/btw4svvuji3i2nSDc2NuLGG2/Eb7/9hq1btyIlJUW2bwcPHsTSpUt1XiFBEKYhEARBBDivvfaaAEDYvXu3y+eDBg0Sunfv7vLZe++9JwAQioqKBEEQhLq6OqFDhw7CNddcIwAQ3nvvPUfb2tpaj3OtWrVKACBs2rTJ8dkzzzwjABD+/PNP2T4OGjRIACD84x//cHxWX18vZGZmCu3atRMaGhoEQRCEM2fOCPX19S6/raysFJKSkoS7777b8VlRUZEAQBg3bpzQsmVLobS01PHdX/7yF+HWW28VAAgvv/yyRz/HjRsnjBo1yvF5cXGxEBISIowbN87jOqTuwb333itERUUJdXV1stfrfL7Ro0e7fP7AAw94PC+p8wwbNkzo2LGj4+8TJ04IrVu3Fq644grh9OnTLm3tdrsgCM33LyMjQ0hLSxMqKysl2whC8z3q2bOnyzXY7XbhyiuvFLp06eL4jGe8jB8/XkhLS3M555IlSwQAwg8//KB4rVL3VGr8CoIgvPzyyy59EgRBSEtLE8aPH+/4+6233hLCw8OFwYMHu/Tp9OnTQkhIiHDvvfe6HHP8+PFCdHS0x7ncSUtLE0aOHOnx+aRJkwRnkSMvL08AIPztb39zaffoo48KAIQNGzYIgsD2TEUOHDggABCWLVvm+EwcYyKbN28WAAgffvihy29zcnIkP3cnIyNDuPPOO10+27hxowBA2Lhxo+OzQYMGCYMGDXL8vXbtWgGAMHz4cJf+HDhwQAgJCRGuv/56oampSfH65M4lIr7z7733nmC324XbbrtNiIqKErZv3y7bTuSWW24RevToIbRv395lnBAEYQ3IJZIgiIBHdFFs27Yt92/ffPNNlJeX45lnnvH4LjIy0vH/uro6lJWVoW/fvgDAbI1wpmXLlrj33nsdf4eFheHee+/F8ePHsXPnTgBAixYtEBYWBqDZtauiogJnzpzBpZdeKnnO3r17o3v37li+fDkAoLi4GBs3blR0e7r77ruRk5OD0tJSAMCyZcuQlZWFCy64wKOt8z04efIkysrKMGDAANTW1mL//v1M1+1sIQOABx98EACwdu1ayfNUVVWhrKwMgwYNwqFDh1BVVQWg2XJ28uRJPPHEE4iIiHA5pmjt/Omnn1BUVISHHnrIYdFyb1NRUYENGzbglltucVxTWVkZysvLMWzYMBw4cABHjhyRvBal8QI0PzPxeHl5efjggw+QkpLikgyH5542NTU5jif+q62tlTy3SG1tLZ577jlMnjwZHTp0cPmupqYGdrsdbdq0UTyGXsRn+/DDD7t8LlrmvvrqKwBsz1SExZL+ySefIDY2FkOHDnW5Z3369EGrVq08XIvdadeuHf744w+GKzyHIAiYPn06brzxRlxxxRUu333++eew2+14+umnPSzuUq6TrDz22GP48MMP8a9//QuXX365YtudO3fik08+wZw5c5jcxgmC8D70ZhIEEfAUFxejZcuW3ApbVVUVXnzxRTz88MNISkry+L6iogJTpkxBUlISIiMj0bZtW2RkZDh+y0tqaiqio6NdPhOVJOd4pGXLluHiiy9GREQE2rRpg7Zt2+Krr76SPeddd92F9957D0Cze9mVV16JLl26yPYjMzMTPXr0wAcffABBEBzuY1Ls3bsX119/PWJjYxETE4O2bdvi9ttvB8B+D9z70qlTJ4SEhLhc89atW5GdnY3o6GjExcWhbdu2ePLJJ13OU1hYCACKpRpY2hw8eBCCIGDmzJlo27atyz9RETt+/LjH79TGCwD8/v
vvjmNdcsklKCwsxH/+8x8Xtzaee7p//37ZPsrx6quvoq6uznH/nGnTpg26dOmCd955B9988w2OHz+OsrIyybgyPRQ
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Scatter plot of the raw data, before any outlier handling.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(x=df_stores['Daily_Customer_Count'], y=df_stores['Store_Sales'])\n",
"plt.title('Диаграмма рассеивания перед чисткой')\n",
"plt.xlabel('Количество клиентов за день')\n",
"plt.ylabel('Сумма прибыли')\n",
"plt.show()\n",
"\n",
"# IQR rule: values further than 1.5 * IQR from the quartiles are treated as outliers.\n",
"Q1, Q3 = (df_stores['Daily_Customer_Count'].quantile(q) for q in (0.25, 0.75))\n",
"IQR = Q3 - Q1\n",
"threshold = 1.5 * IQR\n",
"lower_bound, upper_bound = Q1 - threshold, Q3 + threshold\n",
"\n",
"too_few_customers = df_stores['Daily_Customer_Count'] < lower_bound\n",
"too_many_customers = df_stores['Daily_Customer_Count'] > upper_bound\n",
"outliers = too_few_customers | too_many_customers\n",
"\n",
"# Show the rows flagged as outliers.\n",
"print('Выбросы в датасете:', df_stores[outliers], sep='\\n')\n",
"\n",
"# Replace the flagged values with the column median.\n",
"# NOTE(review): this mutates df_stores in place, so re-running the cell\n",
"# recomputes the bounds on already-modified data.\n",
"median_score = df_stores['Daily_Customer_Count'].median()\n",
"df_stores.loc[outliers, 'Daily_Customer_Count'] = median_score\n",
"\n",
"# Same scatter plot after the replacement.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(x=df_stores['Daily_Customer_Count'], y=df_stores['Store_Sales'])\n",
"plt.title('Диаграмма рассеивания после чистки')\n",
"plt.xlabel('количество клиентов за день')\n",
"plt.ylabel('сумма прибыли')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 128,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 537\n",
"Размер контрольной выборки: 179\n",
"Размер тестовой выборки: 180\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the rows as the test set.\n",
"train_val_df, test_df = train_test_split(df_stores, test_size=0.2, random_state=42)\n",
"\n",
"# A quarter of the remaining rows becomes the validation set.\n",
"train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)\n",
"\n",
"for split_title, split_df in (\n",
"    ('Размер обучающей выборки:', train_df),\n",
"    ('Размер контрольной выборки:', val_df),\n",
"    ('Размер тестовой выборки:', test_df),\n",
"):\n",
"    print(split_title, len(split_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Видим недостаток баланса"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 129,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение суммы продаж в обучающей выборке:\n",
"Store_Sales\n",
"63540 3\n",
"69940 2\n",
"66050 2\n",
"41490 2\n",
"85670 2\n",
" ..\n",
"43720 1\n",
"17670 1\n",
"66490 1\n",
"58670 1\n",
"62470 1\n",
"Name: count, Length: 508, dtype: int64\n",
"\n",
"Распределение суммы продаж в контрольной выборке:\n",
"Store_Sales\n",
"54590 3\n",
"60060 2\n",
"74080 2\n",
"62380 2\n",
"39140 1\n",
" ..\n",
"67640 1\n",
"72350 1\n",
"21750 1\n",
"55170 1\n",
"69130 1\n",
"Name: count, Length: 174, dtype: int64\n",
"\n",
"Распределение суммы продаж в тестовой выборке:\n",
"Store_Sales\n",
"66000 2\n",
"59190 2\n",
"51480 2\n",
"76300 1\n",
"44890 1\n",
" ..\n",
"65970 1\n",
"70050 1\n",
"90180 1\n",
"71280 1\n",
"81470 1\n",
"Name: count, Length: 177, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df_stores, name):\n",
"    \"\"\"Print how many rows share each distinct `Store_Sales` value.\n",
"\n",
"    Args:\n",
"        df_stores: DataFrame that has a `Store_Sales` column.\n",
"        name: Label of the split, interpolated into the printed heading.\n",
"    \"\"\"\n",
"    sales_frequencies = df_stores['Store_Sales'].value_counts()\n",
"    # One print call: heading, the counts, then an empty line as separator.\n",
"    print(f\"Распределение суммы продаж в {name}:\", sales_frequencies, '', sep='\\n')\n",
"\n",
"# Report the target distribution of every split.\n",
"for split_df, split_label in (\n",
"    (train_df, 'обучающей выборке'),\n",
"    (val_df, 'контрольной выборке'),\n",
"    (test_df, 'тестовой выборке'),\n",
"):\n",
"    check_balance(split_df, split_label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Применим андерсемплинг и оверсемплинг"
]
},
{
"cell_type": "code",
2024-10-18 21:22:08 +04:00
"execution_count": 130,
2024-10-18 20:20:52 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оверсэмплинг:\n",
"Распределение суммы продаж в обучающей выборке:\n",
"Store_Sales\n",
"44660 3\n",
"70810 3\n",
"62470 3\n",
"72320 3\n",
"66050 3\n",
" ..\n",
"73710 3\n",
"67290 3\n",
"36290 3\n",
"49330 3\n",
"33010 3\n",
"Name: count, Length: 508, dtype: int64\n",
"\n",
"Распределение суммы продаж в контрольной выборке:\n",
"Store_Sales\n",
"41080 3\n",
"39140 3\n",
"54590 3\n",
"39820 3\n",
"54820 3\n",
" ..\n",
"67640 3\n",
"72350 3\n",
"21750 3\n",
"55170 3\n",
"69130 3\n",
"Name: count, Length: 174, dtype: int64\n",
"\n",
"Распределение суммы продаж в тестовой выборке:\n",
"Store_Sales\n",
"61080 2\n",
"76300 2\n",
"44890 2\n",
"36280 2\n",
"45480 2\n",
" ..\n",
"65970 2\n",
"70050 2\n",
"90180 2\n",
"71280 2\n",
"81470 2\n",
"Name: count, Length: 177, dtype: int64\n",
"\n",
"Андерсэмплинг:\n",
"Распределение суммы продаж в обучающей выборке:\n",
"Store_Sales\n",
"105150 1\n",
"16370 1\n",
"17670 1\n",
"21300 1\n",
"21470 1\n",
" ..\n",
"30610 1\n",
"30020 1\n",
"28330 1\n",
"27970 1\n",
"27840 1\n",
"Name: count, Length: 508, dtype: int64\n",
"\n",
"Распределение суммы продаж в контрольной выборке:\n",
"Store_Sales\n",
"20270 1\n",
"21750 1\n",
"23740 1\n",
"24410 1\n",
"25820 1\n",
" ..\n",
"87000 1\n",
"87330 1\n",
"89540 1\n",
"94690 1\n",
"102920 1\n",
"Name: count, Length: 174, dtype: int64\n",
"\n",
"Распределение суммы продаж в тестовой выборке:\n",
"Store_Sales\n",
"14920 1\n",
"22310 1\n",
"26770 1\n",
"32260 1\n",
"33730 1\n",
" ..\n",
"93530 1\n",
"93950 1\n",
"97260 1\n",
"102310 1\n",
"116320 1\n",
"Name: count, Length: 177, dtype: int64\n",
"\n"
]
}
],
"source": [
"def oversample(df, target_column):\n",
"    \"\"\"Resample `df` with `RandomOverSampler`, using `target_column` as the label.\n",
"\n",
"    Args:\n",
"        df: DataFrame holding the features and the target column.\n",
"        target_column: Name of the column passed to the sampler as `y`.\n",
"\n",
"    Returns:\n",
"        DataFrame with the resampled features followed by the resampled\n",
"        target as the last column.\n",
"    \"\"\"\n",
"    features = df.drop(columns=target_column)\n",
"    labels = df[target_column]\n",
"    sampler = RandomOverSampler(random_state=42)\n",
"    features_resampled, labels_resampled = sampler.fit_resample(features, labels)\n",
"    return pd.concat([features_resampled, labels_resampled], axis=1)\n",
"\n",
"def undersample(df, target_column):\n",
"    \"\"\"Resample `df` with `RandomUnderSampler`, using `target_column` as the label.\n",
"\n",
"    Args:\n",
"        df: DataFrame holding the features and the target column.\n",
"        target_column: Name of the column passed to the sampler as `y`.\n",
"\n",
"    Returns:\n",
"        DataFrame with the resampled features followed by the resampled\n",
"        target as the last column.\n",
"    \"\"\"\n",
"    features = df.drop(columns=target_column)\n",
"    labels = df[target_column]\n",
"    sampler = RandomUnderSampler(random_state=42)\n",
"    features_resampled, labels_resampled = sampler.fit_resample(features, labels)\n",
"    return pd.concat([features_resampled, labels_resampled], axis=1)\n",
"\n",
"# NOTE(review): the validation and test splits are resampled here as well,\n",
"# not only the training split.\n",
"train_df_oversampled, val_df_oversampled, test_df_oversampled = (\n",
"    oversample(split, 'Store_Sales') for split in (train_df, val_df, test_df)\n",
")\n",
"\n",
"train_df_undersampled, val_df_undersampled, test_df_undersampled = (\n",
"    undersample(split, 'Store_Sales') for split in (train_df, val_df, test_df)\n",
")\n",
"\n",
"print('Оверсэмплинг:')\n",
2024-10-18 21:22:08 +04:00
"# Target distribution of the oversampled splits.\n",
"for resampled_split, split_label in (\n",
"    (train_df_oversampled, 'обучающей выборке'),\n",
"    (val_df_oversampled, 'контрольной выборке'),\n",
"    (test_df_oversampled, 'тестовой выборке'),\n",
"):\n",
"    check_balance(resampled_split, split_label)\n",
"\n",
"# Target distribution of the undersampled splits.\n",
"print('Андерсэмплинг:')\n",
"for resampled_split, split_label in (\n",
"    (train_df_undersampled, 'обучающей выборке'),\n",
"    (val_df_undersampled, 'контрольной выборке'),\n",
"    (test_df_undersampled, 'тестовой выборке'),\n",
"):\n",
"    check_balance(resampled_split, split_label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://www.kaggle.com/datasets/aravinii/house-price-prediction-treated-dataset - датасет имеет данные о домах и их ценах"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['date', 'price', 'bedrooms', 'grade', 'has_basement', 'living_in_m2',\n",
" 'renovated', 'nice_view', 'perfect_condition', 'real_bathrooms',\n",
" 'has_lavatory', 'single_floor', 'month', 'quartile_zone'],\n",
" dtype='object')\n"
]
}
],
"source": [
"# Load the house-price dataset and list its columns.\n",
"houses_csv_path = \".//static//scv//df_test.csv\"\n",
"df_houses = pd.read_csv(houses_csv_path)\n",
"print(df_houses.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы на русском\n",
"date - дата\n",
"price - цена\n",
"bedrooms - количество спален\n",
"grade - оценка\n",
"has_basement - наличие подвала\n",
"living_in_m2 - размер в кв метрах\n",
"renovated - отремонтирован\n",
"nice_view - наличие хорошего вида\n",
"perfect_condition - идеальное состояние\n",
"real_bathrooms - количество ванн\n",
"has_lavatory - наличие туалета\n",
"single_floor - один этаж\n",
"month - месяц\n",
"quartile_zone - зона квартиля"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 6948 entries, 0 to 6947\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 date 6948 non-null object \n",
" 1 price 6948 non-null float64\n",
" 2 bedrooms 6948 non-null int64 \n",
" 3 grade 6948 non-null int64 \n",
" 4 has_basement 6948 non-null bool \n",
" 5 living_in_m2 6948 non-null float64\n",
" 6 renovated 6948 non-null bool \n",
" 7 nice_view 6948 non-null bool \n",
" 8 perfect_condition 6948 non-null bool \n",
" 9 real_bathrooms 6948 non-null int64 \n",
" 10 has_lavatory 6948 non-null bool \n",
" 11 single_floor 6948 non-null bool \n",
" 12 month 6948 non-null int64 \n",
" 13 quartile_zone 6948 non-null int64 \n",
"dtypes: bool(6), float64(2), int64(5), object(1)\n",
"memory usage: 475.1+ KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>grade</th>\n",
" <th>has_basement</th>\n",
" <th>living_in_m2</th>\n",
" <th>renovated</th>\n",
" <th>nice_view</th>\n",
" <th>perfect_condition</th>\n",
" <th>real_bathrooms</th>\n",
" <th>has_lavatory</th>\n",
" <th>single_floor</th>\n",
" <th>month</th>\n",
" <th>quartile_zone</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2015-03-19</td>\n",
" <td>235000.0</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>121.70293</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2014-09-05</td>\n",
" <td>998000.0</td>\n",
" <td>4</td>\n",
" <td>10</td>\n",
" <td>False</td>\n",
" <td>350.24431</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2014-12-19</td>\n",
" <td>430000.0</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>False</td>\n",
" <td>58.52889</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>12</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2014-05-14</td>\n",
" <td>300000.0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>False</td>\n",
" <td>117.05778</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2014-08-14</td>\n",
" <td>243500.0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>False</td>\n",
" <td>196.02533</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date price bedrooms grade has_basement living_in_m2 \\\n",
"0 2015-03-19 235000.0 3 7 False 121.70293 \n",
"1 2014-09-05 998000.0 4 10 False 350.24431 \n",
"2 2014-12-19 430000.0 1 7 False 58.52889 \n",
"3 2014-05-14 300000.0 3 6 False 117.05778 \n",
"4 2014-08-14 243500.0 3 6 False 196.02533 \n",
"\n",
" renovated nice_view perfect_condition real_bathrooms has_lavatory \\\n",
"0 False False False 1 False \n",
"1 False False False 3 True \n",
"2 False False False 1 False \n",
"3 False False False 1 False \n",
"4 False False False 3 False \n",
"\n",
" single_floor month quartile_zone \n",
"0 True 3 1 \n",
"1 False 9 1 \n",
"2 True 12 4 \n",
"3 True 5 3 \n",
"4 True 8 1 "
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column dtypes and non-null counts first, then the first five rows as the cell output.\n",
"df_houses.info()\n",
"df_houses.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Объект наблюдения - недвижимость\n",
"Атрибуты - содержат набор информации о недвижимости, такие как:\n",
"Цена, дата, количество спален, оценка, наличие подвала, площадь, наличие ремонта и так далее"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd5xkV3ng/d85N1XuNN09OWuUcxYSkggCgQnCGIzABmQTbIPZxfa+LxsIXpb9sMa73tcGAwYkkjAGiwxCQkgo5zya0eTYM93TobryDeec94+q6Zme7p7YMz3hfD8fha66de+pW9Vd96nzPM8RxhiDZVmWZVmWZVmWNSU50wOwLMuyLMuyLMs63tnAybIsy7Isy7Is6wBs4GRZlmVZlmVZlnUANnCyLMuyLMuyLMs6ABs4WZZlWZZlWZZlHYANnCzLsizLsizLsg7ABk6WZVmWZVmWZVkHYAMny7Isy7Isy7KsA7CBk2VZlmVZlmVZ1gHYwMmyLMuyLMuyLOsAbOBkWZZl7dcPf/hDhBCT/nPOOefM9PBOWZVKhU996lO8/vWvp7OzEyEEt91220wPy7Is66TlzvQALMuyrBPDf/7P/5kzzzxz7Of/8T/+xwyOxhocHORv//ZvWbhwIeeffz733XffTA/JsizrpGYDJ8uyLOugvPa1r+W6664b+/lrX/sag4ODMzegU9ycOXPYsWMHs2fP5sknn+TSSy+d6SFZlmWd1GyqnmVZlrVfURQBIOWBPzJuu+02hBBs2rRp7DatNeedd96EVLLnn3+e973vfSxdupRUKsXs2bO55ZZbGBoaGrfPT3/605OmCbrunu/+rrvuOs455xyeeuoprrrqKtLpNEuWLOHLX/7yhOfyyU9+kosvvpi2tjay2SzXXHMN995777jtNm3aNHacH//4x+PuazQadHR0IITgC1/4woRx9vT0EMfxuMd873vfG9vf3sHmT37yE974xjcyd+5cgiBg2bJl/Pf//t9RSh3wXAdBwOzZsw+4nWVZljU97IyTZVmWtV+7A6cgCA7r8d/+9rd54YUXJtx+9913s2HDBt7//vcze/ZsVq5cyVe/+lVWrlzJo48+ihBi3Pb//M//TC6XG/t530BuZGSEN7zhDbzjHe/gXe96F//2b//Gn/3Zn+H7PrfccgsApVKJr33ta7zrXe/iAx/4AOVyma9//eu87nWv4/HHH+eCCy4Yt89UKsWtt97KW9/61rHb7rjjDhqNxpTPt1wu8/Of/5ybbrpp7LZbb72VVCo14XG33XYbuVyOj3/84+RyOX7729/yyU9+klKpxN/93d9NeQzLsizr2LOBk2VZlrVfo6OjAKTT6UN+bBiGfPKTn+TGG2/kV7/61bj7/vzP/5y/+qu/GnfbFVdcwbve9S4efPBBrrnmmnH3vf3tb2fWrFlTHquvr4+///u/5+Mf/zgAH/rQh7j88sv5xCc+wR/90R/heR4dHR1s2rQJ3/fHHveBD3yAM844g3/8x3/k61//+rh93nTTTfzgBz+gv7+f3t5eAL7xjW/wtre9jdtvv33Scdx000184xvfGAuctmzZwj333MM73/lOvve9743b9vbbbx93Xj/84Q/z4Q9/mC996Ut89rOfPexg1bIsy5p+NlXPsizL2q/dqXPd3d2H/NgvfvGLDA0N8alPfWrCfXsHDI1Gg8HBQa644goAnn766UM+luu6fOhDHxr72fd9PvShDzEwMMBTTz0FgOM4Y0GT1prh4WGSJOGSSy6Z9JgXXXQRZ599Nt/+9rcB2Lx5M/feey/ve9/7phzHLbfcwp133snOnTsB+OY3v8mVV17JihUrJmy79zkol8sMDg5yzTXXUKvVWL169SGfA8uyLOvoOaUDp/vvv583velNzJ07d9I89oNhjOELX/gCK1asIAgC5s2bZztNWZZ1Utm8eTOu6x5y4DQ6OsrnPvc5Pv7xj4/N1uxteHiYj33sY/T29p
JOp+nu7mbJkiVjjz1Uc+fOJZvNjrttd7Cyd83VN7/5Tc477zxSqRRdXV10d3fzi1/8Yspjvv/97+fWW28Fmql1V111FaeddtqU47jgggs455xz+Na3voUxhttuu433v//9k267cuVKbrrpJtra2igUCnR3d/Oe97wHOLxzYFmWZR09p3TgVK1WOf/88/niF7942Pv42Mc+xte+9jW+8IUvsHr1an76059y2WWXTeMoLcuyZtbLL7/M0qVLxzVjOBif//znkVLyN3/zN5Pe/453vIN/+Zd/4cMf/jB33HEHd911F3feeSfQnA06Gr7zne/wvve9j2XLlvH1r3+dO++8k7vvvptXvepVUx7zPe95D+vWrePRRx/lm9/85pRB0N5uueUWbr31Vn73u9+xc+dO3vGOd0zYplgscu211/Lcc8/xt3/7t/zsZz/j7rvv5vOf/zxw9M6BZVmWdXhO6RqnG2+8kRtvvHHK+8Mw5L/8l//C9773PYrFIueccw6f//znx9rxrlq1in/+53/mxRdf5PTTTwcY+7bUsizrZBCGIc8+++y45ggHo6+vj//7f/8v//N//k/y+fyETnkjIyPcc889fOYzn+GTn/zk2O1r16497LH29fVRrVbHzTqtWbMGgMWLFwPNxXyXLl3KHXfcMa75xGSphLt1dXXx5je/eSzt7x3veMcB27C/+93v5m/+5m/42Mc+xtvf/nby+fyEbe677z6Ghoa44447eOUrXzl2+8aNGw/q+VqWZVnH1ik943QgH/nIR3jkkUf413/9V55//nn+4A/+gNe//vVjH+w/+9nPWLp0KT//+c9ZsmQJixcv5k//9E8ZHh6e4ZFblmVNj9tvv50wDHn1q199SI/7zGc+Q29vLx/+8Icnvd9xHKCZ7ry3f/iHfziscQIkScJXvvKVsZ+jKOIrX/kK3d3dXHzxxVMe97HHHuORRx7Z775vueWWsc+BvTv7TaWzs5O3vOUtPP/882Md/fY12ViiKOJLX/rSAfdvWZZlHXun9IzT/mzZsoVbb72VLVu2MHfuXAD++q//mjvvvJNbb72Vz33uc2zYsIHNmzfzgx/8gG9961sopfiP//E/8va3v53f/va3M/wMLMuyDl+1WuUf//Ef+du//Vscx8EYw3e+851x2/T391OpVPjOd77Da1/72nF1THfddRff/e53x3Wv21uhUOCVr3wl/+t//S/iOGbevHncddddRzTbMnfuXD7/+c+zadMmVqxYwfe//32effZZvvrVr+J5HgC/93u/xx133MFNN93EG9/4RjZu3MiXv/xlzjrrLCqVypT7fv3rX8+uXbsOKmja7bbbbuOLX/zilJ0Ar7rqKjo6Onjve9/LX/7lXyKE4Nvf/vaEYHJ//umf/olisUhfXx/Q/EJv27ZtAHz0ox+lra3toPdlWZZl7Z8NnKbwwgsvoJSa0AUpDEO6urqAZv55GIZ861vfGtvu61//OhdffDEvv/zyWPqeZVnWiWbXrl184hOfGPt57251+/qjP/oj7r333nGB0wUXXMC73vWu/R7j9ttv56Mf/Shf/OIXMcZwww038Ktf/Wrsy6pD1dHRwTe/+U0++tGP8i//8i/09vbyT//0T3zgAx8Y2+Z973sfO3fu5Ctf+Qq//vWvOeuss/jOd77DD37wA+67774p9y2E2G8r9Mmk0+n9tnDv6uri5z//OX/1V3/Ff/2v/5WOjg7e85738OpXv5rXve51B3WML3zhC2zevHns5zvuuIM77rgDaNZm2cDJsixr+ghzKF9tncSEEPzoRz8ay+P//ve/z7vf/W5Wrlw5lk6xWy6XY/bs2XzqU5/ic5/73LgV4uv1OplMhrvuuovXvva1x/IpWJZlTZtNmzaxZMkS7r333rG6ziPZ7mi77rrrGBwc5MUXX5yxMViWZVknNzvjNIULL7wQpRQDAwMTFmHc7RWveAVJkrB+/XqWLVsG7ClEXrRo0TEbq2VZlmVZlmVZR9
cpHThVKhXWrVs39vPGjRt59tln6ezsZMWKFbz73e/mj//4j/n7v/97LrzwQnbt2sU999zDeeedxxvf+EZe85rXcNF
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"\n",
"# Price against living area; the marker colour also encodes the price.\n",
"plt.scatter(df_houses['living_in_m2'], df_houses['price'], c=df_houses['price'], alpha=0.6)\n",
"\n",
"plt.title(\"Диаграмма 1\")\n",
"plt.ylabel(\"Цена\")\n",
"plt.xlabel(\"Площадь дома\")\n",
"# `visible` takes a bool; the former string 'true' only worked because any\n",
"# non-empty string is truthy ('false' would have enabled the grid as well).\n",
"plt.grid(visible=True)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Присутствует связь между атрибутами: уровень цены зависит от всех атрибутов дома.\n",
"Для примера на графике приведена связь между ценой и площадью дома.\n",
"Примеры бизнес целей\n",
"\n",
"1. Прогнозирование рыночных тенденций.\n",
"2. Увеличение доходов от продаж.\n",
" \n",
"Эффект для бизнеса: определение цен на объекты недвижимости, предсказание изменения цен на недвижимость.\n",
"Цели технического проекта\n",
"\n",
"Для первой цели:\n",
"\n",
"Вход: Площадь\n",
"Целевой признак: Цена\n",
"\n",
"Для второй цели:\n",
"\n",
"Вход: Дата и месяц\n",
"Целевой признак: Цена"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка на выбросы"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пропущенные значения по столбцам:\n",
"date 0\n",
"price 0\n",
"bedrooms 0\n",
"grade 0\n",
"has_basement 0\n",
"living_in_m2 0\n",
"renovated 0\n",
"nice_view 0\n",
"perfect_condition 0\n",
"real_bathrooms 0\n",
"has_lavatory 0\n",
"single_floor 0\n",
"month 0\n",
"quartile_zone 0\n",
"dtype: int64\n",
"\n",
"Статистический обзор данных:\n",
" price bedrooms grade living_in_m2 real_bathrooms \\\n",
"count 6.948000e+03 6948.000000 6948.000000 6948.000000 6948.000000 \n",
"mean 5.001805e+05 3.350461 7.580455 187.564084 1.723805 \n",
"std 2.464662e+05 0.886833 1.098832 76.220497 0.697007 \n",
"min 7.500000e+04 0.000000 3.000000 34.374110 0.000000 \n",
"25% 3.200000e+05 3.000000 7.000000 131.341616 1.000000 \n",
"50% 4.470000e+05 3.000000 7.000000 176.469248 2.000000 \n",
"75% 6.200000e+05 4.000000 8.000000 231.328470 2.000000 \n",
"max 1.485000e+06 7.000000 12.000000 661.469360 6.000000 \n",
"\n",
" month quartile_zone \n",
"count 6948.000000 6948.000000 \n",
"mean 6.527778 2.467473 \n",
"std 3.106609 1.087747 \n",
"min 1.000000 1.000000 \n",
"25% 4.000000 2.000000 \n",
"50% 6.000000 2.000000 \n",
"75% 9.000000 3.000000 \n",
"max 12.000000 4.000000 \n"
]
}
],
"source": [
"# Per-column count of missing values.\n",
"null_values = df_houses.isna().sum()\n",
"print('Пропущенные значения по столбцам:', null_values, sep='\\n')\n",
"\n",
"# Descriptive statistics as returned by DataFrame.describe().\n",
"stat_summary = df_houses.describe()\n",
"print('\\nСтатистический обзор данных:', stat_summary, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"На основе данных можно сделать вывод об отсутствии пропущенных значений"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Коэффициент асимметрии для столбца 'price': 1.2360372254399883\n",
"\n",
"Коэффициент асимметрии для столбца 'bedrooms': 0.2707651264666999\n",
"\n",
"Коэффициент асимметрии для столбца 'grade': 0.5399780033708754\n",
"\n",
"Коэффициент асимметрии для столбца 'living_in_m2': 0.9231129090688741\n",
"\n",
"Коэффициент асимметрии для столбца 'real_bathrooms': 0.6726357568802298\n",
"\n",
"Коэффициент асимметрии для столбца 'month': 0.08293910123309992\n",
"\n",
"Коэффициент асимметрии для столбца 'quartile_zone': 0.023363344136892138\n",
"\n",
"Количество дубликатов: 0\n"
]
}
],
"source": [
"# Skewness of every numeric column.\n",
"numeric_houses = df_houses.select_dtypes(include='number')\n",
"for column in numeric_houses:\n",
"    skewness = numeric_houses[column].skew()\n",
"    print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")\n",
"\n",
"# Number of rows that exactly repeat an earlier row.\n",
"duplicates = df_houses.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Дубликаты отсутствуют, выбросы незначительны"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Устранение шумов"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADqqUlEQVR4nOydeXgUVfb3v93Z02QlQAcGSNhGQoAAimAQFMMIQcH9J7ijuMEMg/MqoqAwqMigA4ygKCqoCLgygCAjGBwkE0SBACEgEBNQSMDskJ10vX+Eanqp5dbWW87neZgx3VV1b91a+p57zvkeE8dxHAiCIAiCIAiCIAhRzN7uAEEQBEEQBEEQhK9DhhNBEARBEARBEIQMZDgRBEEQBEEQBEHIQIYTQRAEQRAEQRCEDGQ4EQRBEARBEARByECGE0EQBEEQBEEQhAxkOBEEQRAEQRAEQchAhhNBEARBEARBEIQMZDgRBEEQBEEQBEHIQIYTQRAEQRCED/Hbb79h1apV9r+Liorw8ccfe69DBEEAIMOJIAid+fzzz2EymQT/paamert7BEEQPo/JZMKUKVPwn//8B0VFRXjmmWfw/fffe7tbBNHqCfZ2BwiCCEyee+459O7d2/73yy+/7MXeEARB+A+dOnXC5MmTMXr0aABAYmIivvvuO+92iiAImDiO47zdCYIgAofPP/8cd955J3bs2IHrrrvO/vl1112H0tJS5OXlea9zBEEQfkRBQQFKS0uRmpoKi8Xi7e4QRKuHQvUIgtCVxsZGAIDZLP96WbVqFUwmE4qKiuyf2Ww29OvXDyaTySnG/+DBg3jwwQfRrVs3hIeHw2q1YtKkSSgrK3M65pw5cwTDBIODLzvYr7vuOqSmpmLv3r245pprEBERgeTkZCxfvtztXF544QUMGjQIMTExsFgsuPbaa7Fjxw6n7YqKiuzt/Pvf/3b6rr6+HnFxcTCZTHjttdfc+tm+fXs0NTU57bN27Vr78UpLS+2fb9iwAWPHjkXHjh0RFhaG7t27Y968eWhubpYda769o0eP4q677kJ0dDTatm2LadOmob6+3mnblStXYuTIkWjfvj3CwsKQkpKCt956S/C4X3/9NUaMGIGoqChER0fjqquuwpo1a5y2+eGHH5CZmYm4uDhYLBb069cPS5Yscdrm6NGjuOOOOxAfH4/w8HBceeWV2Lhxo9M2Su6XBx980On6x8XF4brrrnMLd2IdU/6eceW1115z61NSUhIefPBBp+0+++wzmEwmJCUlOX1+7tw5PPzww+jSpQuCgoLs/W3Tpo1bW64kJSWJhsWaTCa37VevXo1BgwYhIiIC8fHxuPvuu/Hrr78KnqfcswEADQ0NePHFF9GjRw+EhYWhc+fOeOaZZ9DQ0OC27XfffcfcT1f4e1fo/B3HWcn9AcD+LLRr1w4RERH44x//iOeff96pTal/vAfouuuuc1okAlo87Gaz2e1Z+Oyzz+zXICEhAffeey9Onz7ttM2DDz5ov0+6d++Oq6++GuXl5YiIiHA7P4IgPAuF6hEEoSu84RQWFqZq/48++giHDh1y+3zbtm345Zdf8NBDD8FqteLw4cN45513cPjwYezevdttYvXWW285TT5dDbmKigpkZmbirrvuwoQJE/Dpp5/iiSeeQGhoKCZNmgQAqK6uxrvvvosJEyZg8uTJOH/+PN577z3ceOON2LNnD9LS0pyOGR4ejpUrV+KWW26xf/bll1+6GSaOnD9/Hl999RVuvfVW+2crV65EeHi4236rVq1CmzZt8NRTT6FNmzbIysrCCy+8gOrqaixcuFC0DUfuuusuJCUlYf78+di9ezf+9a9/oaKiAh9++KHT2PXp0wfjxo1DcHAwNm3ahCeffBI2mw1Tpkxx6s+kSZPQp08fzJw5E7Gxsdi/fz+2bt2KiRMnAmi5bjfddBMSExMxbdo0WK1WHDlyBF999RWmTZsGADh8+DDS09PRqVMnPPvss7
BYLPj0009xyy234IsvvnAaG1fE7hcASEhIwKJFiwC0JNsvWbIEmZmZ+PXXXxEbG6vbmMpx8eJF+4TclQceeADbt2/Hn//8Z/Tv3x9BQUF45513sG/fPqZjp6Wl4W9/+5vTZx9++CG2bdvm9NnLL7+M2bNn46677sIjjzyC33//HW+88QaGDx+O/fv328cDYHs2bDYbxo0bh127duHRRx9F7969cejQISxatAjHjh1zW0Dg+ctf/oKrrrpKtJ96I3Z/HDx4ENdeey1CQkLw6KOPIikpCQUFBdi0aRNefvll3HbbbejRo4d9++nTp6N379549NFH7Z85hiI7snLlSsyaNQuvv/66/TkAWu61hx56CFdddRXmz5+Ps2fPYsmSJcjOzna7Bq688MILku8RgiA8BEcQBKEjixcv5gBwBw4ccPp8xIgRXJ8+fZw+W7lyJQeAKyws5DiO4+rr67kuXbpwY8aM4QBwK1eutG9bW1vr1tbatWs5ANzOnTvtn7344oscAO73338X7eOIESM4ANzrr79u/6yhoYFLS0vj2rdvzzU2NnIcx3EXL17kGhoanPatqKjgOnTowE2aNMn+WWFhIQeAmzBhAhccHMyVlJTYv7vhhhu4iRMncgC4hQsXuvVzwoQJ3E033WT//OTJk5zZbOYmTJjgdh5CY/DYY49xkZGRXH19vej5OrY3btw4p8+ffPJJt+sl1M6NN97IdevWzf53ZWUlFxUVxV199dVcXV2d07Y2m43juJbxS05O5rp27cpVVFQIbsNxLWPUt29fp3Ow2WzcNddcw/Xs2dP+mZL75YEHHuC6du3q1OY777zDAeD27Nkjea5CYyp0/3Icxy1cuNCpTxzHcV27duUeeOAB+99vvvkmFxYWxl1//fVOfaqrq+PMZjP32GOPOR3zgQce4CwWi1tbrnTt2pUbO3as2+dTpkzhHH/ei4qKuKCgIO7ll1922u7QoUNccHCw0+esz8ZHH33Emc1m7vvvv3c65vLlyzkAXHZ2ttPn33zzDQeA+/zzz0X7KcbcuXM5AE73DH/+juOs5P4YPnw4FxUVxZ08edLpmK5tiLXlyIgRI7gRI0ZwHMdxmzdv5oKDg7m//e1vTts0NjZy7du351JTU52el6+++ooDwL3wwgv2z1zv3by8PM5sNtvPw/FeIwjCs1CoHkEQusKHzrVr107xvsuWLUNZWRlefPFFt+8iIiLs/11fX4/S0lIMGTIEAJhX5x0JDg7GY489Zv87NDQUjz32GM6dO4e9e/cCAIKCghAaGgqgZYW9vLwcFy9exJVXXinY5sCBA9GnTx989NFHAICTJ09ix44dbmFbjkyaNAlbt25FSUkJAOCDDz7A0KFD0atXL7dtHcfg/PnzKC0txbXXXova2locPXqU6bwdPUYA8Oc//xkAsGXLFsF2qqqqUFpaihEjRuCXX35BVVUVgBZP0vnz5/Hss88iPDzc6Zi892///v0oLCzEX//6V7fVdH6b8vJyZGVl4a677rKfU2lpKcrKynDjjTfi+PHjbqFMPFL3C9Byzfjj5ebm4sMPP0RiYqKTp0DJmDY3N9uPx/+rra0VbJuntrYWf//73zF16lR06dLF6buamhrYbDa0bdtW8hha+fLLL2Gz2XDXXXc59d1qtaJnz55uoacsz8Znn32G3r1744orrnA65siRIwHA7Zi8t8T1XmGhffv2AFq8hkoQuz9+//137Ny5E5MmTXK7Jiyhg2Ls2bMHd911F26//XY3b+VPP/2Ec+fO4cknn3Qag7Fjx+KKK67A5s2bRY87c+ZMDBw4EHfeeafqvhEEoQ+t2nDauXMnbr75ZnTs2FEwN4EFjuPw2muvoVevXggLC0OnTp1IPYxo1Zw8eRLBwcGKDaeqqiq88soreOqpp9ChQwe378vLyzFt2jR06NABERERaNeuHZKTk+37KqVjx45uyda8seKYQ/DBBx+gX79+CA8PR9u2bdGuXTts3rxZtM2HHn
oIK1euBNASmnPNNdegZ8+eov1IS0tDamoqPvzwQ3AcZw/nEeLw4cO49dZbERMTg+joaLRr1w733nsvAPYxcO1L9+7
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы в датасете:\n",
" date price bedrooms grade has_basement living_in_m2 \\\n",
"142 2014-05-05 890000.0 4 10 True 410.63126 \n",
"166 2014-06-12 1240000.0 5 11 False 409.70223 \n",
"201 2014-09-23 1230000.0 5 9 True 407.84417 \n",
"209 2014-08-28 1275000.0 6 11 True 572.28248 \n",
"345 2014-12-23 1042000.0 4 10 True 457.08276 \n",
"... ... ... ... ... ... ... \n",
"6777 2015-03-30 1436000.0 4 11 True 461.72791 \n",
"6795 2014-05-27 1149000.0 4 11 True 551.84382 \n",
"6809 2014-09-12 1040000.0 5 11 True 443.14731 \n",
"6905 2014-05-29 1425000.0 4 11 True 460.79888 \n",
"6931 2015-03-31 985000.0 5 10 True 431.99895 \n",
"\n",
" renovated nice_view perfect_condition real_bathrooms has_lavatory \\\n",
"142 False False False 4 True \n",
"166 False False False 4 False \n",
"201 False False False 4 False \n",
"209 False False False 5 True \n",
"345 False True False 4 True \n",
"... ... ... ... ... ... \n",
"6777 False True False 3 True \n",
"6795 False True False 3 False \n",
"6809 False False False 3 True \n",
"6905 True False False 4 True \n",
"6931 False False False 4 True \n",
"\n",
" single_floor month quartile_zone \n",
"142 False 5 3 \n",
"166 False 6 4 \n",
"201 False 9 4 \n",
"209 False 8 4 \n",
"345 False 12 2 \n",
"... ... ... ... \n",
"6777 False 3 4 \n",
"6795 False 5 3 \n",
"6809 True 9 4 \n",
"6905 False 5 4 \n",
"6931 False 3 3 \n",
"\n",
"[135 rows x 14 columns]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeXgV1fnHv/eG7GQlQAICCZsSw14RjIhiqAiKa63gjqIotBT7U0GlglSRagu24IYIVgTqggUFqWCwSAyihAAhUCAkoJCA2SE7ufP7I8zlLrOcM8u9c5P38zw+j9ycmTkz58yZ857zvt/XJgiCAIIgCIIgCIIgCEIWu78rQBAEQRAEQRAEYXXIcCIIgiAIgiAIglCBDCeCIAiCIAiCIAgVyHAiCIIgCIIgCIJQgQwngiAIgiAIgiAIFchwIgiCIAiCIAiCUIEMJ4IgCIIgCIIgCBXIcCIIgiAIgiAIglCBDCeCIAiCIAiCIAgVyHAiCIIgCIIgCIJQgQwngiB8wieffAKbzSb5X1pamr+rRxAEQRAEoUg7f1eAIIi2xbPPPot+/fo5//3SSy/5sTYEQRAEQRBskOFEEIRPGTNmDK699lrnv999912Ulpb6r0IEQRAEQRAMkKseQRA+obGxEQBgt6sPOytXroTNZkNRUZHzN4fDgQEDBsBms2HlypXO3/ft24cHH3wQPXv2RFhYGBITEzF58mSUlZW5nXPu3LmSboLt2l1cP7r22muRlpaG3bt346qrrkJ4eDhSUlLw1ltved3Ln/70JwwdOhQxMTGIjIzEyJEjsW3bNrdyRUVFzuv8+9//dvtbfX094uLiYLPZ8Nprr3nVs1OnTmhqanI7Zs2aNc7zuRqb69evx/jx49GlSxeEhoaiV69emD9/Ppqbm1WftXi9Q4cO4a677kJ0dDQ6dOiAGTNmoL6+3q3sihUrMHr0aHTq1AmhoaFITU3Fm2++KXneL7/8EqNGjUJUVBSio6NxxRVXYPXq1W5lvv/+e4wbNw5xcXGIjIzEgAED8Prrr7uVOXToEO68807Ex8cjLCwMv/rVr7Bhwwa3Mjz95cEHH3Rr/7i4OFx77bX49ttv3c7J+kzFPuPJa6+95lWn5ORkPPjgg27lPv74Y9hsNiQnJ7v9fubMGTz88MPo3r07goKCnPVt376917U8SU5OlnWLtdlsbmXPnz+P+fPno1evXggNDUVycjKeffZZNDQ0eJ2XpU1d+7zSdR0OBxYvXozLL78cYWFh6Ny5Mx577DFUVFQw3Z/nc/zmm29gs9nwzTffOH+79tpr3RZpAOCHH36QrA8ArFq1CsOGDUNERATi4uJwzTXX4KuvvnJeU+mZiu0n3r9rnzt79iyGDh2KlJQUFBcXy5YDgGnTpsFms3ndH0EQ1oB2nAiC8Ami4RQaGqrp+A8++AD79+/3+n3Lli04duwYHnroISQmJuLAgQN45513cODAAezcudNrgvTmm2+6TT49DbmKigqMGzcOd911FyZOnIiPPvoIjz/+OEJCQjB58mQAQHV1Nd59911MnDgRU6ZMwdmzZ7F8+XLccMMN2LVrFwYNGuR2zrCwMKxYsQK33nqr87d169Z5GSaunD17Fl988QVuu+02528rVqxAWFiY13ErV65E+/bt8eSTT6J9+/bIzMzEn/70J1RXV+PVV1+VvYYrd911F5KTk7FgwQLs3LkTf//731FRUYF//vOfbs/u8ssvx4QJE9CuXTt8/vnneOKJJ+BwODBt2jS3+kyePBmXX345Zs+ejdjYWOzZswebN2/GpEmTALS020033YSkpCTMmDEDiYmJOHjwIL744gvMmDEDAHDgwAGkp6eja9eumDVrFiIjI/HRRx/h1ltvxaeffur2bDyR6y8AkJCQgEWLFgEAfv75Z7z++usYN24cfvrpJ8TGxhr2TNU4f/48nnvuOcm/PfDAA9i6dSt+97vfYeDAgQgKCsI777yDnJwcpn
MPGjQIf/zjH91+++c//4ktW7a4/fbII4/g/fffx5133ok//vGP+P7777FgwQIcPHgQn332mbMcS5u68uijj2LkyJEAWvq667kA4LHHHsPKlSvx0EMP4fe//z0KCwuxZMkS7NmzB1lZWQgODma6T16eeeYZyd/nzZuHuXPn4qqrrsKLL76IkJAQfP/998jMzMSvf/1rLF68GOfOnQMAHDx4EC+//LKb27GcQdvU1IQ77rgDJ06cQFZWFpKSkmTrdvToUSxbtkznHRIEYSoCQRCED1i8eLEAQNi7d6/b76NGjRIuv/xyt99WrFghABAKCwsFQRCE+vp6oXv37sKNN94oABBWrFjhLFtbW+t1rTVr1ggAhO3btzt/e+GFFwQAwi+//CJbx1GjRgkAhL/+9a/O3xoaGoRBgwYJnTp1EhobGwVBEITz588LDQ0NbsdWVFQInTt3FiZPnuz8rbCwUAAgTJw4UWjXrp1QUlLi/Nv1118vTJo0SQAgvPrqq171nDhxonDTTTc5fz9+/Lhgt9uFiRMnet2H1DN47LHHhIiICKG+vl72fl2vN2HCBLffn3jiCa/2krrODTfcIPTs2dP578rKSiEqKkq48sorhbq6OreyDodDEISW55eSkiL06NFDqKiokCwjCC3PqH///m734HA4hKuuukro06eP8zee/vLAAw8IPXr0cLvmO++8IwAQdu3apXivUs9Uqv8KgiC8+uqrbnUSBEHo0aOH8MADDzj//cYbbwihoaHCdddd51anuro6wW63C4899pjbOR944AEhMjLS61qe9OjRQxg/frzX79OmTRNcP/u5ubkCAOGRRx5xK/d///d/AgAhMzNTEAS2NhU5cuSIAEB4//33nb+JfUzk22+/FQAIH374oduxmzdvlvzdk5SUFOH+++93+23btm0CAGHbtm3O30aNGiWMGjXK+e9NmzYJAISxY8e61efIkSOC3W4XbrvtNqG5uVnx/uSuJSK+8ytWrBAcDodwzz33CBEREcL3338vW07krrvuEtLS0oRu3bq59ROCIKwDueoRBOETRNe5jh07ch+7dOlSlJWV4YUXXvD6W3h4uPP/6+vrUVpaiuHDhwMA8+q8K+3atcNjjz3m/HdISAgee+wxnDlzBrt37wYABAUFISQkBECLy1F5eTnOnz+PX/3qV5LXHDJkCC6//HJ88MEHAIDjx49j27Ztiu44kydPxubNm1FSUgIAeP/99zFixAj07dvXq6zrMzh79ixKS0sxcuRI1NbW4tChQ0z37bpjBAC/+93vAACbNm2SvE5VVRVKS0sxatQoHDt2DFVVVQBadpLOnj2LWbNmISwszO2c4u7fnj17UFhYiD/84Q/OHR7PMuXl5cjMzMRdd93lvKfS0lKUlZXhhhtuwJEjR3Dy5EnJe1HqL0BLm4nny83NxT//+U8kJSW5iZbwPNPm5mbn+cT/amtrJa8tUltbixdffBHTp09H9+7d3f5WU1MDh8OBDh06KJ5DL2LbPvnkk26/iztVGzduBMDWpiIsO8sff/wxYmJiMGbMGLdnNnToULRv397L5dWTTp064eeff2a4w4sIgoDZs2fjjjvuwJVXXun2t3//+99wOBz405/+5LUDLeXSx8pTTz2FDz/8EB999BGGDRumWHb37t34+OOPsWDBAiZ3ZoIg/EObfju3b9+Om2++GV26dJGMQWBBEAS89tpr6Nu3L0JDQ9G1a1dSCSMICY4fP4527dpxG05VVVV4+eWX8eSTT6Jz585efy8vL8eMGTPQuXNnhIeHo2PHjkhJSXEey0uXLl0QGRnp9ptorLjGq7z//vsYMGAAwsLC0KFDB3Ts2BEbN26UveZDDz2EFStWAGhxe7rqqqvQp08f2XoMGjQIaWlp+Oc//wlBEJxuTVIcOHAAt912G2JiYhAdHY2OHTvi3nvvBcD+DDzr0qtXL9jtdrd7zsrKQkZGBiIjIxEbG4uOHTvi2WefdbtOQUEBAChKzL
OUOXr0KARBwJw5c9CxY0e3/0SD6MyZM17HqfUXAPjpp5+c5xo8eDAKCgrw6aefurlb8TzTQ4cOydZRjr/97W+or69
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Scatter plot of the raw data, before any outlier handling.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(x=df_houses['living_in_m2'], y=df_houses['price'])\n",
"plt.title('Диаграмма рассеивания перед чисткой')\n",
"plt.xlabel('Площадь')\n",
"plt.ylabel('Цена')\n",
"plt.show()\n",
"\n",
"# IQR rule: values further than 1.5 * IQR from the quartiles are treated as outliers.\n",
"Q1, Q3 = (df_houses['living_in_m2'].quantile(q) for q in (0.25, 0.75))\n",
"IQR = Q3 - Q1\n",
"threshold = 1.5 * IQR\n",
"lower_bound, upper_bound = Q1 - threshold, Q3 + threshold\n",
"\n",
"area_too_small = df_houses['living_in_m2'] < lower_bound\n",
"area_too_large = df_houses['living_in_m2'] > upper_bound\n",
"outliers = area_too_small | area_too_large\n",
"\n",
"# Show the rows flagged as outliers.\n",
"print('Выбросы в датасете:', df_houses[outliers], sep='\\n')\n",
"\n",
"# Replace the flagged values with the column median.\n",
"# NOTE(review): this mutates df_houses in place, so re-running the cell\n",
"# recomputes the bounds on already-modified data.\n",
"median_score = df_houses['living_in_m2'].median()\n",
"df_houses.loc[outliers, 'living_in_m2'] = median_score\n",
"\n",
"# Same scatter plot after the replacement.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(x=df_houses['living_in_m2'], y=df_houses['price'])\n",
"plt.title('Диаграмма рассеивания после чистки')\n",
"plt.xlabel('Площадь')\n",
"plt.ylabel('Цена')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 4168\n",
"Размер контрольной выборки: 1390\n",
"Размер тестовой выборки: 1390\n"
]
}
],
"source": [
"# Hold out 20% of the rows as the test set.\n",
"train_val_df, test_df = train_test_split(df_houses, test_size=0.2, random_state=42)\n",
"\n",
"# A quarter of the remaining rows becomes the validation set.\n",
"train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)\n",
"\n",
"for split_title, split_df in (\n",
"    ('Размер обучающей выборки:', train_df),\n",
"    ('Размер контрольной выборки:', val_df),\n",
"    ('Размер тестовой выборки:', test_df),\n",
"):\n",
"    print(split_title, len(split_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Видим недостаток баланса"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение цены в обучающей выборке:\n",
"price\n",
"450000.0 46\n",
"500000.0 40\n",
"375000.0 30\n",
"550000.0 30\n",
"300000.0 27\n",
" ..\n",
"579950.0 1\n",
"1218000.0 1\n",
"374990.0 1\n",
"626000.0 1\n",
"1169000.0 1\n",
"Name: count, Length: 1463, dtype: int64\n",
"\n",
"Распределение цены в контрольной выборке:\n",
"price\n",
"400000.0 16\n",
"450000.0 16\n",
"550000.0 14\n",
"600000.0 14\n",
"500000.0 13\n",
" ..\n",
"744500.0 1\n",
"419950.0 1\n",
"369950.0 1\n",
"616000.0 1\n",
"220000.0 1\n",
"Name: count, Length: 710, dtype: int64\n",
"\n",
"Распределение цены в тестовой выборке:\n",
"price\n",
"600000.0 15\n",
"550000.0 13\n",
"425000.0 13\n",
"250000.0 12\n",
"525000.0 12\n",
" ..\n",
"607000.0 1\n",
"465250.0 1\n",
"359500.0 1\n",
"427500.0 1\n",
"442000.0 1\n",
"Name: count, Length: 718, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df_houses, name):\n",
"    \"\"\"Print how many times each distinct ``price`` value occurs in a sample.\n",
"\n",
"    Args:\n",
"        df_houses: DataFrame that contains a ``price`` column.\n",
"        name: Label of the sample, inserted into the printed header.\n",
"    \"\"\"\n",
"    price_frequencies = df_houses['price'].value_counts()\n",
"    header = f\"Распределение цены в {name}:\"\n",
"    # The trailing empty string produces the blank separator line after the table.\n",
"    print(header, price_frequencies, \"\", sep=\"\\n\")\n",
"\n",
"# Show the price distribution of every split, in train / validation / test order.\n",
"for sample, label in (\n",
"    (train_df, \"обучающей выборке\"),\n",
"    (val_df, \"контрольной выборке\"),\n",
"    (test_df, \"тестовой выборке\"),\n",
"):\n",
"    check_balance(sample, label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для балансировки выборок также используем oversampling и undersampling"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оверсэмплинг:\n",
"Распределение цены в обучающей выборке:\n",
"price\n",
"626000.0 46\n",
"369000.0 46\n",
"650000.0 46\n",
"1169000.0 46\n",
"850000.0 46\n",
" ..\n",
"735000.0 46\n",
"210500.0 46\n",
"265000.0 46\n",
"255000.0 46\n",
"369300.0 46\n",
"Name: count, Length: 1463, dtype: int64\n",
"\n",
"Распределение цены в контрольной выборке:\n",
"price\n",
"863000.0 16\n",
"585000.0 16\n",
"560000.0 16\n",
"207000.0 16\n",
"267000.0 16\n",
" ..\n",
"510000.0 16\n",
"260000.0 16\n",
"550000.0 16\n",
"185000.0 16\n",
"435000.0 16\n",
"Name: count, Length: 710, dtype: int64\n",
"\n",
"Распределение цены в тестовой выборке:\n",
"price\n",
"331292.0 15\n",
"950000.0 15\n",
"519000.0 15\n",
"240000.0 15\n",
"193000.0 15\n",
" ..\n",
"250000.0 15\n",
"420000.0 15\n",
"870000.0 15\n",
"242000.0 15\n",
"470000.0 15\n",
"Name: count, Length: 718, dtype: int64\n",
"\n",
"Андерсэмплинг:\n",
"Распределение цены в обучающей выборке:\n",
"price\n",
"1485000.0 1\n",
"80000.0 1\n",
"81000.0 1\n",
"82000.0 1\n",
"83000.0 1\n",
" ..\n",
"125000.0 1\n",
"118125.0 1\n",
"115000.0 1\n",
"110000.0 1\n",
"100000.0 1\n",
"Name: count, Length: 1463, dtype: int64\n",
"\n",
"Распределение цены в контрольной выборке:\n",
"price\n",
"1475000.0 1\n",
"114000.0 1\n",
"115000.0 1\n",
"126000.0 1\n",
"130000.0 1\n",
" ..\n",
"163250.0 1\n",
"160000.0 1\n",
"158000.0 1\n",
"157000.0 1\n",
"150000.0 1\n",
"Name: count, Length: 710, dtype: int64\n",
"\n",
"Распределение цены в тестовой выборке:\n",
"price\n",
"1475000.0 1\n",
"75000.0 1\n",
"100000.0 1\n",
"111300.0 1\n",
"114975.0 1\n",
" ..\n",
"169900.0 1\n",
"169575.0 1\n",
"169500.0 1\n",
"165000.0 1\n",
"163500.0 1\n",
"Name: count, Length: 718, dtype: int64\n",
"\n"
]
}
],
"source": [
"\n",
"\n",
"def oversample(df, target_column):\n",
"    \"\"\"Return a new DataFrame rebalanced with ``RandomOverSampler``.\n",
"\n",
"    The values of ``target_column`` are passed to the sampler as the labels\n",
"    to balance; all remaining columns are passed as the features.\n",
"\n",
"    Args:\n",
"        df: Source DataFrame.\n",
"        target_column: Name of the column used as the label.\n",
"\n",
"    Returns:\n",
"        DataFrame made of the resampled feature columns followed by the\n",
"        resampled target column (the target ends up as the last column).\n",
"    \"\"\"\n",
"    features = df.drop(columns=target_column)\n",
"    target = df[target_column]\n",
"\n",
"    # A fixed seed keeps the random row selection reproducible between runs.\n",
"    sampler = RandomOverSampler(random_state=42)\n",
"    features_balanced, target_balanced = sampler.fit_resample(features, target)  # type: ignore\n",
"\n",
"    return pd.concat([features_balanced, target_balanced], axis=1)\n",
"\n",
"def undersample(df, target_column):\n",
"    \"\"\"Return a new DataFrame rebalanced with ``RandomUnderSampler``.\n",
"\n",
"    The values of ``target_column`` are passed to the sampler as the labels\n",
"    to balance; all remaining columns are passed as the features.\n",
"\n",
"    Args:\n",
"        df: Source DataFrame.\n",
"        target_column: Name of the column used as the label.\n",
"\n",
"    Returns:\n",
"        DataFrame made of the resampled feature columns followed by the\n",
"        resampled target column (the target ends up as the last column).\n",
"    \"\"\"\n",
"    features = df.drop(columns=target_column)\n",
"    target = df[target_column]\n",
"\n",
"    # A fixed seed keeps the random row selection reproducible between runs.\n",
"    sampler = RandomUnderSampler(random_state=42)\n",
"    features_balanced, target_balanced = sampler.fit_resample(features, target)  # type: ignore\n",
"\n",
"    return pd.concat([features_balanced, target_balanced], axis=1)\n",
"\n",
"# Resample every split independently, using the price column as the label to balance.\n",
"# NOTE(review): balancing is usually applied to the training split only; resampling the\n",
"# validation and test splits changes the data a model would be evaluated on - confirm\n",
"# that this is intended for the assignment.\n",
"train_df_oversampled = oversample(train_df, 'price')\n",
"val_df_oversampled = oversample(val_df, 'price')\n",
"test_df_oversampled = oversample(test_df, 'price')\n",
"\n",
"train_df_undersampled = undersample(train_df, 'price')\n",
"val_df_undersampled = undersample(val_df, 'price')\n",
"test_df_undersampled = undersample(test_df, 'price')\n",
"\n",
"# Price distribution of each split after oversampling.\n",
"print(\"Оверсэмплинг:\")\n",
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
"\n",
"# Price distribution of each split after undersampling.\n",
"print(\"Андерсэмплинг:\")\n",
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
"check_balance(test_df_undersampled, \"тестовой выборке\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}