978 lines
283 KiB
Plaintext
978 lines
283 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Лабораторная 2"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Информация об экономике стран"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 132,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
|
|||
|
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
|
|||
|
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
|
|||
|
" 'tradebalance', 'USTreasury'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\".//static//scv//Economic Data - 9 Countries (1980-2020).csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Столбцы на русском:\n",
|
|||
|
"'stock index' - фондовый индекс\n",
|
|||
|
"'country' - страна\n",
|
|||
|
"'year' - год\n",
|
|||
|
"'index price' - цена индекса\n",
|
|||
|
"'log_indexprice' - логарифм цены индекса\n",
|
|||
|
"'inflationrate' - уровень инфляции\n",
|
|||
|
"'oil prices' - цена на нефть\n",
|
|||
|
"'exchange_rate' - обменный курс\n",
|
|||
|
"'gdppercent' - процент ВВП\n",
|
|||
|
"'percapitaincome' - доход на душу населения\n",
|
|||
|
"'unemploymentrate' - уровень безработицы\n",
|
|||
|
"'manufacturingoutput' - объем производства\n",
|
|||
|
"'tradebalance' - торговый баланс\n",
|
|||
|
"'USTreasury' - показатель казначейских облигаций США"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 133,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 369 entries, 0 to 368\n",
|
|||
|
"Data columns (total 14 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 stock index 369 non-null object \n",
|
|||
|
" 1 country 369 non-null object \n",
|
|||
|
" 2 year 369 non-null float64\n",
|
|||
|
" 3 index price 317 non-null float64\n",
|
|||
|
" 4 log_indexprice 369 non-null float64\n",
|
|||
|
" 5 inflationrate 326 non-null float64\n",
|
|||
|
" 6 oil prices 369 non-null float64\n",
|
|||
|
" 7 exchange_rate 367 non-null float64\n",
|
|||
|
" 8 gdppercent 350 non-null float64\n",
|
|||
|
" 9 percapitaincome 368 non-null float64\n",
|
|||
|
" 10 unemploymentrate 348 non-null float64\n",
|
|||
|
" 11 manufacturingoutput 278 non-null float64\n",
|
|||
|
" 12 tradebalance 365 non-null float64\n",
|
|||
|
" 13 USTreasury 369 non-null float64\n",
|
|||
|
"dtypes: float64(12), object(2)\n",
|
|||
|
"memory usage: 40.5+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>stock index</th>\n",
|
|||
|
" <th>country</th>\n",
|
|||
|
" <th>year</th>\n",
|
|||
|
" <th>index price</th>\n",
|
|||
|
" <th>log_indexprice</th>\n",
|
|||
|
" <th>inflationrate</th>\n",
|
|||
|
" <th>oil prices</th>\n",
|
|||
|
" <th>exchange_rate</th>\n",
|
|||
|
" <th>gdppercent</th>\n",
|
|||
|
" <th>percapitaincome</th>\n",
|
|||
|
" <th>unemploymentrate</th>\n",
|
|||
|
" <th>manufacturingoutput</th>\n",
|
|||
|
" <th>tradebalance</th>\n",
|
|||
|
" <th>USTreasury</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1980.0</td>\n",
|
|||
|
" <td>168.61</td>\n",
|
|||
|
" <td>2.23</td>\n",
|
|||
|
" <td>0.14</td>\n",
|
|||
|
" <td>21.59</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.09</td>\n",
|
|||
|
" <td>12575.0</td>\n",
|
|||
|
" <td>0.07</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-13.06</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1981.0</td>\n",
|
|||
|
" <td>203.15</td>\n",
|
|||
|
" <td>2.31</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>31.77</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.12</td>\n",
|
|||
|
" <td>13976.0</td>\n",
|
|||
|
" <td>0.08</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-12.52</td>\n",
|
|||
|
" <td>0.14</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1982.0</td>\n",
|
|||
|
" <td>188.98</td>\n",
|
|||
|
" <td>2.28</td>\n",
|
|||
|
" <td>0.06</td>\n",
|
|||
|
" <td>28.52</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.04</td>\n",
|
|||
|
" <td>14434.0</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-19.97</td>\n",
|
|||
|
" <td>0.13</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1983.0</td>\n",
|
|||
|
" <td>285.43</td>\n",
|
|||
|
" <td>2.46</td>\n",
|
|||
|
" <td>0.03</td>\n",
|
|||
|
" <td>26.19</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.09</td>\n",
|
|||
|
" <td>15544.0</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-51.64</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1984.0</td>\n",
|
|||
|
" <td>248.89</td>\n",
|
|||
|
" <td>2.40</td>\n",
|
|||
|
" <td>0.04</td>\n",
|
|||
|
" <td>25.88</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" <td>17121.0</td>\n",
|
|||
|
" <td>0.08</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-102.73</td>\n",
|
|||
|
" <td>0.12</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" stock index country year index price log_indexprice \\\n",
|
|||
|
"0 NASDAQ United States of America 1980.0 168.61 2.23 \n",
|
|||
|
"1 NASDAQ United States of America 1981.0 203.15 2.31 \n",
|
|||
|
"2 NASDAQ United States of America 1982.0 188.98 2.28 \n",
|
|||
|
"3 NASDAQ United States of America 1983.0 285.43 2.46 \n",
|
|||
|
"4 NASDAQ United States of America 1984.0 248.89 2.40 \n",
|
|||
|
"\n",
|
|||
|
" inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n",
|
|||
|
"0 0.14 21.59 1.0 0.09 12575.0 \n",
|
|||
|
"1 0.10 31.77 1.0 0.12 13976.0 \n",
|
|||
|
"2 0.06 28.52 1.0 0.04 14434.0 \n",
|
|||
|
"3 0.03 26.19 1.0 0.09 15544.0 \n",
|
|||
|
"4 0.04 25.88 1.0 0.11 17121.0 \n",
|
|||
|
"\n",
|
|||
|
" unemploymentrate manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"0 0.07 NaN -13.06 0.11 \n",
|
|||
|
"1 0.08 NaN -12.52 0.14 \n",
|
|||
|
"2 0.10 NaN -19.97 0.13 \n",
|
|||
|
"3 0.10 NaN -51.64 0.11 \n",
|
|||
|
"4 0.08 NaN -102.73 0.12 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 133,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Объект наблюдения - экономика\n",
|
|||
|
"Атрибуты - набор сведений об экономике стран, таких как:\n",
|
|||
|
"Фондовый рынок, ВВП, страна, год, стоимость топлива, уровень инфляции, уровень безработицы и так далее"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 134,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2wAAAImCAYAAAAv2AnvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xc1Z3//9e5905Vl2xJ7hVXbEwxnYQAIVmSQIAsaaSQAhvIptdfymZTN/slm0bqEpIQkgCBNAIJLIYUiDE2xRTbuODeZKtLU++95/fH2AJhGayZsVX8fj4eCtLcO+dzro5Hmc+ccz/HWGstIiIiIiIiMuw4Q90BERERERERGZgSNhERERERkWFKCZuIiIiIiMgwpYRNRERERERkmFLCJiIiIiIiMkwpYRMRERERERmmlLCJiIiIiIgMU0rYREREREREhiklbCIiIiIiIsOUEjYRERnVPvWpT3HOOecc9Pg555zDpz71qSPYo8PnvvvuY/bs2UPdDRERKSMlbCIiIqPAsmXL+OhHPzrU3RARkTJTwiYiIjKC9fT08M1vfpMrrriCaDQ61N0REZEyU8ImIiLyPEEQ8Mtf/pLXve51LFy4kLPPPptrr72WbDbbd86nPvUpZs+ezate9aoDnn/JJZcwe/Zsvvvd7/Y91tHRwec//3lOP/10FixYwGWXXcbSpUv7PW/27NncdNNNfPKTn+T444/n9NNP5ytf+Uq/uAO57bbbuPXWW/n85z/P5ZdfXuLVi4jIcKOETUREjgq+7w/49UKf//zn+drXvsZ5553HD37wA9761rdy0003cfXVV2Ot7TsvmUyyefNmNmzY0PfYli1bWLNmTb/2stks73jHO1iyZAkf/vCHue6662hubuY973nPAUnbt7/9bVpbW/nWt77Fe97zHm655RY++clPvuh1nXPOOdx333286U1vKubXIiIiw5w31B0QERE53LZv3878+fNf8rz169dz22238dGPfpQrr7wSgDPOOIPGxkY+8YlP8Pe//52Xv/zlANTV1TFz5kyWLFnCjBkzALjrrrs46aSTWLZsWV+bf/jDH1izZg233norxx13HAAve9nLeNvb3sa1117L7bff3ndufX09P/zhD/E8j5e//OU4jsPXvvY1/v3f/70vxgtNnjy5uF+KiIiMCJphExGRUW/s2LHcdtttA36NHTu277yHH34YgNe85jX9nv+a17wG13X7JWIA5557LkuWLOn7+a677jrguUuXLmXs2LHMnz+/b1YvCAJe8YpX8NRTT9HZ2dl37ute9zo877nPUvcvuVy+fHmJvwERERmpNMMmIiKjXjQaZcGCBQc9tt/+5On5SRyA53nU1dXR3d3d7/HzzjuPb33rW+zZs4euri6effZZzj//fD7/+c/3ndPR0cGePXsOOsO3Z88eampqAGhqaup3rKGhoV+/RETk6KOETUREZJ/9idOePXuYMGFC3+P5fJ729nbq6ur6nT9z5kwmT57M/fffz+7duznttNMOOKeqqoqpU6dy7bXXDhhz4sSJfd+3t7f3O7Z3716gsFRSRESOTloSKSIiss/JJ58MwJ133tnv8TvvvJMgCDjxxBMPeM7+ZZF33XUXF1xwwYBt7ty5k4aGBhYsWND39eCDD3L99dfjum7fuffdd1+/5959990YYzj11FPLcXkiIjICaYZNRERkn5kzZ3LxxRfzne98h3Q6zeLFi1m9ejXXXXcdp5xyCmedddYBzzn33HP5+c9/juu6vPKVrzzg+CWXXMJNN93EFVdcwb/9278xbtw4/vnPf/K///u/XH755UQikb5zH3/8cT72sY9x0UUXsWbNGr773e9y2WWXMWnSpMN63SIiMnwpYRMREXmer3zlK0yZMoXbb7+d//3f/6WxsZG3v/3tXH311TjOgQtTTjjhBGprazn++OOprKw84HgymeSXv/wl3/jGN/h//+//0d3dzYQJE/joRz/Ku9
71rn7nvuMd72D37t28//3vp66ujn/7t3/jqquuOmzXKiIiw5+xz99URkRERIbE7Nmzef/738+///u/D3VXRERkGNE9bCIiIiIiIsOUEjYREREREZFhSksiRUREREREhinNsImIiIiIiAxTSthERERERESGKSVsIiIiIiIiw5T2YTsMHnvsMay1/TZDFRERERGRo08+n8cYw/HHH1/U8zXDdhhYazmUWi7WWnK53CGdK6OHxv3opHE/Omncj04a96OPxvzodKjjfqi5wcFohu0w2D+ztmDBghc9L5VKsXr1ambOnEkymTwSXZNhQON+dNK4H5007kcnjfvRR2N+dDrUcX/yySdLiqMZNhERERERkWFKCZuIiIiIiMgwpYRNRERERERkmFLCJiIiIiIiMkwpYRMRERERERmmlLCJiIiIiIgMU0rYREREREREhiklbCIiIiIiIsOUEjYREREREZFhSgmbiIiIiIjIMKWETUREREREZJhSwiYiIiIiIjJMKWETEREREREZppSwiYiIiIiIDFPeUHdAREYeGwZYPwthAMbBRGIYR39ORERERMpN77BE5JBZa7G5FGG2GxsGGMACJutiIkmceBXGmKHupoiIiMiooYRNRA6ZzWcIM10AGDdSSM6sBRsQZnvAGNx41RD3UkRERGT00D1sInJIrLWEuR7AYlyvbybNGINxCj/bXAobhkPbUREREZFRRAmbiBya0IfAB+MOfNxxIQywQe7I9ktERERkFFPCJiKHxlrAwovcomax+84TERERkXJQwiYih8ZxwThgD7Lk0VqMcTDOQWbgRERERGTQlLCJyCExjouJJLBhiH3BLJq1trBk0o2CGxmiHoqIiIiMPqoSKSKHzIlVYoMc1s8VZtv2VYm0NsS4EdxEtcr6i4iIiJSREjYROWTGcXGT9YW92PLpwvJI4+BEkzjRCoyrPykiIiIi5aR3VyIyKMZxMfEqTKyyUGDEGM2qiYiIiBwmSthERhlrQ2wuU5j9clxMJH5YEipjTGFJpIiIiIgcNkrYREYJay0220PY217YC81aMA7Gi+FU1uNEk0PdRREREREZJFWJFBklbLaHoKulkKy5EUwkDo6HzWcIOncT5tJD3UURERERGSQlbCKjgLUhYW87YDFeDGMKL23jOOBFIfQJUx0HlOMvPp7F5rOE2V5sPlO2dkVERESkPy2JFBkFbC5dKLXvHbgHmjEG63rYfBqCfCGBKzFW0LMXm3uuSqSJxHAqG3BilSW1LSIiIiL9aYZNZDSwIWD7ZtYOYJzCPW02LC1MPoPfsQOb7QHHAS8GjltI4jp2Ema6S2pfRERERPpTwiYyGjguGAcbHiQhCwszYThuSWGCnlbwc+DFMY6HMQbjuIXELQwJe1q1PFJERESkjJSwiYwCJpLAeFEI8gckTNZaCH1MLIlxD1wyeaisn8PmUoWCJi8o52+MATeC9bOFpZIiIiIiUhZK2ERGAWMMTkVDYQbNz2LDoLAfW+CDn8V4UdxkXWlBwmDf3m4D/9kwzr5ll6FfWhwRERER6aOETWSUcGJJ3JpmTKyikFgFhcTJJKpxa8YVZuBKClBYdslBll3aMCxspF3isksREREReY6qRIqMIk40Udh/Lcjvmw1zS1oG2Y8bwUST2EwX1nH7LYu01kKQw0RimGiiPPFERERERAmbyGhjjCm5dP/B2nUr6vHzafAzWDe6r/pkWEgQHQencszBK1WKiIiIyKDpnZWIHDITTeDWTsDEKgv3tPnZQkGTSBy3ZhxOvGqouygiIiIyqmiGTUQGxYkmMHUT9xU38cG4mEj8gMqRIiIiIlI6JWwiMmjGGIjEUYomIiIicnhpSaSIiIiIiMgwpYRNRERERERkmFLCJiIiIiIiMkwpYRMRERERERmmlLCJiIiIiIgMU0rYREREREREhiklbCIiIiIiIsOU9mETkUGz1kIujQ3y4LiYWIU2zhYRERE5DIbFDNvvf/97LrjgAhYsWM
BrXvMa/vznP/cd27ZtG1dddRUnnHACZ555Jt/61rcIgqDf83/5y19y7rnnsnDhQt7ylrewatWqfsfL0YaIFISZHvw
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"\n",
|
|||
|
"plt.scatter(df['inflationrate'], df['percapitaincome'], c=df['percapitaincome'], alpha=0.6)\n",
|
|||
|
"\n",
|
|||
|
"plt.title(\"Номер 1\")\n",
|
|||
|
"plt.ylabel(\"Доход на душу населения\")\n",
|
|||
|
"plt.xlabel(\"Уровень инфляции\")\n",
|
|||
|
"plt.grid(visible='true')\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 135,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/UAAAImCAYAAAASZqrMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC0nklEQVR4nOzdeXhTZfYH8O/N2rRNm+4LO5SlQKGg7BQVFR1FRQR3ZVTEDVFUZtRx+Om4jI46KAKKiuuIKDrghoiCAyiyL2UpO2Xrku5btja5vz/SpC0t0LRJbnLz/TyPj3CT3Jy0l7Qn73nPEURRFEFEREREREREQUchdQBERERERERE1DZM6omIiIiIiIiCFJN6IiIiIiIioiDFpJ6IiIiIiIgoSDGpJyIiIiIiIgpSTOqJiIiIiIiIghSTeiIiIiIiIqIgxaSeiIiIiIiIKEgxqSciIiIiIiIKUiqpAyAiIqJze/LJJ7Fs2bIWb7v++uvx8ssv+zki+frxxx/x/vvv4+jRo4iKisLIkSPx+OOPIz4+XurQiIiIWsSknoiIKAgkJCRg3rx5TY5Nnz5domjk6YcffsBjjz2Gm266CTNnzkRxcTHefPNNTJkyBf/973+h1WqlDpGIiKgZJvVEREQBzm63Izw8HJmZmU2OazQaaQKSqXfeeQcXXXQR/vGPf7iPdevWDTfeeCN+/fVXXHnllRJGR0RE1DLuqSciIgpwdXV1CAsLO+/9nnzySYwdO7bJsSVLlqB3795466233Mf279+P6dOnY/jw4ejXrx+ysrLwwgsvwGKxuO/Tu3fvZv8BwNixYzFnzhy89NJLGDJkCIYNG4a//OUvKC8vb/K8S5cuxcSJE5GZmYkBAwbguuuuw48//ui+/b///a/7vIWFhU0e+8orr6B379644447msXz9ttvN7nvwYMH3bedOnWq1c9/JofDgVGjRuHGG29scrx79+4AgBMnTpz1sURERFLiSj0REVGAM5vNiI6O9vhxFRUVeOONN5ocMxqNuO2225CZmYmXX34ZGo0G69atw4cffojExERMmzbNfd9JkyZh8uTJzc67ePFidOnSBf/85z9RWlqK119/HcePH8eSJUsgCAI+++wzvPDCC3j44YdxwQUXoKKiAu+99x6eeOIJDBo0CMnJye5zRUREYPXq1bj11lsBAKIo4scff4RC0XzdISIiAmvWrMEDDzzgPrZixQooFAo4HA73MU+e30WhUODJJ59sdvyXX34BAPTs2fNsX2YiIiJJMaknIiIKcOXl5UhMTPT4cXPnzkVqairKysrcxw4ePIj09HS8+eabiIyMBACMHDkSv//+OzZt2tQkqU9OTm5W8g84E+APP/wQer0eABAbG4uHHnoI69evx5gxY3Dy5Encc889ePDBB92P6dChAyZOnIht27bh6quvdh8fM2ZMk6R+x44dqKioQL9+/Zo975gxY7By5UoYjUb31+PHH3/EkCFDsGnTJvf9PHn+czlx4gReeeUVpKen46KLLmrVY4iIiPyNST0REVGAMxqNyMjI8OgxBw8exBdffIFPP/0UN998s/v46NGjMXr0aNTW1uLw4cM4fvw4Dh48iNLSUhgMhlade+zYse6E3vV3lUqFLVu2YMyYMe4V78rKShw9ehTHjx93J902m63JuS699FI89dRTqK6uRmRkJFasWIGxY8fCaDQ2e94ePXqgS5cuWLNmDW6++Wbs27cP+fn5uO2225ok9Z48/9kcOXIE99xzD1QqFebOndti5QAREVEg4E8oIiKiAGaz2VBQUODe291aL7zwAq6++moMGjSoyXGHw4HXXnsNQ4cOxdVXX43nn38eOTk5HnV2T0pKavJ3hUKBmJgYVFRUAHCucP/5z3/GkCFDcPvtt2PRokWoq6sD4CyvbywzMxPR0dFYt24dHA4HVq5ciauuuuqsz33ppZdi9erVAJyl92PGjHFXHLh48vwt2bRpE2655RYAwM
cff4zOnTuf9zFERERS4Uo9ERFRAMvJyYHdbkdaWlqrH/Pjjz9iz549eP3115vd9u677+Kjjz7Cc889h3HjxrlX3CdNmtTq8zcu5wec3fnLysoQGxsLh8OBadOmQa1W46uvvkJ6ejpUKhUOHz6Mb775ptm5BEHAJZdcgtWrVyMuLg5WqxVZWVn46KOPWnzuSy+9FJ9++imqq6uxYsUKPPHEE00a/Hn6/Gf6/vvv8eSTT6Jbt254//33m32AQUREFGi4Uk9ERBTA1q5dC71ej4EDB7bq/jabDf/617/w0EMPISEhodnt27ZtQ1paGm644QZ3Ql9YWIiDBw82aTZ3LuvWrWtSxr569WrU1dVhxIgRKCsrw7FjxzBp0iRkZGRApVK5HwOgxee47LLLsG7dOnz77be47LLLzjmqb9CgQdDr9Zg/fz7KyspwySWXNLm9Lc/vsnbtWvzlL3/BoEGD8PnnnzOhJyKioMCVeiIiogC1fft2fPnll+jbty/27dvX7HabzYbS0lKcOHHCXSJeVFSEbt264c4772zxnAMGDMCCBQvw7rvvIjMzE8ePH8fChQths9lgNptbFVd+fj4eeOAB3HnnncjPz8e///1vZGVlYdiwYQCcTek+++wzJCcnIyoqCuvXr8cnn3wCAC0+x4gRI1BXV4dly5Zh4cKF53xuhUKBSy65BB9//DGuvPJK6HS6JrfHxcV5/PwAYLVa8be//Q0RERG4//77cfjw4Sa3Jycnt9g1n4iISGpM6omIiAKUa193UVERbrrpphbvs3btWsTGxuLll192H/vb3/4GtVrd4v3vu+8+lJWV4ZNPPsH8+fORkpKC6667DoIgYOHChaisrERUVNQ547r66qsRFRWFRx99FOHh4bj++usxc+ZM9+0LFizAiy++iCeffBIajQZpaWl4++238dJLL2Hr1q1N5s8DgFarRVZWFjZv3owRI0ac9+ty2WWX4auvvjrr3ntPnx9wfoBSVFQEALj77rub3T59+nQ8/PDD542NiIjI3wSxNR1jiIiIyO969+6Nf/7zn5g4ceJZ73PHHXegQ4cOTZJ6Xxo7diyGDh3qt+cjIiKic+OeeiIiIiIiIqIgxfJ7IiKiADVw4EDExsae8z49evRosSEeERERhQaW3xMREREREREFKZbfExEREREREQUpJvVEREREREREQYpJPREREREREVGQYqO8VtixYwdEUTzrzF8iIiIiIiIib6qtrYUgCBg0aNA578eV+lYQRRHsJ3h+oijCZrPxa0WS47VIgYLXIgUCXocUKHgtUiAIpuuwtXkoV+pbwbVCn5GRIXEkgc1kMiEnJwdpaWkIDw+XOhwKYbwWKVDwWqRAwOuQAgWvRQoEwXQd7t69u1X340o9ERERERERUZBiUk9EREREREQUpJjUExEREREREQUpJvVEREREREREQYpJPREREREREVGQYlJPREREREREFKSY1BMREREREREFKSb1REREREREREGKST0RERERERFRkGJST0RERERERBSkmNQTERERERERBSkm9URERERERERBikk9ERERERERUZBiUk9EREREREQUpCRP6h0OB+bOnYusrCxkZmbi3nvvxcmTJ1v1uKlTp+Ktt95qdlt2djZuu+02DBgwABdddBHmzp0Lh8Phi/CJiIiIiIiIJCN5Ur9gwQIsXrwYzz//PJYsWeJO1m0221kfY7PZ8PTTT2P9+vXNbjt27BjuvPNO9OjRA99++y2efvppfPTRR1i0aJEvXwYRERERERGR36mkfHKbzYYPPvgATzzxBC6++GIAwJw5c5CVlYVVq1Zh/PjxzR6zfft2zJ49GxaLBVFRUc1uX7hwIdLS0vDcc89BEAR07doVBw4cwPbt2339ciRXXG5GXnE1UuMjEW/QSR0OERERERER+ZikSf3+/ftRU1ODESNGuI9FRUWhb9++2LJlS4tJ/dq1a5GVlYWHHnoI1157bbPbf/vtN0ydOhWCIL
iPzZgxwzcvIICs2nQc85buhCgCggBMn5yJccO6SB0WERERERER+ZCkSX1BQQEAICUlpcnxxMRE921nmjlz5lnPV11
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"year_condition = df.groupby('gdppercent')['unemploymentrate'].mean().reset_index()\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(12, 6))\n",
|
|||
|
"\n",
|
|||
|
"plt.plot(year_condition['gdppercent'], year_condition['unemploymentrate'], marker='.')\n",
|
|||
|
"\n",
|
|||
|
"plt.title(\"Диаграмма 2\")\n",
|
|||
|
"plt.xlabel(\"GDP percent\")\n",
|
|||
|
"plt.ylabel(\"Unemployent Rate\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Присутствует связь между атрибутами, уровень инфляции влияет и зависит от многих атрибутов.\n",
|
|||
|
"Для примера на первом графике приведена связь между инфляцией и доходом на душу населения. На втором графике показана связь уровня ВВП и безработицы.\n",
|
|||
|
"Примеры бизнес-целей:\n",
|
|||
|
"\n",
|
|||
|
" 1. Прогнозирование уровня инфляции на основе дохода на душу населения.\n",
|
|||
|
" 2. Наблюдение за изменением уровня безработицы в зависимости от уровня ВВП.\n",
|
|||
|
" \n",
|
|||
|
"Эффект для бизнеса: влияние на инвестиции индекса акций и цен на нефть, исследование влияния фондового индекса на инвестиции, исследования инфляции и покупательской способности.\n",
|
|||
|
"Цели технического проекта\n",
|
|||
|
"\n",
|
|||
|
"Для первой цели:\n",
|
|||
|
"\n",
|
|||
|
"Вход: Доход на душу населения\n",
|
|||
|
"Целевой признак: Уровень инфляции.\n",
|
|||
|
"\n",
|
|||
|
"Для второй цели:\n",
|
|||
|
"\n",
|
|||
|
"Вход: Уровень безработицы\n",
|
|||
|
"Целевой признак: Уровень ВВП"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на пропущенные значения и статистический обзор данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 136,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пропущенные значения по столбцам:\n",
|
|||
|
"stock index 0\n",
|
|||
|
"country 0\n",
|
|||
|
"year 0\n",
|
|||
|
"index price 52\n",
|
|||
|
"log_indexprice 0\n",
|
|||
|
"inflationrate 43\n",
|
|||
|
"oil prices 0\n",
|
|||
|
"exchange_rate 2\n",
|
|||
|
"gdppercent 19\n",
|
|||
|
"percapitaincome 1\n",
|
|||
|
"unemploymentrate 21\n",
|
|||
|
"manufacturingoutput 91\n",
|
|||
|
"tradebalance 4\n",
|
|||
|
"USTreasury 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n",
|
|||
|
" year index price log_indexprice inflationrate oil prices \\\n",
|
|||
|
"count 369.000000 317.000000 369.000000 326.000000 369.000000 \n",
|
|||
|
"mean 2000.000000 7898.648297 3.610542 0.041748 39.743171 \n",
|
|||
|
"std 11.848225 7811.336862 0.482481 0.039579 25.452654 \n",
|
|||
|
"min 1980.000000 168.610000 2.230000 -0.040000 11.350000 \n",
|
|||
|
"25% 1990.000000 2407.100000 3.320000 0.020000 19.410000 \n",
|
|||
|
"50% 2000.000000 5160.100000 3.600000 0.030000 28.520000 \n",
|
|||
|
"75% 2010.000000 10279.500000 3.980000 0.057500 57.880000 \n",
|
|||
|
"max 2020.000000 47751.330000 4.680000 0.240000 98.560000 \n",
|
|||
|
"\n",
|
|||
|
" exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
|
|||
|
"count 367.000000 350.000000 368.000000 348.000000 \n",
|
|||
|
"mean 27.897548 0.037114 20719.964674 0.068908 \n",
|
|||
|
"std 49.620521 0.037850 17435.037783 0.043207 \n",
|
|||
|
"min 0.900000 -0.110000 27.000000 0.020000 \n",
|
|||
|
"25% 1.330000 0.020000 2090.250000 0.040000 \n",
|
|||
|
"50% 5.440000 0.030000 19969.500000 0.060000 \n",
|
|||
|
"75% 15.055000 0.060000 36384.000000 0.090000 \n",
|
|||
|
"max 249.050000 0.150000 65280.000000 0.260000 \n",
|
|||
|
"\n",
|
|||
|
" manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"count 278.000000 365.000000 369.000000 \n",
|
|||
|
"mean 328.084820 -15.996384 0.059024 \n",
|
|||
|
"std 622.395923 154.557170 0.033086 \n",
|
|||
|
"min 0.590000 -770.930000 0.010000 \n",
|
|||
|
"25% 80.380000 -25.370000 0.030000 \n",
|
|||
|
"50% 188.160000 -0.140000 0.050000 \n",
|
|||
|
"75% 271.977500 19.080000 0.080000 \n",
|
|||
|
"max 3868.460000 366.140000 0.140000 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"null_values = df.isnull().sum()\n",
|
|||
|
"print(\"Пропущенные значения по столбцам:\")\n",
|
|||
|
"print(null_values)\n",
|
|||
|
"\n",
|
|||
|
"stat_summary = df.describe()\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"print(stat_summary)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Из данных выше видно, что во многих столбцах есть пропущенные значения.\n",
|
|||
|
"Также проверим данные на выбросы и дубликаты:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 137,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'year': 0.0\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'index price': 1.7605604508668822\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'log_indexprice': -0.23716751168770417\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'inflationrate': 1.5616085380027898\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'oil prices': 0.9915046764713877\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'exchange_rate': 2.1575952097650455\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'gdppercent': -0.038272329611460466\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'percapitaincome': 0.3051430219264069\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'unemploymentrate': 1.8092896369785585\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'manufacturingoutput': 4.195480293406057\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'tradebalance': -2.266183907194849\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'USTreasury': 0.6687596580836408\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 0\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"for column in df.select_dtypes(include=[np.number]).columns:\n",
|
|||
|
" skewness = df[column].skew()\n",
|
|||
|
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")\n",
|
|||
|
"\n",
|
|||
|
"duplicates = df.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Судя по коэффициентам асимметрии выше, в столбце «объем производства» (manufacturingoutput) присутствуют выбросы.\n",
|
|||
|
"Удаляем все найденные пустые значения."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 138,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"В наборе данных 'Economic' было удалено 150 строк с пустыми значениями.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def drop_missing_values(dataframe, name):\n",
|
|||
|
" before_shape = dataframe.shape \n",
|
|||
|
" cleaned_dataframe = dataframe.dropna() \n",
|
|||
|
" after_shape = cleaned_dataframe.shape \n",
|
|||
|
" print(f\"В наборе данных '{name}' было удалено {before_shape[0] - after_shape[0]} строк с пустыми значениями.\")\n",
|
|||
|
" return cleaned_dataframe\n",
|
|||
|
"\n",
|
|||
|
"cleaned_df = drop_missing_values(df, \"Economic\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Очистка данных от шумов:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 139,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2kAAAImCAYAAADJ8cKrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB9UElEQVR4nO3deVyVZf7/8fdhExABARXNSgE3BBQTcdfUcVpsJGvay6y0xaW0TafVymzStNSytM22qalcctKaXzVZ30xDrVBxCRVzV0AgZZVz//5wOOOR7Rw4Hm4Or+fj4aO47+u+7s/1OTfn3B/u+9yXxTAMQwAAAAAAU/Cq7wAAAAAAAP9DkQYAAAAAJkKRBgAAAAAmQpEGAAAAACZCkQYAAAAAJkKRBgAAAAAmQpEGAAAAACZCkQYAAAAAJkKRBgAAAAAmQpEGoFGaOnWqOnXqVOm/qVOn1nd4ABoxwzDUu3dvrVmzRidOnNATTzyhp59+ur7DAuBGPvUdAADUlxYtWmjBggV2yyZMmFBP0QDAaRaLRQ888IDGjx+v0tJSnXfeeXrnnXfqOywAbkSRBqBRKisrU2BgoLp372633M/Pr34CAoAzXH311frTn/6krKwsnX/++bw3AY0MtzsCaJROnTolf3//GttNnTpVQ4YMsVv24YcfqlOnTpo/f75t2fbt2zVhwgT17t1bXbt21YABA/TMM8+oqKjI1qayWyslaciQIZo7d66effZZJSUlKTk5WQ899JByc3Pt9vvxxx9r1KhR6t69uxISEjRy5EitXr3atn7p0qW2fo8cOWK37d///nd16tRJN998c4V4Fi5caNd2586dtnX79+93eP+VufnmmzV16lS9+uqr6tu3ry666CLdc889OnDggF27r776SjfccIMSExMVFxenSy65RO+//75dm6NHj+rhhx9Wnz59lJiYqJtuukk///yzbX1JSYlefPFFDR06VAkJCRoxYoSWLVtWYT+jRo1SfHy8+vXrp2eeeUYFBQW29fPnz7e9LuW+//57u9tg9+/fb/caxsbGqn///nr++edltVpt26Wmpur2229XUlKS4uLiNGTIEM2fP9/WpryfpUuX2u3v7GNuyJAhFW7BnTJlijp16qT169dLkvLz8/W3v/1N/fv3r3CMlbepzJAhQ6q87dfZvA0ZMkT/+c9/dMkll6hbt2665pprKuw7NzdXjz/+uPr27av4+Hhdc801+vHHH+3a/PDDD5XGc+axe7bKfk/Pzm/578eZx/Rvv/2mrl272vVd3XFU3W3SS5cu1fr16+1yvnPnTg0bNkzXXXed3fhuuOEGXXTRRUpOTtb999+vQ4cO2dafGWdISIiio6O1d+/eCnEC8GxcSQPQKBUWFiokJMTp7fLy8vTiiy/aLTt69KhuvPFGde/eXc8995z8/Pz03Xff6a233lLLli01btw4W9urr75af/3rXyv0+8EHH+jCCy/UzJkzlZOToxdeeEF79+7Vhx9+KIvFovfff1/PPPOMJk6cqIsuukh5eXlavHixHnjgASUmJioyMtLWV9OmTfX111/rhhtukHT6+y2rV6+Wl1fFv8s1bdpU33zzje6++27bslWrVsnLy8uu4HBm/2f7+uuv1bx5cz366KOyWq164YUXdPPNN+vzzz9XQECAvv32W40fP1633HKLJk6cqKKiIn3wwQd66qmnFBcXp27duunkyZO6/vrrVVZWpgcffFCtWrXSm2++qdtuu03Lli1Tu3bt9MADD2jNmjW6++671a1bN61Zs0ZTp06Vr6+vRowYoZUrV+qBBx7QFVdcofvuu08HDhzQ3LlzlZGRobfeeksWi6VC7KWlpXr22WcrHdfdd9+twYMHq7CwUD/88IMWL16s9u3b669//au2b9+uW2+9VZdcconmzp0rwzC0cuVKLViwQFFRUbr88surzFdNNmzYoM8//9xu2XPPPacvvvhCDz/8sKKjo+Xj46OtW7
fqqaeeqrG/QYMG6Z577rH9/PHHH+uTTz6x/exo3nJycvTwww9rwoQJuuCCC/Tmm2/q9ttv18cff6wuXbqouLhYo0ePVlZWliZPnqyWLVvq008/1R133KHXX39dffr0kSQVFRUpMjJSL730ki2G6dOn1zpf1ZkxY4ZOnTplt6y64+iee+6xFVwTJkxQbGysLXcXXHCBfvvtN7u+Zs2apbi4ONvv1/Lly/Xwww9rxIgRuvPOO3X8+HHNmzdP1157rZYtW6bw8HCH4wTg2SjSADRKubm5atmypdPbzZs3T23atNHx48dty3bu3KkuXbropZdeUlBQkCSpb9+++uGHH7R+/Xq7Ii0yMrLCLZaS5OXlpbfeekvNmjWTJIWFhWn8+PH6/vvvNXDgQO3bt0+333673cn0eeedp1GjRmnjxo12J/0DBw60K9J+/vln5eXlqWvXrhX2O3DgQH3xxRc6evSoLR+rV69WUlKS3VUQZ/Z/tsLCQi1dulTnn3++JCkqKkpXXnmlli9fruuvv14ZGRm68sor9cgjj9i2SUxMVHJystavX69u3bpp2bJlOnDggJYtW6YuXbpIknr06KGUlBSlpqaqpKREX375pf72t79p9OjRkqQ+ffrowIEDWr9+vS6//HLNnj1bAwYM0OzZs237adeunW699VatWbNGgwcPrhD7u+++q4KCAkVERFRYd8EFF9heyz59+ujjjz/Wli1bbEVa3759NWvWLFtx3K9fP33zzTe2eGrDarXqmWeeUdeuXbV161bb8rS0NPXv31/XXnutbVlxcbFDfYaFhdkdk99//73t/w3DcDhvhYWFevLJJ5WSkiJJ6t27t4YNG6ZFixZp7ty5WrFihbZv365//vOf6tatm6TTx9/NN9+s2bNn69NPP7X1ExwcbBdT+e9VVby8vJwuYr788kv9+uuvat++vW3Zzp07qz2ORowYoQsuuEDS6Vujz87dmfbu3av/+7//02effaYOHTrIarVq9uzZ6t+/v1544QVbux49euiyyy7TG2+8oYceesihOAF4Poo0AI3S0aNHFR8f79Q2O3fu1EcffaR3333X7val/v37q3///iotLVVGRob27t2rnTt3KicnR6GhoQ71PWTIEFuBVv6zj4+PUlNTNXDgQNvtbvn5+dq9e7f27t1rK6JKSkrs+ho6dKimTZumEydOKCgoSKtWrdKQIUN09OjRCvuNjo7WhRdeqG+++UbXXXed0tPTdejQId144412RZoz+z9bjx49bAWaJMXGxur8889Xamqqrr/+et1xxx2SpJMnT2rPnj36/ffftXnzZru+N27cqLZt29oKNEkKCAjQl19+KUn6xz/+IUkaPny43b7Lb0ndtWuXDh8+rDvvvNPuZD4pKUlBQUH64YcfKhRpWVlZevnll/X000/bFSjlrFarTp06pVOnTun//b//p7y8PMXFxUmSUlJSlJKSouLiYu3Zs0d79+7Vtm3bVFZWptLS0kr7KWcYRpW5/PDDD3Xs2DE99dRTdgVzfHy8/vOf/+jHH39UXFycAgIC7K6E1tbu3bsdzpuPj49GjBhha+Pv76+BAwfqu+++kyT9+OOPatGihbp27WrX18UXX6znn39eeXl5CgkJ0aFDh+x+FxwRHh6unJwclZSUOPTdreLiYv3973/X3XffbVeUbty4UVLVx5GjCgoKNHfuXCUnJ6tDhw6SpD179ujYsWO6//777dpecMEFSkxM1E8//eRwnAA8H0UagEanpKREhw8fVlRUlFPbPfPMM7r88suVmJhot9xqtWrOnDl6//33VVBQoNatWyshIUFNmjRxuO9WrVrZ/ezl5aXmzZsrLy9PkvT777/r8ccf148//ihfX19FRUWpc+fOkiqe1Hfv3l0hISH67rvvdMkll+iLL77Q9OnT9fbbb1e676FDh+rrr7/Wddddp1WrVmngwIEVrlw4s/+axiadPqkuH1tOTo6eeOIJffXVV7JYLLrwwgvVs2
dPu75zc3OrvBWsfH15v9Wtnz59eqW3zlVWwL7wwguKjY3VZZddVmmR9sgjj9hd/YuOjtZf/vIXSadv2Xv66ae1YsU
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы в датасете:\n",
|
|||
|
" stock index country year index price log_indexprice inflationrate \\\n",
|
|||
|
"229 SZCOMP China 2004.0 1467.57 3.17 0.04 \n",
|
|||
|
"230 SZCOMP China 2005.0 1144.54 3.06 0.02 \n",
|
|||
|
"231 SZCOMP China 2006.0 1687.14 3.23 0.02 \n",
|
|||
|
"232 SZCOMP China 2007.0 4329.44 3.64 0.05 \n",
|
|||
|
"233 SZCOMP China 2008.0 2912.90 3.46 0.06 \n",
|
|||
|
"234 SZCOMP China 2009.0 2737.01 3.44 -0.01 \n",
|
|||
|
"235 SZCOMP China 2010.0 2795.88 3.45 0.03 \n",
|
|||
|
"236 SZCOMP China 2011.0 2639.19 3.42 0.06 \n",
|
|||
|
"237 SZCOMP China 2012.0 2211.11 3.34 0.03 \n",
|
|||
|
"238 SZCOMP China 2013.0 2182.52 3.34 0.03 \n",
|
|||
|
"239 SZCOMP China 2014.0 2279.75 3.36 0.02 \n",
|
|||
|
"240 SZCOMP China 2015.0 3657.40 3.56 0.01 \n",
|
|||
|
"241 SZCOMP China 2016.0 2978.14 3.47 0.02 \n",
|
|||
|
"242 SZCOMP China 2017.0 3257.35 3.51 0.02 \n",
|
|||
|
"243 SZCOMP China 2018.0 2920.18 3.47 0.02 \n",
|
|||
|
"244 SZCOMP China 2019.0 2928.94 3.47 0.03 \n",
|
|||
|
"245 SZCOMP China 2020.0 3109.78 3.49 0.02 \n",
|
|||
|
"271 DAX 30 Germany 2005.0 5408.25 3.73 0.02 \n",
|
|||
|
"272 DAX 30 Germany 2006.0 6596.91 3.82 0.02 \n",
|
|||
|
"273 DAX 30 Germany 2007.0 8067.31 3.91 0.02 \n",
|
|||
|
"274 DAX 30 Germany 2008.0 4810.20 3.68 0.03 \n",
|
|||
|
"276 DAX 30 Germany 2010.0 6914.19 3.84 0.01 \n",
|
|||
|
"277 DAX 30 Germany 2011.0 5898.35 3.77 0.02 \n",
|
|||
|
"280 DAX 30 Germany 2014.0 9805.55 3.99 0.01 \n",
|
|||
|
"281 DAX 30 Germany 2015.0 10743.01 4.03 0.01 \n",
|
|||
|
"283 DAX 30 Germany 2017.0 12917.64 4.11 0.02 \n",
|
|||
|
"284 DAX 30 Germany 2018.0 10558.96 4.02 0.02 \n",
|
|||
|
"285 DAX 30 Germany 2019.0 13249.01 4.12 0.01 \n",
|
|||
|
"286 DAX 30 Germany 2020.0 13718.78 4.14 0.01 \n",
|
|||
|
"\n",
|
|||
|
" oil prices exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
|
|||
|
"229 43.15 8.28 0.10 1509.0 0.04 \n",
|
|||
|
"230 59.41 8.19 0.11 1753.0 0.04 \n",
|
|||
|
"231 61.96 7.97 0.13 2099.0 0.04 \n",
|
|||
|
"232 91.69 7.61 0.14 2694.0 0.04 \n",
|
|||
|
"233 41.12 6.95 0.10 3468.0 0.04 \n",
|
|||
|
"234 74.47 6.83 0.09 3832.0 0.04 \n",
|
|||
|
"235 89.15 6.77 0.11 4550.0 0.04 \n",
|
|||
|
"236 98.56 6.46 0.10 5618.0 0.04 \n",
|
|||
|
"237 87.86 6.31 0.08 6317.0 0.04 \n",
|
|||
|
"238 97.63 6.15 0.08 7051.0 0.05 \n",
|
|||
|
"239 59.29 6.16 0.07 7679.0 0.05 \n",
|
|||
|
"240 37.19 6.28 0.07 8067.0 0.05 \n",
|
|||
|
"241 51.97 6.64 0.07 8148.0 0.05 \n",
|
|||
|
"242 57.88 6.76 0.07 8879.0 0.04 \n",
|
|||
|
"243 49.52 6.61 0.07 9977.0 0.04 \n",
|
|||
|
"244 59.88 6.91 0.06 10217.0 0.05 \n",
|
|||
|
"245 47.02 6.90 0.02 10500.0 0.05 \n",
|
|||
|
"271 59.41 1.24 0.01 34520.0 0.12 \n",
|
|||
|
"272 61.96 1.26 0.04 36354.0 0.11 \n",
|
|||
|
"273 91.69 1.37 0.03 41640.0 0.09 \n",
|
|||
|
"274 41.12 1.47 0.01 45613.0 0.08 \n",
|
|||
|
"276 89.15 1.33 0.04 41572.0 0.08 \n",
|
|||
|
"277 98.56 1.39 0.04 46706.0 0.07 \n",
|
|||
|
"280 59.29 1.33 0.02 48024.0 0.07 \n",
|
|||
|
"281 37.19 1.11 0.01 41103.0 0.06 \n",
|
|||
|
"283 57.88 1.13 0.03 44553.0 0.06 \n",
|
|||
|
"284 49.52 1.18 0.01 47811.0 0.05 \n",
|
|||
|
"285 59.88 1.12 0.01 46468.0 0.05 \n",
|
|||
|
"286 47.02 1.14 -0.05 45724.0 0.06 \n",
|
|||
|
"\n",
|
|||
|
" manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"229 625.22 51.17 0.04 \n",
|
|||
|
"230 733.66 124.63 0.04 \n",
|
|||
|
"231 893.13 208.92 0.05 \n",
|
|||
|
"232 1149.72 308.04 0.05 \n",
|
|||
|
"233 1475.66 348.83 0.04 \n",
|
|||
|
"234 1611.95 220.13 0.03 \n",
|
|||
|
"235 1924.32 222.40 0.03 \n",
|
|||
|
"236 2421.37 180.89 0.03 \n",
|
|||
|
"237 2690.09 231.87 0.02 \n",
|
|||
|
"238 2935.34 234.87 0.02 \n",
|
|||
|
"239 3184.24 221.55 0.03 \n",
|
|||
|
"240 3202.50 358.84 0.02 \n",
|
|||
|
"241 3153.12 255.48 0.02 \n",
|
|||
|
"242 3460.33 215.70 0.02 \n",
|
|||
|
"243 3868.46 106.71 0.03 \n",
|
|||
|
"244 3823.41 164.99 0.02 \n",
|
|||
|
"245 3853.81 366.14 0.01 \n",
|
|||
|
"271 571.36 148.05 0.04 \n",
|
|||
|
"272 618.70 162.20 0.05 \n",
|
|||
|
"273 714.38 231.95 0.05 \n",
|
|||
|
"274 750.91 227.47 0.04 \n",
|
|||
|
"276 669.57 178.90 0.03 \n",
|
|||
|
"277 758.60 184.02 0.03 \n",
|
|||
|
"280 786.55 257.40 0.03 \n",
|
|||
|
"281 683.20 255.02 0.02 \n",
|
|||
|
"283 752.02 257.66 0.02 \n",
|
|||
|
"284 795.96 243.72 0.03 \n",
|
|||
|
"285 737.94 223.82 0.02 \n",
|
|||
|
"286 678.29 221.53 0.01 \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2MAAAImCAYAAADe01JiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACBZklEQVR4nO3deVyU5f7/8feAIKACAirapkiiCCgmmqZmWp7WE9py2q1Mq2N60qz0tKdlp1xKK0vbbDutLnnSOkc7Wd/cl0IFNbfSXAEBlZ2Z3x8e5ufAADMwM/cw83o+Hj6K+77u6/5cy30zH+577ttksVgsAgAAAAB4VIDRAQAAAACAPyIZAwAAAAADkIwBAAAAgAFIxgAAAADAACRjAAAAAGAAkjEAAAAAMADJGAAAAAAYgGQMAAAAAAxAMgYAAAAABmhidAAA4C0mTpyohQsX2l03dOhQvfDCCx6OCAAA+DKSMQA4Q6tWrfTqq6/aLHvggQcMigYAAPgykjEA+J+KigqFhYWpe/fuNsuDg4ONCQgAAPg0vjMGAP9TXl6ukJCQOstNnDhRgwYNsln2ySefKCEhQbNnz7Yu2759ux544AFdeOGF6tq1q/r3768pU6aouLjYWiYhIaHaP0kaNGiQZs6cqeeff15paWnq3bu3HnnkEeXl5dns9/PPP9ewYcPUvXt3paSk6Nprr9WyZcus6xcsWGCt98iRIzbb/uMf/1BCQoJuv/32avHMmTPHpuzOnTut6w4cOODw/u25/fbbNXHiRL3xxhvq27evLrjgAv31r3/VH3/8YVNu+fLluuWWW5SamqqkpCRdfvnl+uijj2zKHD16VI8++qj69Omj1NRU3Xbbbdq8ebN1fWlpqV5++WUNHjxYKSkpuvrqq6vdirp8+XINGzZMycnJuuiiizRlyhQVFhZa18+ePds6LpV+/PFHJSQkaOLEiZKkAwcO2IxhYmKi+vXrpxdffFFms9m63fr16zVixAilpaUpKSlJgwYN0uzZs61lKutZsGCBzf6qzrlBgwZZ911p/PjxSkhI0Nq1ayVJBQUF+vvf/65+/fpVm2OVZewZNGiQ3XlZtQ+WLl2qYcOGKTU1VRdddJGefPJJ5efn25T5+eefdffdd6tHjx668MILNX78eJt5OHHixBr3deY827Bhg2677TZ169ZNvXr10qOPPqrc3Nwa21A578+so2q/2evrkpISDR48uFpbFy1apKFDh6pbt24aOHCgpk+frtLSUpvjq+q/yv2ceV44deqUbr/9diUmJqqkpMThcwkA38WVMQD4n6KiIkVERDi9XX5+vl5++WWbZUePHtWtt96q7t2764UXXlBwcLB++OEHvfvuu2rdurVGjRplLXv99dfrhhtuqFbvxx9/rPPOO09Tp05Vbm6upk+frt9++02ffPKJTCaTPvroI02ZMkVjxozRBRdcoPz8fM2bN08TJkxQamqqYmNjrXU1a9ZMK1as0C233CJJslgsWrZsmQICqv9NrlmzZvruu+90//33W5ctXbpUAQEBNomFM/uvasWKFWrZsqUef/xxmc1mTZ8+Xbfffru+/vprhYaG6vvvv9fo0aN1xx13aMyYMSouLtbHH3+sZ599VklJSerWrZtOnTqlm2++WRUVFXr44YfVpk0bvfPOO7r77ru1cOFCtW/fXhMmTNDKlSt1//33q1u3blq5cqUmTpyooKAgXX311VqyZIkmTJiga665Rg8++KD++OMPzZw5U7t27dK7774rk8lULfaysjI9//zzdtt1//33a+DAgSoqKtJPP/2kefPmqUOHDrrhhhu0fft23Xnnnbr88ss1c+ZMWSwWLVmyRK+++qri4uJ01VVX1dhfddmwYYO+/vprm2UvvPCCvvnmGz366KPq2LGjmjRpom3btunZZ5+ts76LL75Yf/3rX60/f/755/riiy+sP7/++uuaNWuWbrnlFo0bN0779+/XK6+8op9//lmfff
aZQkJClJmZaU2gXnzxRVVUVGj69OkaMWKEFi1apCZNTn8EqXpr8Pfff2/zx4D169frrrvu0oUXXqiXX35Z+fn5euWVV3THHXfoiy++cOgPKI566623qiVwH330kZ599lndcMMNGj9+vPbv368XX3xR+fn5evDBB/Xpp59a+yQzM9PalqioqGr1f/zxx8rOztb8+fPtXnG3dy4B4NtIxgDgf/Ly8tS6dWunt5s1a5batWun48ePW5ft3LlTXbp00SuvvKLmzZtLkvr27auffvpJa9eutUnGYmNjq90aKUkBAQF699131aJFC0mnP9yNHj1aP/74owYMGKD9+/drxIgRNh+azzrrLA0bNkwbN260+XA/YMAAm2Rs8+bNys/PV9euXavtd8CAAfrmm2909OhRa38sW7ZMaWlpNldUnNl/VUVFRVqwYIHOOeccSVJcXJyGDh2qRYsW6eabb9auXbs0dOhQPfbYY9ZtUlNT1bt3b61du1bdunXTwoUL9ccff2jhwoXq0qWLJKlHjx5KT0/X+vXrVVpaqm+//VZ///vfNXz4cElSnz599Mcff2jt2rW66qqrNG3aNPXv31/Tpk2z7qd9+/a68847tXLlSg0cOLBa7B988IEKCwsVExNTbd25555rHcs+ffro888/19atW63JWN++ffXSSy9Zk+CLLrpI3333nTWe+jCbzZoyZYq6du2qbdu2WZdnZGSoX79++stf/mJdVlJS4lCdUVFRNnPyxx9/tP5/fn6+5syZoxtvvFFPPvmkdXmnTp1066236ssvv9Stt96qN954Q5GRkXrnnXfUtGlTSVLr1q310EMP6ddff7WOWXBwsM2+9uzZYxPL9OnT1aFDB7355psKDAyUJHXr1k1XXXWVdV9VVfZvRUWFQ+2VpEOHDmnevHk2/Wg2m/Xaa6/p0ksv1ZQpU6xli4qK9PXXX6tFixbW2KOioqq15UwVFRX65z//ab0yao+9cwkA38ZtigDwP0ePHlWbNm2c2mbnzp369NNP9cQTT9gs79evnz788EM1bdpUu3bt0ooVKzRnzhzl5uaqtLTUoboHDRpkTcQqf27SpInWr18v6fQtXhMmTFBBQYF+/vlnLV682HobX9V9DB48WGvXrtXJkyclnb7SNWjQIIWGhlbbb8eOHXXeeefpu+++kyRlZmbq0KFDuvTSS23KObP/qnr06GFNxCQpMTFR55xzjrVt99xzj1544QWdOnVKW7du1dKlS/Xmm2/a1L1x40adffbZ1g/1khQaGqpvv/1WN9xwgzZu3ChJGjJkiM2+Z8+ercmTJ2vPnj06fPiwBg0apPLycuu/tLQ0NW/eXD/99FO1uLOzs/Xaa6/p0UcftSYYZzKbzSovL1dxcbGWLFmi/Px8JSUlSZLS09M1b948lZWVafv27fr22281a9YsVVRUqKyszG49lf8sFkuNffnJJ5/o2LFjGj16tM3y5ORkrVu3TqtXr9aJEydUXl5uc2Wzvn7++WeVlpbq6quvtlnes2dPnXXWWVq3bp2k0+MzYMAAm35KTU3Vd999ZzNmtSkqKtIvv/yiiy++WBaLxdof55xzjjp27Gh3jCQpOjpaknT48GGH2/WPf/xDPXv21CWXXGJdtnfvXuXk5Oiyyy6zKTtixAgtWLBAQUFBDtVdXl6uDz74QKdOndKf//xnu2VqOpcA8G1cGQMAnf6Af/jwYcXFxTm13ZQpU3TVVVcpNTXVZrnZbNaMGTP00UcfqbCwUG3btlVKSordD/A1qZoYBgQEqGXLltbv5fz+++968skntXr1agUFBSkuLk6dO3eWpGof3rt3766IiAj98MMPuvzyy/XNN9/omWee0XvvvWd334MHD9aKFSt00003aenSpRowYID1Cl8lZ/ZfV9uk0x+gK9uWm5urp556SsuXL5fJZNJ5552nnj172tSdl5dn/dBtT+X362oqU7n+mWee0TPPPFNt/dGjR6stmz59uhITE3XllVfaXE
2r9Nhjj9lczevYsaP1w3dxcbEmT56sxYsXq7y8XGeffbZSU1PVpEmTav1VtR7p9FVHe2145ZVX9Mgjj1Qbn0cffVR
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(cleaned_df['manufacturingoutput'], cleaned_df['gdppercent'])\n",
|
|||
|
"plt.xlabel('Объем производства')\n",
|
|||
|
"plt.ylabel('ВВП')\n",
|
|||
|
"plt.title('Диаграмма рассеивания перед чисткой')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"Q1 = cleaned_df[\"manufacturingoutput\"].quantile(0.25)\n",
|
|||
|
"Q3 = cleaned_df[\"manufacturingoutput\"].quantile(0.75)\n",
|
|||
|
"\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"lower_bound = Q1 - threshold\n",
|
|||
|
"upper_bound = Q3 + threshold\n",
|
|||
|
"\n",
|
|||
|
"outliers = (cleaned_df[\"manufacturingoutput\"] < lower_bound) | (cleaned_df[\"manufacturingoutput\"] > upper_bound)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы в датасете:\")\n",
|
|||
|
"print(cleaned_df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Заменяем выбросы на медианные значения\n",
|
|||
|
"median_score = cleaned_df[\"manufacturingoutput\"].median()\n",
|
|||
|
"cleaned_df.loc[outliers, \"manufacturingoutput\"] = median_score\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(cleaned_df['manufacturingoutput'], cleaned_df['gdppercent'])\n",
|
|||
|
"plt.xlabel('Объем производства')\n",
|
|||
|
"plt.ylabel('ВВП')\n",
|
|||
|
"plt.title('Диаграмма рассеивания после чистки')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 140,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 131\n",
|
|||
|
"Размер контрольной выборки: 44\n",
|
|||
|
"Размер тестовой выборки: 44\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"train_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Видим недостаток баланса"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 143,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение ВВП в обучающей выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.02 30\n",
|
|||
|
" 0.04 25\n",
|
|||
|
" 0.03 21\n",
|
|||
|
" 0.01 13\n",
|
|||
|
" 0.07 8\n",
|
|||
|
" 0.08 8\n",
|
|||
|
" 0.05 7\n",
|
|||
|
"-0.01 5\n",
|
|||
|
" 0.11 2\n",
|
|||
|
" 0.09 2\n",
|
|||
|
"-0.02 2\n",
|
|||
|
" 0.10 2\n",
|
|||
|
"-0.03 1\n",
|
|||
|
" 0.14 1\n",
|
|||
|
"-0.10 1\n",
|
|||
|
" 0.06 1\n",
|
|||
|
"-0.05 1\n",
|
|||
|
"-0.04 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в контрольной выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.02 9\n",
|
|||
|
" 0.03 7\n",
|
|||
|
" 0.01 6\n",
|
|||
|
" 0.07 4\n",
|
|||
|
" 0.04 4\n",
|
|||
|
" 0.05 4\n",
|
|||
|
" 0.08 3\n",
|
|||
|
" 0.06 3\n",
|
|||
|
"-0.01 2\n",
|
|||
|
" 0.10 1\n",
|
|||
|
"-0.08 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в тестовой выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.02 12\n",
|
|||
|
" 0.03 8\n",
|
|||
|
" 0.01 7\n",
|
|||
|
" 0.05 5\n",
|
|||
|
" 0.04 3\n",
|
|||
|
" 0.08 3\n",
|
|||
|
"-0.01 2\n",
|
|||
|
"-0.05 1\n",
|
|||
|
" 0.06 1\n",
|
|||
|
" 0.13 1\n",
|
|||
|
" 0.07 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['gdppercent'].value_counts()\n",
|
|||
|
" print(f\"Распределение ВВП в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Также используем oversampling и undersampling"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 142,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение Дохода на душу населения в обучающей выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.08 37\n",
|
|||
|
" 0.07 33\n",
|
|||
|
"-0.04 31\n",
|
|||
|
" 0.02 30\n",
|
|||
|
"-0.10 27\n",
|
|||
|
" 0.04 25\n",
|
|||
|
"-0.05 25\n",
|
|||
|
"-0.03 21\n",
|
|||
|
" 0.03 21\n",
|
|||
|
" 0.01 13\n",
|
|||
|
" 0.11 11\n",
|
|||
|
" 0.09 11\n",
|
|||
|
" 0.10 7\n",
|
|||
|
" 0.05 7\n",
|
|||
|
"-0.01 5\n",
|
|||
|
" 0.14 5\n",
|
|||
|
"-0.02 2\n",
|
|||
|
" 0.06 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Дохода на душу населения в контрольной выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
"-0.08 24\n",
|
|||
|
" 0.02 9\n",
|
|||
|
" 0.07 7\n",
|
|||
|
" 0.03 7\n",
|
|||
|
" 0.01 6\n",
|
|||
|
" 0.05 5\n",
|
|||
|
" 0.04 4\n",
|
|||
|
" 0.06 4\n",
|
|||
|
" 0.08 3\n",
|
|||
|
"-0.01 2\n",
|
|||
|
" 0.10 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Дохода на душу населения в тестовой выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.08 26\n",
|
|||
|
"-0.01 22\n",
|
|||
|
"-0.05 14\n",
|
|||
|
" 0.02 12\n",
|
|||
|
" 0.03 8\n",
|
|||
|
" 0.01 7\n",
|
|||
|
" 0.05 5\n",
|
|||
|
" 0.07 5\n",
|
|||
|
" 0.13 5\n",
|
|||
|
" 0.04 3\n",
|
|||
|
" 0.06 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение Дохода на душу населения в обучающей выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.01 2\n",
|
|||
|
" 0.08 2\n",
|
|||
|
"-0.10 1\n",
|
|||
|
"-0.03 1\n",
|
|||
|
"-0.05 1\n",
|
|||
|
"-0.04 1\n",
|
|||
|
" 0.03 1\n",
|
|||
|
" 0.02 1\n",
|
|||
|
" 0.14 1\n",
|
|||
|
" 0.07 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Дохода на душу населения в контрольной выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
"-0.08 1\n",
|
|||
|
" 0.02 1\n",
|
|||
|
" 0.08 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Дохода на душу населения в тестовой выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
"-0.01 2\n",
|
|||
|
" 0.08 2\n",
|
|||
|
"-0.05 1\n",
|
|||
|
" 0.02 1\n",
|
|||
|
" 0.04 1\n",
|
|||
|
" 0.05 1\n",
|
|||
|
" 0.07 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"def binning(target, bins):\n",
|
|||
|
" return pd.cut(target, bins=bins, labels=False)\n",
|
|||
|
"\n",
|
|||
|
"train_df['gdppercent_binned'] = binning(train_df['gdppercent'], bins=3)\n",
|
|||
|
"val_df['gdppercent_binned'] = binning(val_df['gdppercent'], bins=3)\n",
|
|||
|
"test_df['gdppercent_binned'] = binning(test_df['gdppercent'], bins=3)\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df, target_column):\n",
|
|||
|
" X = df.drop(target_column, axis=1)\n",
|
|||
|
" y = df[target_column]\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" x_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1) \n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"def undersample(df, target_column):\n",
|
|||
|
" X = df.drop(target_column, axis=1)\n",
|
|||
|
" y = df[target_column]\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" x_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df, 'gdppercent_binned')\n",
|
|||
|
"val_df_oversampled = oversample(val_df, 'gdppercent_binned')\n",
|
|||
|
"test_df_oversampled = oversample(test_df, 'gdppercent_binned')\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df, 'gdppercent_binned')\n",
|
|||
|
"val_df_undersampled = undersample(val_df, 'gdppercent_binned')\n",
|
|||
|
"test_df_undersampled = undersample(test_df, 'gdppercent_binned')\n",
|
|||
|
"\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|