978 lines
271 KiB
Plaintext
978 lines
271 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Лабораторная 2"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Информация об экономике стран"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
|
|||
|
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
|
|||
|
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
|
|||
|
" 'tradebalance', 'USTreasury'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\".//static//scv//Economic Data - 9 Countries (1980-2020).csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Столбцы на русском:\n",
|
|||
|
"'stock index' - индекс акций\n",
|
|||
|
"'country' - страна\n",
|
|||
|
"'year'- год\n",
|
|||
|
"'index price' - индекс стоимости\n",
|
|||
|
"'log_indexprice' - индексная цена журнала\n",
|
|||
|
"'inflationrate' - ставка инфляции\n",
|
|||
|
"'oil prices' - цена на нефть\n",
|
|||
|
"'exchange_rate' - ставка обмена\n",
|
|||
|
"'gdppercent' - процент ВВП\n",
|
|||
|
"'percapitaincome' - доход на душу населения\n",
|
|||
|
"'unemploymentrate' - уровень безработицы\n",
|
|||
|
"'manufacturingoutput' - объем производства\n",
|
|||
|
"'tradebalance' - торговый баланс\n",
|
|||
|
"'USTreasury' - UST казначейство"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 369 entries, 0 to 368\n",
|
|||
|
"Data columns (total 14 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 stock index 369 non-null object \n",
|
|||
|
" 1 country 369 non-null object \n",
|
|||
|
" 2 year 369 non-null float64\n",
|
|||
|
" 3 index price 317 non-null float64\n",
|
|||
|
" 4 log_indexprice 369 non-null float64\n",
|
|||
|
" 5 inflationrate 326 non-null float64\n",
|
|||
|
" 6 oil prices 369 non-null float64\n",
|
|||
|
" 7 exchange_rate 367 non-null float64\n",
|
|||
|
" 8 gdppercent 350 non-null float64\n",
|
|||
|
" 9 percapitaincome 368 non-null float64\n",
|
|||
|
" 10 unemploymentrate 348 non-null float64\n",
|
|||
|
" 11 manufacturingoutput 278 non-null float64\n",
|
|||
|
" 12 tradebalance 365 non-null float64\n",
|
|||
|
" 13 USTreasury 369 non-null float64\n",
|
|||
|
"dtypes: float64(12), object(2)\n",
|
|||
|
"memory usage: 40.5+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>stock index</th>\n",
|
|||
|
" <th>country</th>\n",
|
|||
|
" <th>year</th>\n",
|
|||
|
" <th>index price</th>\n",
|
|||
|
" <th>log_indexprice</th>\n",
|
|||
|
" <th>inflationrate</th>\n",
|
|||
|
" <th>oil prices</th>\n",
|
|||
|
" <th>exchange_rate</th>\n",
|
|||
|
" <th>gdppercent</th>\n",
|
|||
|
" <th>percapitaincome</th>\n",
|
|||
|
" <th>unemploymentrate</th>\n",
|
|||
|
" <th>manufacturingoutput</th>\n",
|
|||
|
" <th>tradebalance</th>\n",
|
|||
|
" <th>USTreasury</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1980.0</td>\n",
|
|||
|
" <td>168.61</td>\n",
|
|||
|
" <td>2.23</td>\n",
|
|||
|
" <td>0.14</td>\n",
|
|||
|
" <td>21.59</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.09</td>\n",
|
|||
|
" <td>12575.0</td>\n",
|
|||
|
" <td>0.07</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-13.06</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1981.0</td>\n",
|
|||
|
" <td>203.15</td>\n",
|
|||
|
" <td>2.31</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>31.77</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.12</td>\n",
|
|||
|
" <td>13976.0</td>\n",
|
|||
|
" <td>0.08</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-12.52</td>\n",
|
|||
|
" <td>0.14</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1982.0</td>\n",
|
|||
|
" <td>188.98</td>\n",
|
|||
|
" <td>2.28</td>\n",
|
|||
|
" <td>0.06</td>\n",
|
|||
|
" <td>28.52</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.04</td>\n",
|
|||
|
" <td>14434.0</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-19.97</td>\n",
|
|||
|
" <td>0.13</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1983.0</td>\n",
|
|||
|
" <td>285.43</td>\n",
|
|||
|
" <td>2.46</td>\n",
|
|||
|
" <td>0.03</td>\n",
|
|||
|
" <td>26.19</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.09</td>\n",
|
|||
|
" <td>15544.0</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-51.64</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1984.0</td>\n",
|
|||
|
" <td>248.89</td>\n",
|
|||
|
" <td>2.40</td>\n",
|
|||
|
" <td>0.04</td>\n",
|
|||
|
" <td>25.88</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" <td>17121.0</td>\n",
|
|||
|
" <td>0.08</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-102.73</td>\n",
|
|||
|
" <td>0.12</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" stock index country year index price log_indexprice \\\n",
|
|||
|
"0 NASDAQ United States of America 1980.0 168.61 2.23 \n",
|
|||
|
"1 NASDAQ United States of America 1981.0 203.15 2.31 \n",
|
|||
|
"2 NASDAQ United States of America 1982.0 188.98 2.28 \n",
|
|||
|
"3 NASDAQ United States of America 1983.0 285.43 2.46 \n",
|
|||
|
"4 NASDAQ United States of America 1984.0 248.89 2.40 \n",
|
|||
|
"\n",
|
|||
|
" inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n",
|
|||
|
"0 0.14 21.59 1.0 0.09 12575.0 \n",
|
|||
|
"1 0.10 31.77 1.0 0.12 13976.0 \n",
|
|||
|
"2 0.06 28.52 1.0 0.04 14434.0 \n",
|
|||
|
"3 0.03 26.19 1.0 0.09 15544.0 \n",
|
|||
|
"4 0.04 25.88 1.0 0.11 17121.0 \n",
|
|||
|
"\n",
|
|||
|
" unemploymentrate manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"0 0.07 NaN -13.06 0.11 \n",
|
|||
|
"1 0.08 NaN -12.52 0.14 \n",
|
|||
|
"2 0.10 NaN -19.97 0.13 \n",
|
|||
|
"3 0.10 NaN -51.64 0.11 \n",
|
|||
|
"4 0.08 NaN -102.73 0.12 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Объект наблюдения - экономика\n",
|
|||
|
"Атрибуты - содержит набор информации об обучении, такие как:\n",
|
|||
|
"Фондовый рынок, ВВП, страна, год, стоимость топлива, уровень инфлции,уровень безработицы и так далее"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2kAAAIjCAYAAACZPFMYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxdVb3//9cezpjkZB7apk3nmbZQBEqZZKqKKIoDytWKiMqFq1B/Dtyvouj1euUqoBeUrxPFr3oVUJBJoCAgSKHQFih0noc0aeaT4Ux7+P2RNhCaDjknbZL2/fQRafZe2euzz276OJ+z1vosw/d9HxERERERERkSzMEOQERERERERN6iJE1ERERERGQIUZImIiIiIiIyhChJExERERERGUKUpImIiIiIiAwhStJERERERESGECVpIiIiIiIiQ4iSNBERERERkSFESZqIiIiIiMgQoiRNRERERERkCFGSJiIix4TFixdjGAavvPJKn+fPOeccZs6ceZSjOvJ+/vOf89GPfpQxY8ZgGAaf+cxnBjskERHJkT3YAYiIiEj2fvjDH9Le3s4pp5zC7t27BzscEREZAErSREREhrFnn322ZxQtPz9/sMMREZEBoOmOIiJy3HIch+9973tMmDCBUCjE2LFj+fd//3dSqVSvdmPHjsUwDK677rr9rrFgwQIMw+D9739/r+OpVIpvf/vbTJw4kVAoxOjRo/na176237UNw+Daa6/l97//PVOmTCEcDjN37lz+8Y9/HNY91NTUYBhG/25cRESGNI2kiYjIMaWtrY3Gxsb9jmcymf2Ofe5zn+Puu+/mIx/5CF/5yld46aWX+MEPfsCaNWu4//77e7UNh8P8/ve/57//+78JBAIA7Ny5k6eeeopwONyrred5fOADH+D555/n85//PNOmTWPVqlXceuutrF+/ngceeKBX+2effZY//elPfOlLXyIUCvGzn/2M97znPSxbtuyYXEcnIiIHpyRNRESOKeeff/4Bz82YMaPnz6+99hp33303n/vc5/jlL38JwL/+679SUVHBj370I55++mne/e5397Q/88wzWblyJQ8++CCXXnop0F2s5NRTT2XXrl29+vnDH/7Ak08+ybPPPssZZ5zRc3zmzJl88Ytf5IUXXuD000/vOf7GG2/wyiuvMHfuXAAuu+wypkyZwo033shf/vKXHF4NEREZjjTdUUREjil33HEHS5Ys2e9r1qxZvdo9+uijACxatKjX8a985SsAPPLII72OB4NBLr/8cu66666eY4sXL+aKK67YL4Z7772XadOmMXXqVBobG3u+zj33XACefvrpXu3nzZvXk6ABjBkzhg9+8IM8/vjjuK7b35dARESGOY2kiYjIMeWUU07h5JNP3u94cXFxr2mQ27ZtwzRNJk6c2KtdVVUVRUVFbNu2bb9rXHHFFcydO5fdu3ezfv16du/ezcc+9jH+4z/+o1e7DRs2sGbNGsrLy/uMcc+ePb2+nzRp0n5tJk+eTFdXFw0NDVRVVR34hkVE5JijJE1ERI5r/Sm6MXv2bGbPns1vf/tb1qxZw6WXXkosFtuvned5nHDCCdxyyy19Xmf06NFZxysiIsc+JWkiInJcqqmpwfM8NmzYwLRp03qO19fX09raSk1NTZ8/99nPfpZbb72Vuro6HnrooT7bTJgwgddee43zzjvvsJLADRs27Hds/fr1RKPRA47GiYjIsUtr0kRE5Lj0vve9D4Dbbrut1/F9o18XXXRRnz/3yU9+kl27dlFRUcE555zTZ5uPfexj7Nq1q6cgydslEgk6Ozt7HVu6dCkrVqzo+X7Hjh389a9/5cILL8SyrMO9JREROUZoJE1ERI5Ls2fPZuHChfziF7+gtbWVs88+m2XLlnH33XdzySWX9Krs+HbFxcXs3r0by7IOOEr2qU99invuuYcvfvGLPP3008yfPx/XdVm7di333HMPjz/+eK91czNnzmTBggW9Sv
AD3HTTTYe8j4ceeojXXnsN6N5m4PXXX+9ZI/eBD3xgv4IpIiIy9ClJExGR49avfvUrxo8fz+LFi7n//vupqqrihhtu4Nvf/vZBf66oqOig503T5IEHHuDWW2/lt7/9Lffffz/RaJTx48fz5S9/mcmTJ/dqf/bZZzNv3jxuuukmtm/fzvTp01m8ePFhJVh//vOfufvuu3u+X7lyJStXrgSgurpaSZqIyDBk+L7vD3YQIiIixyvDMLjmmmu4/fbbBzsUEREZIrQmTUREREREZAhRkiYiIiIiIjKEKEkTEREREREZQlQ4REREZBBpabiIiLyTRtJERERERESGECVpIiIiIiIiQ4imOw4Qz/Oora2loKDggJubioiIiIjIsc/3fdrb2xk5ciSm2f9xMSVpA6S2tpbRo0cPdhgiIiIiIjJE7Nixg+rq6n7/nJK0AVJQUAB0P4hYLHbAdplMhieeeIILL7yQQCBwtMKTQabnfnzScz8+6bkff/TMj0967senw33u8Xic0aNH9+QI/aUkbYDsm+IYi8UOmaRFo1FisZh+oY8jeu7HJz3345Oe+/FHz/z4pOd+fOrvc892GZQKh4iIiIiIiAwhStJERERERESGECVpIiIiIiIiQ4iSNBERERERkSFESZqIiIiIiMgQoiRNRERERERkCFGSJiIiIiIiMoQoSRMRERERERlClKSJiIiIiIgMIUrSREREREREhhAlaSIiIiIiIkOIkjQREREREZEhREmaiBw230/gey34fmawQxERERE5ZtmDHYCIDH2+sx0/9RSkXwHfATMGoTMgdB6GmT/Y4YmIiIgcU5SkichB+Zn1+B13gFcHRjEYYfCa8Lv+FzJvQP6/YZixwQ5TRERE5Jih6Y4ickC+7+B3/Q68PWBNBKu8exTNGgXWGMi8jp98fLDDFBERETmmKEkTkQNz1oK7tTspM97xz4URAiMGqX/ie12DEp6IiIjIsUhJmogcmNsAfgaMSN/njQLw28FvObpxiYiIiBzDlKSJyIEZwe7/+u4BGmQACwgdpYBEREREjn1K0kTkwALTwCwGr2n/c74PXiMEpoBZevRjExERETlGKUkTkQMyzBIIndc9pdFtAN/rPuFnwN0ORgFGeAGGYQxuoCIiIiLHEJXgF5GDMiIfwMeB1NPgbt53FKwRGJGPYQRmDmp8IiIiIscaJWkiclCGEcCIfhw/9O7ufdH8BJglEJiNYUYHOzwRERGRY46SNBE5LIZVAda5gx2GiIiIyDFPSZrIMOe5dWSSz+GmX8YnhWVPxA6diRWYrbViIiIiIsOQkjSRYczNrCHZfjueuwvDyAPDJpN6Bie1lEDkAwSjH1eiJiIiIjLMKEkTGaZ8L0Gy4xd4bh2mPRnDeKtYq+c2kUk8gGWPww6dOohRioiIiEh/KUkTGaaczHI8dzumPbZXggZgWqW4mWYyqWcHJEnzvQSZ9DLS6Zfx/TYsaxSB4OnYgZn79S0iIiIiuVGSJjJMec5O8H0MI9DnecMswnU24vsOhpH9r7rnNtPV/hMcZxU+YBDESa8inXyGYPgCInkLc7q+iIiIiPSmd1Yiw5VhAP5BGngY2ED2a9J83yfR+RsymVex7LEYRuitq3stpJOPYtmjCYUvzLoPEREREelN85REhinLnoxhBPH9xH7nfN/H91qxgidhGFbWfbjuFjKZVzGtql4JGoBpFgM26eQSfN/Jug8RERER6U1JmsgwZQVOwAxMw3O24fvJnuO+7+G7OzDMQgKhc3Lqw3W24nsdGEasz/OGWYLr7sbz6nPqR0RERETeoumOIsOUYdiE868m2X47rrMGfBcMC3wHwyonlLcQKzBlIHoaoDYiIiIicjiUpIkMY6ZVSaTwm7jpV3Eyb4KfxrSrsYOnYlrlOV/ftsdjmHn4fhuGUbTfed9rwrLHYZoVOfclIiIiIt2UpIkMc4YRwg6dekT2QzOtGgKBk0innsEwwhhGuOec5zXj4xIMn6/qji
IiIiIDSO+sROSADMMgkv8ZfL+DTObVnpL/vp/CMPIIhT9AMHTeYIcpIiIickxRkiYiB2WaJeTFvkYmvZxM6hV8vwP
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"\n",
|
|||
|
"plt.scatter(df['inflationrate'], df['percapitaincome'], c=df['percapitaincome'], alpha=0.6)\n",
|
|||
|
"\n",
|
|||
|
"plt.title(\"Номер 1\")\n",
|
|||
|
"plt.ylabel(\"Доход на душу населения\")\n",
|
|||
|
"plt.xlabel(\"Уровень инфляции\")\n",
|
|||
|
"plt.grid(visible='true')\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/IAAAIjCAYAAACgdyAGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACncElEQVR4nOzdd3xT9f4/8NdJ0jTdK3QDpQyhFMpGQMCrKKDABRXQK4rgj696RVGUe8UBTkBFLiooinugKKLiAEWWshVklLJp6aArHelu2uT8/khPSqGFpiQ9ycnr+Xj0IU1PTt4FBN55vz/vtyCKoggiIiIiIiIicgsquQMgIiIiIiIiouZjIk9ERERERETkRpjIExEREREREbkRJvJEREREREREboSJPBEREREREZEbYSJPRERERERE5EaYyBMRERERERG5ESbyRERERERERG6EiTwRERERERGRG2EiT0RERERERORGmMgTERHJZM2aNRAEodGPxMREucPzWH/++SdmzpyJ7t27w8/PD+3atcOkSZNw4sQJuUMjIiICAGjkDoCIiMjTPfnkk+jWrZvt85deeknGaOjll1/Gjh07MHHiRPTs2RM5OTlYtmwZ+vTpg927d/NNFiIikp0giqIodxBERESeaM2aNZg4cSK2bNmCa6+91vb4tddeC4PBgOTkZPmC82A7d+5Ev379oNVqbY+dPHkSPXr0wG233YbPPvtMxuiIiIjYWk9ERCQbk8kEAFCpLv/X8UcffQRBEJCWlmZ7zGKxoGfPnhAEAR999JHt8UOHDuGee+5BfHw8dDodIiMjMX36dBQUFDS457PPPttoW79GU9+wd+211yIxMRH79u3D4MGD4ePjgw4dOmDFihUXfS/z5s1D3759ERQUBD8/PwwdOhRbtmxpcF1aWprtdb777rsGX6uqqkJISAgEQcDixYsvijM8PBw1NTUNnvPFF1/Y7mcwGGyPf//997j55psRHR0Nb29vdOzYES+88ALMZvNlf64HDx7cIIkHgM6dO6N79+44evToZZ9PRETkbGytJyIikomUyHt7e7fo+Z9++ikOHz580eMbN27EmTNnMG3aNERGRuLIkSN49913ceTIEezevRuCIDS4/u2334a/v7/t8wvfWCgqKsJNN92ESZMm4Y477sBXX32FBx54AFqtFtOnTwcAlJSU4L333sMdd9yBGTNmoLS0FO+//z5GjhyJvXv3olevXg3uqdPp8OGHH2L8+PG2x9auXYuqqqomv9/S0lL8+OOPmDBhgu2xDz/8EDqd7qLnffTRR/D398fs2bPh7++PzZs3Y968eSgpKcGrr77a5Gs0RRRF5Obmonv37nY/l4iIyNGYyBMREcnEaDQCAHx8fOx+bnV1NebNm4fRo0dj/fr1Db7273//G4899liDx66++mrccccd2L59O4YOHdrga7fddhv0en2Tr3Xu3Dm89tprmD17NgDgvvvuw8CBAzF37lzcdddd8PLyQkhICNLS0hpUsmfMmIGuXbvizTffxPvvv9/gnhMmTMDXX3+N3NxcREREAAA++OAD3HLLLVi1alWjcUyYMAEffPCBLZFPT0/Hpk2bMHnyZHzxxRcNrl21alWDn9f7778f999/P9566y28+OKLdr958vnnnyMrKwvPP/+8Xc8jIiJyBrbWExERyURqdW/Tpo3dz12+fDkKCgowf/78i752fgJbVVUFg8GAq6++GgCwf/9+u19Lo9Hgvvvus32u1Wpx3333IS8vD/v27QMAqNVqWxJvsVhQWFiI2tpa9OvXr9HX7NOnD7p3745PP/0UAHD27Fls2bIF99xzT5NxTJ8+HRs2bEBOTg4A4OOPP8agQYPQpUuXi649/+egtLQUBoMBQ4cORUVFBY4dO2bX93/s2DE8+OCDGDRoEKZOnWrXc4mIiJyBiTwREZFMzp49C41GY3cibzQasWDBAsyePdtWzT5fYWEhZs2ahYiICPj4+K
BNmzbo0KGD7bn2io6Ohp+fX4PHpOT5/DP7H3/8MXr27AmdToewsDC0adMGP/30U5OvOW3aNHz44YcArK3wgwcPRufOnZuMo1evXkhMTMQnn3wCURTx0UcfYdq0aY1ee+TIEUyYMAFBQUEIDAxEmzZtMGXKFAD2/Rzk5OTg5ptvRlBQENasWQO1Wt3s5xIRETkLE3kiIiKZHD9+HPHx8Q2GyzXHyy+/DJVKhTlz5jT69UmTJmHlypW4//77sXbtWvz666/YsGEDAGu13Bk+++wz3HPPPejYsSPef/99bNiwARs3bsR1113X5GtOmTIFp06dwu7du/Hxxx83mZSfb/r06fjwww+xbds25OTkYNKkSRddU1xcjOHDh+PgwYN4/vnn8cMPP2Djxo14+eWXATT/58BoNGL06NEoLi7Ghg0bEB0d3aznERERORvPyBMREcmguroaBw4caDDsrTnOnTuH119/HQsXLkRAQMBFk+iLioqwadMmPPfcc5g3b57t8ZMnT7Y41nPnzqG8vLxBVf7EiRMAgLi4OADWVXrx8fFYu3Ztg2F6jbX+S8LCwjBu3Dhbm/6kSZMaTJ5vzJ133ok5c+Zg1qxZuO222xAQEHDRNVu3bkVBQQHWrl2LYcOG2R5PTU1t1vcLWI8kjB07FidOnMBvv/2GhISEZj+XiIjI2ViRJyIiksGqVatQXV2N66+/3q7nPffcc4iIiMD999/f6Nel1m9RFBs8vnTp0hbFCQC1tbV45513bJ+bTCa88847aNOmDfr27dvk6+7Zswe7du265L2nT5+OQ4cOYeLEiQ0m5zclNDQU//znP3Ho0CHbxPwLNRaLyWTCW2+9ddn7A4DZbMbkyZOxa9cufP311xg0aFCznkdERNRaWJEnIiJqReXl5XjzzTfx/PPPQ61WQxRFfPbZZw2uyc3NRVlZGT777DPccMMNDc7B//rrr/j8888v2nMuCQwMxLBhw/DKK6+gpqYGMTEx+PXXX+2qRl8oOjoaL7/8MtLS0tClSxesXr0aBw4cwLvvvgsvLy8AwJgxY7B27VpMmDABN998M1JTU7FixQokJCSgrKysyXuPGjUK+fn5zUriJR999BGWL1/e5KT9wYMHIyQkBFOnTsXDDz8MQRDw6aefXvTmRlMee+wxrFu3DmPHjkVhYeFFvz7SWXsiIiK5MJEnIiJqRfn5+Zg7d67t8/OnwV/orrvuwpYtWxok8r169cIdd9xxyddYtWoVHnroISxfvhyiKOLGG2/E+vXrW3zGOyQkBB9//DEeeughrFy5EhEREVi2bBlmzJhhu+aee+5BTk4O3nnnHfzyyy9ISEjAZ599hq+//hpbt25t8t6CIFxy9V1jfHx8LrmyLywsDD/++CMee+wxPP300wgJCcGUKVNw/fXXY+TIkZe9/4EDBwAAP/zwA3744YeLvs5EnoiI5CaIzX17moiIiK5YWloaOnTogC1btuDaa6+94uuc7dprr4XBYEBycrJsMRAREVFDPCNPRERERERE5EaYyBMREbUif39/3HnnnY3uf2/JdUREROR52FpPRERETWJrPRERkethIk9ERERERETkRthaT0RERERERORGmMgTERERERERuRHukW+ExWLBuXPnEBAQAEEQ5A6HiIiIiIiIFE4URZSWliI6Ohoq1aVr7kzkG3Hu3Dm0bdtW7jCIiIiIiIjIw2RkZCA2NvaS1zCRb0RAQAAA609gYGCgzNEQERERERGR0pWUlKBt27a2fPRSmMg3QmqnDwwMZCJPREREREREraY5x7s57I6IiIiIiIjIjTCRJyIiIiIiInIjTOSJiIiIiIiI3AgTeSIiIiIiIiI3wkSeiIiIiIiIyI0wkSciIiIiIiJyI0zkiYiIiIiIiNwIE3kiIiIiIiIiN8JEnoiIiIiIiMiNMJEnIiIiIiIiciNM5ImIiIiIiIjcCBN5IiIiIiIiIjfCRJ6IiIiIiIjIjTCRJyIiIiIiInIjsifyy5
cvR1xcHHQ6HQYOHIi9e/c2ee2RI0dw6623Ii4uDoIgYOnSpY1el5WVhSlTpiAsLAw+Pj7o0aMH/vrrLyd9B0RERER
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"year_condition = df.groupby('gdppercent')['unemploymentrate'].mean().reset_index()\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(12, 6))\n",
|
|||
|
"\n",
|
|||
|
"plt.plot(year_condition['gdppercent'], year_condition['unemploymentrate'], marker='.')\n",
|
|||
|
"\n",
|
|||
|
"plt.title(\"Диаграмма 2\")\n",
|
|||
|
"plt.xlabel(\"GDP percent\")\n",
|
|||
|
"plt.ylabel(\"Unemployent Rate\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Присутствует связь между атрибутами, уровень инфляции влияет и зависит от многих атрибутов.\n",
|
|||
|
"Для примера на графике приведена связь между инфляцией и доходом на душу населения. На втором графике показана связь уровня ВВП и безработицы\n",
|
|||
|
"Примеры бизнес целей\n",
|
|||
|
"\n",
|
|||
|
" 1.Прогнозирование уровня инфляции на основе уровня ВВП.\n",
|
|||
|
" 2.Наблюдение за изменениями уровня безработицы с уровнем ВВП.\n",
|
|||
|
" \n",
|
|||
|
"Эффект для бизнеса: влияние на инвестиции индекса акций и цен на нефть, исследование влияния фондового индекса на инвестиции, исследования инфляции и покупательской способности.\n",
|
|||
|
"Цели технического проекта\n",
|
|||
|
"\n",
|
|||
|
"Для первой цели:\n",
|
|||
|
"\n",
|
|||
|
"Вход: Доход на душу населения\n",
|
|||
|
"Целевой признак: Уровень инфляции.\n",
|
|||
|
"\n",
|
|||
|
"Для второй цели:\n",
|
|||
|
"\n",
|
|||
|
"Вход: Уровень безработицы\n",
|
|||
|
"Целевой признак: Уровень ВВП"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на выбросы"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пропущенные значения по столбцам:\n",
|
|||
|
"stock index 0\n",
|
|||
|
"country 0\n",
|
|||
|
"year 0\n",
|
|||
|
"index price 52\n",
|
|||
|
"log_indexprice 0\n",
|
|||
|
"inflationrate 43\n",
|
|||
|
"oil prices 0\n",
|
|||
|
"exchange_rate 2\n",
|
|||
|
"gdppercent 19\n",
|
|||
|
"percapitaincome 1\n",
|
|||
|
"unemploymentrate 21\n",
|
|||
|
"manufacturingoutput 91\n",
|
|||
|
"tradebalance 4\n",
|
|||
|
"USTreasury 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n",
|
|||
|
" year index price log_indexprice inflationrate oil prices \\\n",
|
|||
|
"count 369.000000 317.000000 369.000000 326.000000 369.000000 \n",
|
|||
|
"mean 2000.000000 7898.648297 3.610542 0.041748 39.743171 \n",
|
|||
|
"std 11.848225 7811.336862 0.482481 0.039579 25.452654 \n",
|
|||
|
"min 1980.000000 168.610000 2.230000 -0.040000 11.350000 \n",
|
|||
|
"25% 1990.000000 2407.100000 3.320000 0.020000 19.410000 \n",
|
|||
|
"50% 2000.000000 5160.100000 3.600000 0.030000 28.520000 \n",
|
|||
|
"75% 2010.000000 10279.500000 3.980000 0.057500 57.880000 \n",
|
|||
|
"max 2020.000000 47751.330000 4.680000 0.240000 98.560000 \n",
|
|||
|
"\n",
|
|||
|
" exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
|
|||
|
"count 367.000000 350.000000 368.000000 348.000000 \n",
|
|||
|
"mean 27.897548 0.037114 20719.964674 0.068908 \n",
|
|||
|
"std 49.620521 0.037850 17435.037783 0.043207 \n",
|
|||
|
"min 0.900000 -0.110000 27.000000 0.020000 \n",
|
|||
|
"25% 1.330000 0.020000 2090.250000 0.040000 \n",
|
|||
|
"50% 5.440000 0.030000 19969.500000 0.060000 \n",
|
|||
|
"75% 15.055000 0.060000 36384.000000 0.090000 \n",
|
|||
|
"max 249.050000 0.150000 65280.000000 0.260000 \n",
|
|||
|
"\n",
|
|||
|
" manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"count 278.000000 365.000000 369.000000 \n",
|
|||
|
"mean 328.084820 -15.996384 0.059024 \n",
|
|||
|
"std 622.395923 154.557170 0.033086 \n",
|
|||
|
"min 0.590000 -770.930000 0.010000 \n",
|
|||
|
"25% 80.380000 -25.370000 0.030000 \n",
|
|||
|
"50% 188.160000 -0.140000 0.050000 \n",
|
|||
|
"75% 271.977500 19.080000 0.080000 \n",
|
|||
|
"max 3868.460000 366.140000 0.140000 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"null_values = df.isnull().sum()\n",
|
|||
|
"print(\"Пропущенные значения по столбцам:\")\n",
|
|||
|
"print(null_values)\n",
|
|||
|
"\n",
|
|||
|
"stat_summary = df.describe()\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"print(stat_summary)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"На основе данных выше можно выделить большое количество столбцов с пропущенными значениями\n",
|
|||
|
"Также проверим данные на выбросы и дубликаты:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'year': 0.0\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'index price': 1.7605604508668822\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'log_indexprice': -0.23716751168770417\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'inflationrate': 1.5616085380027898\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'oil prices': 0.9915046764713877\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'exchange_rate': 2.1575952097650455\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'gdppercent': -0.038272329611460466\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'percapitaincome': 0.3051430219264069\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'unemploymentrate': 1.8092896369785585\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'manufacturingoutput': 4.195480293406057\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'tradebalance': -2.266183907194849\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'USTreasury': 0.6687596580836408\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 0\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"for column in df.select_dtypes(include=[np.number]).columns:\n",
|
|||
|
" skewness = df[column].skew()\n",
|
|||
|
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")\n",
|
|||
|
"\n",
|
|||
|
"duplicates = df.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"На основе данных выше можно сказать, что для столбца объем производства присутствует выброс.\n",
|
|||
|
"Удаляем все найденные пустые значения."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"В наборе данных 'Economic' было удалено 150 строк с пустыми значениями.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def drop_missing_values(dataframe, name):\n",
|
|||
|
" before_shape = dataframe.shape \n",
|
|||
|
" cleaned_dataframe = dataframe.dropna() \n",
|
|||
|
" after_shape = cleaned_dataframe.shape \n",
|
|||
|
" print(f\"В наборе данных '{name}' было удалено {before_shape[0] - after_shape[0]} строк с пустыми значениями.\")\n",
|
|||
|
" return cleaned_dataframe\n",
|
|||
|
"\n",
|
|||
|
"cleaned_df = drop_missing_values(df, \"Economic\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Очистка данных от шумов:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2gAAAIjCAYAAAB2/jgmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAByTElEQVR4nO3deXxU1f3/8fdMQhKWZJIAyQQaICyKkc2AQFCECpaIgtTWhYIiWlBcqnUFW42xtWipuxaU/gQV91b9itooRVGxwSiLGgJUMAGUhEACSViSQOb8/kgzZcgkmaxzJ3k9H495wNx77rmfc+bO5H7m3jnHZowxAgAAAAD4nd3fAQAAAAAAqpCgAQAAAIBFkKABAAAAgEWQoAEAAACARZCgAQAAAIBFkKABAAAAgEWQoAEAAACARZCgAQAAAIBFkKABAAAAgEWQoAEAALRhP/zwg5YvX+5+npubq5deesl/AQGoEwkaAEv7+9//LpvN5vUxaNAgf4cHAJZns9l0ww036IMPPlBubq7uvPNOffbZZ/4OC0Atgv0dAAD44u6779Zpp53mfv7AAw/4MRoACBw9e/bUnDlzlJKSIkmKi4vTmjVr/BsUgFrZjDHG30EAQG3+/ve/65JLLtHHH3+s8ePHu5ePHz9e+/fvV1ZWlv+CA4AAsmPHDu3fv1+DBg1S586d/R0OgFpwiyMAS6uoqJAk2e31f1wtX75cNptNubm57mUul0tDhgyRzWbz+A3GN998o6uuukp9+/ZVWFiYnE6nrr76ahUWFnrUed9993m9vTI4+H83IIwfP16DBg3S+vXrNWbMGHXs2FEJCQlasmRJjbbce++9Gj58uBwOhzp37qyxY8fq448/9iiXm5vr3s/bb7/tsa6srExRUVGy2Wz6y1/+UiPOmJgYHTt2zGObV155xV3f/v373cv/7//+TxdccIF69Oih0NBQ9evXT3/4wx9UWVlZb19X72/r1q269NJLFRERoa5du+rmm29WWVmZR9lly5bp3HPPVUxMjEJDQ5WYmKjFixd7rfef//ynxo0bp/DwcEVEROjMM8/Uyy+/7FHmiy++0OTJkxUVFaXOnTtryJAhevzxxz3KbN26Vb/85S8VHR2tsLAwjRgxQu+8845HmYYcL1dddZXH6x8VFaXx48fXuE3M1z6tPmZO9pe//KVGTH369NFVV13lUe6NN96QzWZTnz59PJYXFBTommuuUa9evRQUFOSOt0uXLjX2dbI+ffrUejuxzWarUX7FihUaPny4OnbsqOjoaF1++eXavXu313bW996QpPLycqWmpqp///4KDQ1VfHy87rzzTpWXl9cou2bNGp/jPFn1seut/Sf2c0OOD0nu90L37t3VsWNHnXrqqfrd737nsc+6HtVXtMaPH+/xZZRUdceA3W6v8V5444033K9Bt27dNHPmTP34448eZa666ir3cdKvXz+NGjVKRUVF6tixY432AbAGbnEEYGnVCVpoaGijtn/xxRf17bff1li+atUqff/995o9e7acTqc2b96sZ599Vps3b9a6detqnMAtXrzY4yT35ITxwIEDmjx5si699FJNnz5dr7/+uubNm6eQkBBdffXVkqSSkhL97W9/0/Tp0zVnzhyVlpbq//2//6dJkyYpMzNTw4YN86gzLCxMy5Yt07Rp09zL3nzzzRoJ0IlKS0v17rvv6uc//7l72bJlyxQWFlZju+XLl6tLly669dZb1aVLF3300Ue69957VVJSokWLFtW6jxNdeuml6tOnjxYuXKh169bpiSee0IEDB/TCCy949N3pp5+uqVOnKjg4WCtXrtT1118vl8ulG264wSOeq6++WqeffroWLFigyMhIbdy4Uenp6frVr34lqep1u/DCCxUXF6ebb75ZTqdTW7Zs0bvvvqubb75ZkrR582adddZZ6tmzp+bPn6/OnTvr9ddf17Rp0/SPf/zDo29OVtvxIkndunXTo48+Kqlq0I
XHH39ckydP1u7duxUZGdlsfVqf48ePu0/8TzZr1iz961//0k033aShQ4cqKChIzz77rDZs2OBT3cOGDdNtt93mseyFF17QqlWrPJY98MADuueee3TppZfq17/+tfbt26cnn3xS55xzjjZu3OjuD8m394bL5dLUqVO1du1azZ07V6eddpq+/fZbPfroo/rPf/5T44uKar/5zW905pln1hpnc6vt+Pjmm280duxYdejQQXPnzlWfPn20Y8cOrVy5Ug888IAuvvhi9e/f313+t7/9rU477TTNnTvXvezEW7hPtGzZMv3+97/Xww8/7H4fSFXH2uzZs3XmmWdq4cKF2rt3rx5//HF9/vnnNV6Dk9177711fo4A8DMDABb22GOPGUnm66+/9lg+btw4c/rpp3ssW7ZsmZFkcnJyjDHGlJWVmV69epnzzz/fSDLLli1zlz1y5EiNfb3yyitGkvn000/dy1JTU40ks2/fvlpjHDdunJFkHn74Yfey8vJyM2zYMBMTE2MqKiqMMcYcP37clJeXe2x74MABExsba66++mr3spycHCPJTJ8+3QQHB5v8/Hz3ugkTJphf/epXRpJZtGhRjTinT59uLrzwQvfynTt3GrvdbqZPn16jHd764NprrzWdOnUyZWVltbb3xP1NnTrVY/n1119f4/Xytp9JkyaZvn37up8fPHjQhIeHm1GjRpmjR496lHW5XMaYqv5LSEgwvXv3NgcOHPBaxpiqPho8eLBHG1wulxkzZowZMGCAe1lDjpdZs2aZ3r17e+zz2WefNZJMZmZmnW311qfejl9jjFm0aJFHTMYY07t3bzNr1iz387/+9a8mNDTU/PSnP/WI6ejRo8Zut5trr73Wo85Zs2aZzp0719jXyXr37m0uuOCCGstvuOEGc+LpQm5urgkKCjIPPPCAR7lvv/3WBAcHeyz39b3x4osvGrvdbj777DOPOpcsWWIkmc8//9xj+Ycffmgkmb///e+1xlmbtLQ0I8njmKlu/4n93JDj45xzzjHh4eFm586dHnWevI/a9nWicePGmXHjxhljjHnvvfdMcHCwue222zzKVFRUmJiYGDNo0CCP98u7775rJJl7773XvezkYzcrK8vY7XZ3O0481gBYA7c4ArC06lsOu3fv3uBtn376aRUWFio1NbXGuo4dO7r/X1ZWpv3792v06NGS5PPVhhMFBwfr2muvdT8PCQnRtddeq4KCAq1fv16SFBQUpJCQEElVVwyKiop0/PhxjRgxwus+k5KSdPrpp+vFF1+UJO3cuVMff/xxjdvdTnT11VcrPT1d+fn5kqTnn39eycnJOuWUU2qUPbEPSktLtX//fo0dO1ZHjhzR1q1bfWr3iVfAJOmmm26SJL3//vte91NcXKz9+/dr3Lhx+v7771VcXCyp6spYaWmp5s+fr7CwMI86q69mbty4UTk5ObrllltqXB2oLlNUVKSPPvpIl156qbtN+/fvV2FhoSZNmqTvvvuuxi1g1eo6XqSq16y6vk2bNumFF15QXFycx5WPhvRpZWWlu77qx5EjR7zuu9qRI0d0//3368Ybb1SvXr081h0+fFgul0tdu3ats46mevPNN+VyuXTppZd6xO50OjVgwIAat+z68t544403dNppp2ngwIEedZ577rmSVKPO6qs/Jx8rvoiJiZFUdRW0IWo7Pvbt26dPP/1UV199dY3XxJdbLmuTmZmpSy+9VL/4xS9qXH396quvVFBQoOuvv96jDy644AINHDhQ7733Xq31LliwQElJSbrkkksaHRuAlsUtjgAsbefOnQoODm5wglZcXKw//elPuvXWWxUbG1tjfVFRkdLS0vTqq6+qoKCgxrYN1aNHjxo/uq9OinJzc93J3/PPP6+HH35YW7du9fitWEJCgtd6Z8+erWeffVa33367li9frjFjxmjAgAG1xjFs2DANGjRIL7zwgu644w4tX75cd999d43fBklVtwL+/ve/10cffaSSkhKPdb
72wcmx9OvXT3a73eN3LZ9//rlSU1OVkZFRIwEpLi6Ww+HQjh07JKnOqRN8KbN9+3YZY3TPPffonnvu8VqmoKBAPXv
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы в датасете:\n",
|
|||
|
" stock index country year index price log_indexprice inflationrate \\\n",
|
|||
|
"229 SZCOMP China 2004.0 1467.57 3.17 0.04 \n",
|
|||
|
"230 SZCOMP China 2005.0 1144.54 3.06 0.02 \n",
|
|||
|
"231 SZCOMP China 2006.0 1687.14 3.23 0.02 \n",
|
|||
|
"232 SZCOMP China 2007.0 4329.44 3.64 0.05 \n",
|
|||
|
"233 SZCOMP China 2008.0 2912.90 3.46 0.06 \n",
|
|||
|
"234 SZCOMP China 2009.0 2737.01 3.44 -0.01 \n",
|
|||
|
"235 SZCOMP China 2010.0 2795.88 3.45 0.03 \n",
|
|||
|
"236 SZCOMP China 2011.0 2639.19 3.42 0.06 \n",
|
|||
|
"237 SZCOMP China 2012.0 2211.11 3.34 0.03 \n",
|
|||
|
"238 SZCOMP China 2013.0 2182.52 3.34 0.03 \n",
|
|||
|
"239 SZCOMP China 2014.0 2279.75 3.36 0.02 \n",
|
|||
|
"240 SZCOMP China 2015.0 3657.40 3.56 0.01 \n",
|
|||
|
"241 SZCOMP China 2016.0 2978.14 3.47 0.02 \n",
|
|||
|
"242 SZCOMP China 2017.0 3257.35 3.51 0.02 \n",
|
|||
|
"243 SZCOMP China 2018.0 2920.18 3.47 0.02 \n",
|
|||
|
"244 SZCOMP China 2019.0 2928.94 3.47 0.03 \n",
|
|||
|
"245 SZCOMP China 2020.0 3109.78 3.49 0.02 \n",
|
|||
|
"271 DAX 30 Germany 2005.0 5408.25 3.73 0.02 \n",
|
|||
|
"272 DAX 30 Germany 2006.0 6596.91 3.82 0.02 \n",
|
|||
|
"273 DAX 30 Germany 2007.0 8067.31 3.91 0.02 \n",
|
|||
|
"274 DAX 30 Germany 2008.0 4810.20 3.68 0.03 \n",
|
|||
|
"276 DAX 30 Germany 2010.0 6914.19 3.84 0.01 \n",
|
|||
|
"277 DAX 30 Germany 2011.0 5898.35 3.77 0.02 \n",
|
|||
|
"280 DAX 30 Germany 2014.0 9805.55 3.99 0.01 \n",
|
|||
|
"281 DAX 30 Germany 2015.0 10743.01 4.03 0.01 \n",
|
|||
|
"283 DAX 30 Germany 2017.0 12917.64 4.11 0.02 \n",
|
|||
|
"284 DAX 30 Germany 2018.0 10558.96 4.02 0.02 \n",
|
|||
|
"285 DAX 30 Germany 2019.0 13249.01 4.12 0.01 \n",
|
|||
|
"286 DAX 30 Germany 2020.0 13718.78 4.14 0.01 \n",
|
|||
|
"\n",
|
|||
|
" oil prices exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
|
|||
|
"229 43.15 8.28 0.10 1509.0 0.04 \n",
|
|||
|
"230 59.41 8.19 0.11 1753.0 0.04 \n",
|
|||
|
"231 61.96 7.97 0.13 2099.0 0.04 \n",
|
|||
|
"232 91.69 7.61 0.14 2694.0 0.04 \n",
|
|||
|
"233 41.12 6.95 0.10 3468.0 0.04 \n",
|
|||
|
"234 74.47 6.83 0.09 3832.0 0.04 \n",
|
|||
|
"235 89.15 6.77 0.11 4550.0 0.04 \n",
|
|||
|
"236 98.56 6.46 0.10 5618.0 0.04 \n",
|
|||
|
"237 87.86 6.31 0.08 6317.0 0.04 \n",
|
|||
|
"238 97.63 6.15 0.08 7051.0 0.05 \n",
|
|||
|
"239 59.29 6.16 0.07 7679.0 0.05 \n",
|
|||
|
"240 37.19 6.28 0.07 8067.0 0.05 \n",
|
|||
|
"241 51.97 6.64 0.07 8148.0 0.05 \n",
|
|||
|
"242 57.88 6.76 0.07 8879.0 0.04 \n",
|
|||
|
"243 49.52 6.61 0.07 9977.0 0.04 \n",
|
|||
|
"244 59.88 6.91 0.06 10217.0 0.05 \n",
|
|||
|
"245 47.02 6.90 0.02 10500.0 0.05 \n",
|
|||
|
"271 59.41 1.24 0.01 34520.0 0.12 \n",
|
|||
|
"272 61.96 1.26 0.04 36354.0 0.11 \n",
|
|||
|
"273 91.69 1.37 0.03 41640.0 0.09 \n",
|
|||
|
"274 41.12 1.47 0.01 45613.0 0.08 \n",
|
|||
|
"276 89.15 1.33 0.04 41572.0 0.08 \n",
|
|||
|
"277 98.56 1.39 0.04 46706.0 0.07 \n",
|
|||
|
"280 59.29 1.33 0.02 48024.0 0.07 \n",
|
|||
|
"281 37.19 1.11 0.01 41103.0 0.06 \n",
|
|||
|
"283 57.88 1.13 0.03 44553.0 0.06 \n",
|
|||
|
"284 49.52 1.18 0.01 47811.0 0.05 \n",
|
|||
|
"285 59.88 1.12 0.01 46468.0 0.05 \n",
|
|||
|
"286 47.02 1.14 -0.05 45724.0 0.06 \n",
|
|||
|
"\n",
|
|||
|
" manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"229 625.22 51.17 0.04 \n",
|
|||
|
"230 733.66 124.63 0.04 \n",
|
|||
|
"231 893.13 208.92 0.05 \n",
|
|||
|
"232 1149.72 308.04 0.05 \n",
|
|||
|
"233 1475.66 348.83 0.04 \n",
|
|||
|
"234 1611.95 220.13 0.03 \n",
|
|||
|
"235 1924.32 222.40 0.03 \n",
|
|||
|
"236 2421.37 180.89 0.03 \n",
|
|||
|
"237 2690.09 231.87 0.02 \n",
|
|||
|
"238 2935.34 234.87 0.02 \n",
|
|||
|
"239 3184.24 221.55 0.03 \n",
|
|||
|
"240 3202.50 358.84 0.02 \n",
|
|||
|
"241 3153.12 255.48 0.02 \n",
|
|||
|
"242 3460.33 215.70 0.02 \n",
|
|||
|
"243 3868.46 106.71 0.03 \n",
|
|||
|
"244 3823.41 164.99 0.02 \n",
|
|||
|
"245 3853.81 366.14 0.01 \n",
|
|||
|
"271 571.36 148.05 0.04 \n",
|
|||
|
"272 618.70 162.20 0.05 \n",
|
|||
|
"273 714.38 231.95 0.05 \n",
|
|||
|
"274 750.91 227.47 0.04 \n",
|
|||
|
"276 669.57 178.90 0.03 \n",
|
|||
|
"277 758.60 184.02 0.03 \n",
|
|||
|
"280 786.55 257.40 0.03 \n",
|
|||
|
"281 683.20 255.02 0.02 \n",
|
|||
|
"283 752.02 257.66 0.02 \n",
|
|||
|
"284 795.96 243.72 0.03 \n",
|
|||
|
"285 737.94 223.82 0.02 \n",
|
|||
|
"286 678.29 221.53 0.01 \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2IAAAIjCAYAAABh3KjvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB0Y0lEQVR4nO3deXxU1f3/8fdMQhICWYGQYDEJoNWIgKBAVMQqCi4o1q1UFJcixaVuVcFWI1rFre4WFH8Ciitt9SvWpiIqKgViWVSI+EUNiJIQIWRhCYHM+f2R70wzyUwyM5m5s+T1fDzygNw5997PWe5kPrk359iMMUYAAAAAAMvYwx0AAAAAAHQ2JGIAAAAAYDESMQAAAACwGIkYAAAAAFiMRAwAAAAALEYiBgAAAAAWIxEDAAAAAIuRiAEAAACAxUjEAAAAAMBiJGIAAAAAYDESMQBR669//atsNpvHr4EDB4Y7PAAAAK/iwx0AAHTUHXfcoSOPPNL1/X333RfGaAAAANpHIgYg6p122mk6+eSTXd8///zz2rFjR/gCAgAAaAePJgKIWg0NDZIku739t7L58+fLZrNp8+bNrm0Oh0ODBg2SzWbT/PnzXdu/+OILXX755erXr5+SkpKUnZ2tK6+8Ujt37nQ75t133+3xscj4+P/+juvkk0/WwIEDtXr1ah1//PHq2rWr8vPzNWfOnFZ1ueuuuzRs2DClpaWpW7duGjVqlD788EO3cps3b3ad56233nJ7rb6+XhkZGbLZbHrkkUdaxZmVlaUDBw647fPqq6+6jtc8ef2f//kfnXXWWerTp48SExPVv39/3XvvvWpsbGy3rZ3n27hxoy666CKlpqaqR48euuGGG1RfX+9Wdt68eTrllFOUlZWlxMREFRQUaPbs2R6P+89//lOjR49WSkqKUlNTddxxx+mVV15xK7Nq1SqdeeaZysjIULdu3TRo0CA98cQTbmU2btyoCy64QJmZmUpKStKxxx6rt99+262MP+Pl8ssvd+v/jIwMnXzyyfrkk0/cjulrmzrHTEuPPPJIq5jy8vJ0+eWXu5VbtGiRbDab8vLy3LZXVlbqqquu0qGHHqq4uDhXvN27d291rpby8vK8PgZss9ncyh48eFD33nuv+vfvr8TEROXl5emOO+7Q/v37Wx3Xlz5tPubbOq/D4dDjjz+uo446SklJSerdu7emTp2qXbt2+VS/lu340UcfyWaz6aOPPnJtO/nkk91+6SNJn332mcd4JGnhwoUaPny4kpOTlZGRoZNOOknvvfee65xttamz/5z1bz7m6urqNGzYMOXn56u8vNxrOUm69tprZbPZWtUPQPhxRwxA1HImYomJiQHt/9JLL+nLL79stX3JkiX67rvvdMUVVyg7O1sbNmzQc889pw0bNmjlypWtPnDNnj3b7cNsy8Rw165dOvPMM3XRRRdp4sSJeuONNzRt2jQlJCToyiuvlCTV1tbq+eef18SJEzVlyhTV1dXp//2//6exY8eqpKREQ4YMcTtmUlKS5s2bpwkTJri2/f3vf2+V6DRXV1end955R+edd55r27x585SUlNRqv/nz56t79+66+eab1b17d33wwQe66667VFtbq4cfftjrOZq76KKLlJeXp1mzZmnlypV68skntWvXLr344otubXfUUUfpnHPOUXx8vBYvXqxrrrlGDodD1157rVs8V155pY466ijNmDFD6enpWrt2rYqLi/XrX/9aUlO/nX322crJydENN9yg7OxsffXVV3rnnXd0ww03SJI2bNigE044QYcccoimT5+ubt266Y033tCECRP0t7/9za1tWvI2XiSpZ8+eeuyxxyRJP/zwg5544gmdeeaZ2rp1q9LT04PWpu05ePCg/vCHP3h8bfLkyXr//fd1/fXXa/DgwYqLi9Nzzz2nNWvW+HTsIUOG6JZbbnHb9uKLL2rJkiVu237zm99owYIFuuCCC3TLLbdo1apVmjVrlr
766iu9+eabrnK+9GlzV199tUaNGiWpaaw3P5YkTZ06VfPnz9cVV1yh3/3udyorK9PTTz+ttWvXavny5erSpYtP9fTX7bff7nH7zJkzdffdd+v444/XPffco4SEBK1atUoffPCBTj/9dD3++OPavXu3JOmrr77S/fff7/aYtbcE+cCBAzr//PP1/fffa/ny5crJyfEa2zfffKO5c+d2sIYAQsYAQJR6/PHHjSTz+eefu20fPXq0Oeqoo9y2zZs3z0gyZWVlxhhj6uvrzaGHHmrOOOMMI8nMmzfPVXbv3r2tzvXqq68aSebjjz92bSsqKjKSzE8//eQ1xtGjRxtJ5s9//rNr2/79+82QIUNMVlaWaWhoMMYYc/DgQbN//363fXft2mV69+5trrzySte2srIyI8lMnDjRxMfHm4qKCtdrp556qvn1r39tJJmHH364VZwTJ040Z599tmv7li1bjN1uNxMnTmxVD09tMHXqVJOcnGzq6+u91rf5+c455xy37ddcc02r/vJ0nrFjx5p+/fq5vq+urjYpKSlmxIgRZt++fW5lHQ6HMaap/fLz801ubq7ZtWuXxzLGNLXR0Ucf7VYHh8Nhjj/+eHPYYYe5tvkzXiZPnmxyc3Pdzvncc88ZSaakpKTNunpqU0/j1xhjHn74YbeYjDEmNzfXTJ482fX9X/7yF5OYmGh+8YtfuMW0b98+Y7fbzdSpU92OOXnyZNOtW7dW52opNzfXnHXWWa22X3vttab5R4l169YZSeY3v/mNW7nf//73RpL54IMPjDG+9anTpk2bjCSzYMEC1zbnGHP65JNPjCTz8ssvu+1bXFzscXtL+fn55rLLLnPb9uGHHxpJ5sMPP3RtGz16tBk9erTr+3fffddIMuPGjXOLZ9OmTcZut5vzzjvPNDY2tlk/b+dycl7z8+bNMw6Hw1xyySUmOTnZrFq1yms5p4suusgMHDjQ9O3b122cAIgMPJoIIGo5HxXs1auX3/s+88wz2rlzp4qKilq91rVrV9f/6+vrtWPHDo0cOVKSfL570Fx8fLymTp3q+j4hIUFTp05VZWWlVq9eLUmKi4tTQkKCpKZHrKqqqnTw4EEde+yxHs85dOhQHXXUUXrppZckSVu2bNGHH37Y5uNHV155pYqLi1VRUSFJWrBggQoLC3X44Ye3Ktu8Derq6rRjxw6NGjVKe/fu1caNG32qd/M7WpJ0/fXXS5Leffddj+epqanRjh07NHr0aH333XeqqamR1HSnq66uTtOnT1dSUpLbMZ13J9euXauysjLdeOONrjtQLctUVVXpgw8+0EUXXeSq044dO7Rz506NHTtWmzZt0o8//uixLm2NF6mpz5zHW7dunV588UXl5OS4TSLjT5s2Nja6juf82rt3r8dzO+3du1f33HOPrrvuOh166KFur+3Zs0cOh0M9evRo8xgd5ezbm2++2W27807aP/7xD0m+9amTL3e+Fy1apLS0NJ122mlubTZs2DB179691SO+LWVlZemHH37woYb/ZYzRjBkzdP7552vEiBFur7311ltyOBy66667Wt0h9/QIo69uvfVWvfzyy3rjjTc0fPjwNsuuXr1aixYt0qxZs3x6fBuA9bgyAUStLVu2KD4+3u9ErKamRvfff79uvvlm9e7du9XrVVVVuuGGG9S7d2917dpVvXr1Un5+vmtff/Xp00fdunVz2+ZMfpr/vc+CBQs0aNAgJSUlqUePHurVq5f+8Y9/eD3nFVdcoXnz5klqeszr+OOP12GHHeY1jiFDhmjgwIF68cUXZYxxPcblyYYNG3TeeecpLS1Nqamp6tWrlyZNmiTJ9zZoGUv//v1lt9vd6rx8+XKNGTNG3bp1U3p6unr16qU77rjD7TzffvutJLW5JIEvZb755hsZY3TnnXeqV69ebl/OBKuysrLVfu2NF0naunWr61jHHHOMvv32W/3tb39ze7zMnzbduHGj1xi9efTRR1VfX+9qv+Z69Oihww47TM
8//7zee+89VVZWaseOHR7/bqsjtmzZIrvdrgEDBrhtz87OVnp6urZs2SLJt/5yqq6uluT9UT1J2rRpk2pqapSVldW
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def _scatter_output_vs_gdp(frame, title):\n",
|
|||
|
"    \"\"\"Draw a 10x6 scatter plot of 'manufacturingoutput' against 'gdppercent'.\"\"\"\n",
|
|||
|
"    plt.figure(figsize=(10, 6))\n",
|
|||
|
"    plt.scatter(frame['manufacturingoutput'], frame['gdppercent'])\n",
|
|||
|
"    plt.xlabel('Объем производства')\n",
|
|||
|
"    plt.ylabel('ВВП')\n",
|
|||
|
"    plt.title(title)\n",
|
|||
|
"    plt.show()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"_scatter_output_vs_gdp(cleaned_df, 'Диаграмма рассеивания перед чисткой')\n",
|
|||
|
"\n",
|
|||
|
"# 1.5 * IQR rule: values further than that from the quartiles are outliers\n",
|
|||
|
"output_col = cleaned_df[\"manufacturingoutput\"]\n",
|
|||
|
"Q1, Q3 = output_col.quantile(0.25), output_col.quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"lower_bound, upper_bound = Q1 - threshold, Q3 + threshold\n",
|
|||
|
"outliers = output_col.lt(lower_bound) | output_col.gt(upper_bound)\n",
|
|||
|
"\n",
|
|||
|
"# Show the rows flagged as outliers\n",
|
|||
|
"print(\"Выбросы в датасете:\")\n",
|
|||
|
"print(cleaned_df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Replace the outliers with the column median (taken before the replacement)\n",
|
|||
|
"median_score = output_col.median()\n",
|
|||
|
"cleaned_df.loc[outliers, \"manufacturingoutput\"] = median_score\n",
|
|||
|
"\n",
|
|||
|
"# Plot the same relationship after the replacement\n",
|
|||
|
"_scatter_output_vs_gdp(cleaned_df, 'Диаграмма рассеивания после чистки')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 131\n",
|
|||
|
"Размер контрольной выборки: 44\n",
|
|||
|
"Размер тестовой выборки: 44\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# 20% of all rows go to the test split; 25% of the remainder then goes to\n",
|
|||
|
"# validation (0.25 * 0.8 = 20% of all rows).\n",
|
|||
|
"train_val_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)\n",
|
|||
|
"train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"split_sizes = {\n",
|
|||
|
"    \"Размер обучающей выборки:\": len(train_df),\n",
|
|||
|
"    \"Размер контрольной выборки:\": len(val_df),\n",
|
|||
|
"    \"Размер тестовой выборки:\": len(test_df),\n",
|
|||
|
"}\n",
|
|||
|
"for caption, size in split_sizes.items():\n",
|
|||
|
"    print(caption, size)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверяем распределение ВВП (gdppercent) в выборках — видим недостаток баланса"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение ВВП в обучающей выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.02 30\n",
|
|||
|
" 0.04 25\n",
|
|||
|
" 0.03 21\n",
|
|||
|
" 0.01 13\n",
|
|||
|
" 0.07 8\n",
|
|||
|
" 0.08 8\n",
|
|||
|
" 0.05 7\n",
|
|||
|
"-0.01 5\n",
|
|||
|
" 0.11 2\n",
|
|||
|
" 0.09 2\n",
|
|||
|
"-0.02 2\n",
|
|||
|
" 0.10 2\n",
|
|||
|
"-0.03 1\n",
|
|||
|
" 0.14 1\n",
|
|||
|
"-0.10 1\n",
|
|||
|
" 0.06 1\n",
|
|||
|
"-0.05 1\n",
|
|||
|
"-0.04 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в контрольной выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.02 9\n",
|
|||
|
" 0.03 7\n",
|
|||
|
" 0.01 6\n",
|
|||
|
" 0.07 4\n",
|
|||
|
" 0.04 4\n",
|
|||
|
" 0.05 4\n",
|
|||
|
" 0.08 3\n",
|
|||
|
" 0.06 3\n",
|
|||
|
"-0.01 2\n",
|
|||
|
" 0.10 1\n",
|
|||
|
"-0.08 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в тестовой выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.02 12\n",
|
|||
|
" 0.03 8\n",
|
|||
|
" 0.01 7\n",
|
|||
|
" 0.05 5\n",
|
|||
|
" 0.04 3\n",
|
|||
|
" 0.08 3\n",
|
|||
|
"-0.01 2\n",
|
|||
|
"-0.05 1\n",
|
|||
|
" 0.06 1\n",
|
|||
|
" 0.13 1\n",
|
|||
|
" 0.07 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name, column='gdppercent'):\n",
|
|||
|
"    \"\"\"Print how many rows of ``df`` carry each distinct value of ``column``.\n",
|
|||
|
"\n",
|
|||
|
"    Args:\n",
|
|||
|
"        df: DataFrame holding one data split.\n",
|
|||
|
"        name: Split label inserted into the printed header.\n",
|
|||
|
"        column: Column whose value counts are reported. Defaults to the raw\n",
|
|||
|
"            'gdppercent' target, so existing two-argument calls are unchanged.\n",
|
|||
|
"    \"\"\"\n",
|
|||
|
"    counts = df[column].value_counts()\n",
|
|||
|
"    print(f\"Распределение ВВП в {name}:\")\n",
|
|||
|
"    print(counts)\n",
|
|||
|
"    print()\n",
|
|||
|
"\n",
|
|||
|
"# Report the target distribution of every split\n",
|
|||
|
"for split_df, split_name in (\n",
|
|||
|
"    (train_df, \"обучающей выборке\"),\n",
|
|||
|
"    (val_df, \"контрольной выборке\"),\n",
|
|||
|
"    (test_df, \"тестовой выборке\"),\n",
|
|||
|
"):\n",
|
|||
|
"    check_balance(split_df, split_name)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Для балансировки также используем oversampling и undersampling по признаку gdppercent, разбитому на 3 интервала"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение ВВП в обучающей выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.08 37\n",
|
|||
|
" 0.07 33\n",
|
|||
|
"-0.04 31\n",
|
|||
|
" 0.02 30\n",
|
|||
|
"-0.10 27\n",
|
|||
|
" 0.04 25\n",
|
|||
|
"-0.05 25\n",
|
|||
|
"-0.03 21\n",
|
|||
|
" 0.03 21\n",
|
|||
|
" 0.01 13\n",
|
|||
|
" 0.11 11\n",
|
|||
|
" 0.09 11\n",
|
|||
|
" 0.10 7\n",
|
|||
|
" 0.05 7\n",
|
|||
|
"-0.01 5\n",
|
|||
|
" 0.14 5\n",
|
|||
|
"-0.02 2\n",
|
|||
|
" 0.06 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в контрольной выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
"-0.08 24\n",
|
|||
|
" 0.02 9\n",
|
|||
|
" 0.07 7\n",
|
|||
|
" 0.03 7\n",
|
|||
|
" 0.01 6\n",
|
|||
|
" 0.05 5\n",
|
|||
|
" 0.04 4\n",
|
|||
|
" 0.06 4\n",
|
|||
|
" 0.08 3\n",
|
|||
|
"-0.01 2\n",
|
|||
|
" 0.10 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в тестовой выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.08 26\n",
|
|||
|
"-0.01 22\n",
|
|||
|
"-0.05 14\n",
|
|||
|
" 0.02 12\n",
|
|||
|
" 0.03 8\n",
|
|||
|
" 0.01 7\n",
|
|||
|
" 0.05 5\n",
|
|||
|
" 0.07 5\n",
|
|||
|
" 0.13 5\n",
|
|||
|
" 0.04 3\n",
|
|||
|
" 0.06 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение ВВП в обучающей выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
" 0.01 2\n",
|
|||
|
" 0.08 2\n",
|
|||
|
"-0.10 1\n",
|
|||
|
"-0.03 1\n",
|
|||
|
"-0.05 1\n",
|
|||
|
"-0.04 1\n",
|
|||
|
" 0.03 1\n",
|
|||
|
" 0.02 1\n",
|
|||
|
" 0.14 1\n",
|
|||
|
" 0.07 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в контрольной выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
"-0.08 1\n",
|
|||
|
" 0.02 1\n",
|
|||
|
" 0.08 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ВВП в тестовой выборке:\n",
|
|||
|
"gdppercent\n",
|
|||
|
"-0.01 2\n",
|
|||
|
" 0.08 2\n",
|
|||
|
"-0.05 1\n",
|
|||
|
" 0.02 1\n",
|
|||
|
" 0.04 1\n",
|
|||
|
" 0.05 1\n",
|
|||
|
" 0.07 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"def binning(target, bins):\n",
|
|||
|
"    \"\"\"Map every value of ``target`` to the integer index of its bin.\n",
|
|||
|
"\n",
|
|||
|
"    ``bins`` is forwarded to ``pd.cut`` unchanged; ``labels=False`` makes the\n",
|
|||
|
"    result hold bin indices instead of interval labels.\n",
|
|||
|
"    \"\"\"\n",
|
|||
|
"    bin_codes = pd.cut(target, bins=bins, labels=False)\n",
|
|||
|
"    return bin_codes\n",
|
|||
|
"\n",
|
|||
|
"# Take the bin edges from the training split only and reuse them for the\n",
|
|||
|
"# other splits, so that a class label covers the same value range everywhere\n",
|
|||
|
"# (cutting each split with its own bins=3 gives every split different edges).\n",
|
|||
|
"_, gdp_bin_edges = pd.cut(train_df['gdppercent'], bins=3, retbins=True)\n",
|
|||
|
"# Open the outer edges so values outside the training range land in the\n",
|
|||
|
"# first/last bin instead of becoming NaN.\n",
|
|||
|
"gdp_bin_edges[0], gdp_bin_edges[-1] = float('-inf'), float('inf')\n",
|
|||
|
"\n",
|
|||
|
"train_df['gdppercent_binned'] = binning(train_df['gdppercent'], bins=gdp_bin_edges)\n",
|
|||
|
"val_df['gdppercent_binned'] = binning(val_df['gdppercent'], bins=gdp_bin_edges)\n",
|
|||
|
"test_df['gdppercent_binned'] = binning(test_df['gdppercent'], bins=gdp_bin_edges)\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df, target_column):\n",
|
|||
|
"    \"\"\"Return ``df`` resampled with ``RandomOverSampler`` on ``target_column``.\n",
|
|||
|
"\n",
|
|||
|
"    Every column except ``target_column`` is passed to the sampler as\n",
|
|||
|
"    features; the resampled features and target are joined back into one\n",
|
|||
|
"    DataFrame. The sampler is seeded with ``random_state=42``.\n",
|
|||
|
"    \"\"\"\n",
|
|||
|
"    features = df.drop(columns=target_column)\n",
|
|||
|
"    labels = df[target_column]\n",
|
|||
|
"    sampler = RandomOverSampler(random_state=42)\n",
|
|||
|
"    balanced = sampler.fit_resample(features, labels)  # type: ignore\n",
|
|||
|
"    # fit_resample yields (features, target); glue them back side by side\n",
|
|||
|
"    return pd.concat(list(balanced), axis=1)\n",
|
|||
|
"\n",
|
|||
|
"def undersample(df, target_column):\n",
|
|||
|
"    \"\"\"Return ``df`` resampled with ``RandomUnderSampler`` on ``target_column``.\n",
|
|||
|
"\n",
|
|||
|
"    Every column except ``target_column`` is passed to the sampler as\n",
|
|||
|
"    features; the resampled features and target are joined back into one\n",
|
|||
|
"    DataFrame. The sampler is seeded with ``random_state=42``.\n",
|
|||
|
"    \"\"\"\n",
|
|||
|
"    features = df.drop(columns=target_column)\n",
|
|||
|
"    labels = df[target_column]\n",
|
|||
|
"    sampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
"    reduced = sampler.fit_resample(features, labels)  # type: ignore\n",
|
|||
|
"    # fit_resample yields (features, target); glue them back side by side\n",
|
|||
|
"    return pd.concat(list(reduced), axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# NOTE(review): the validation and test splits are resampled here only to\n",
|
|||
|
"# keep the frames this cell has always produced; metrics for a model should\n",
|
|||
|
"# be computed on the untouched val_df / test_df.\n",
|
|||
|
"train_df_oversampled = oversample(train_df, 'gdppercent_binned')\n",
|
|||
|
"val_df_oversampled = oversample(val_df, 'gdppercent_binned')\n",
|
|||
|
"test_df_oversampled = oversample(test_df, 'gdppercent_binned')\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df, 'gdppercent_binned')\n",
|
|||
|
"val_df_undersampled = undersample(val_df, 'gdppercent_binned')\n",
|
|||
|
"test_df_undersampled = undersample(test_df, 'gdppercent_binned')\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"def report_resampled(title, splits):\n",
|
|||
|
"    \"\"\"Print the raw target and the binned class distribution of each split.\n",
|
|||
|
"\n",
|
|||
|
"    Args:\n",
|
|||
|
"        title: Heading printed before the per-split reports.\n",
|
|||
|
"        splits: Iterable of ``(DataFrame, name)`` pairs.\n",
|
|||
|
"    \"\"\"\n",
|
|||
|
"    print(title)\n",
|
|||
|
"    for split_df, split_name in splits:\n",
|
|||
|
"        check_balance(split_df, split_name)\n",
|
|||
|
"        # The samplers balance 'gdppercent_binned', so its counts are what\n",
|
|||
|
"        # actually shows whether the resampling worked.\n",
|
|||
|
"        print(f\"Распределение классов gdppercent_binned в {split_name}:\")\n",
|
|||
|
"        print(split_df['gdppercent_binned'].value_counts().sort_index())\n",
|
|||
|
"        print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"report_resampled(\"Оверсэмплинг:\", [\n",
|
|||
|
"    (train_df_oversampled, \"обучающей выборке\"),\n",
|
|||
|
"    (val_df_oversampled, \"контрольной выборке\"),\n",
|
|||
|
"    (test_df_oversampled, \"тестовой выборке\"),\n",
|
|||
|
"])\n",
|
|||
|
"report_resampled(\"Андерсэмплинг:\", [\n",
|
|||
|
"    (train_df_undersampled, \"обучающей выборке\"),\n",
|
|||
|
"    (val_df_undersampled, \"контрольной выборке\"),\n",
|
|||
|
"    (test_df_undersampled, \"тестовой выборке\"),\n",
|
|||
|
"])"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|