AIM-PIbd-32-Isaeva-A-I/lab_3/Lab3.ipynb

552 lines
164 KiB
Plaintext
Raw Permalink Normal View History

2024-12-20 23:47:13 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лабораторная 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вариант 7. Экономика стран"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Бизнес-цели:\n",
"1) прогнозирование уровня инфляции на основе данных за года\n",
"2) определение факторов, значительно влияющих на показателль ВВП на душу населения"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Технические цели:\n",
"1) Разработать МО для прогнозирования уровня инфляции на основе исторических данных\n",
"2) Проанализировать взаимосвязь между экономическими показателями и ВВП"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
" 'tradebalance', 'USTreasury'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"df = pd.read_csv(\".//csv//EconomicData.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Подготовка данных:"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stock index 0\n",
"country 0\n",
"year 0\n",
"index price 52\n",
"log_indexprice 0\n",
"inflationrate 43\n",
"oil prices 0\n",
"exchange_rate 2\n",
"gdppercent 19\n",
"percapitaincome 1\n",
"unemploymentrate 21\n",
"manufacturingoutput 91\n",
"tradebalance 4\n",
"USTreasury 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(df.isnull().sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заполним пустые значения медианами:"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"for column in df.columns:\n",
" if (column != \"stock index\" and column != \"country\"):\n",
" df[column].fillna(df[column].median())"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+0AAAK9CAYAAABRvo1QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADX6klEQVR4nOzdd5hdVb3/8c8+vUzPJFPSQyopJCQkhK4GAgKK8JNyvRdELnhFFIggoBJAkQgCItIEC6hwQa8KFohIJNSQQEIINaRMemYyJTNn5vSz9/79EXNgmJLps2fyfj1PHjJr7bPO90wJ8zlr7bUM27ZtAQAAAAAAx3H1dwEAAAAAAKB1hHYAAAAAAByK0A4AAAAAgEMR2gEAAAAAcChCOwAAAAAADkVoBwAAAADAoQjtAAAAAAA4FKEdAAAAAACHIrQDAAAAAOBQhHYAAAAAAByK0A4AOCgtWbJEhmHo6aefbtFXVVUln8+nz3/+8/1QGSTp9ttvl2EY2rJlS3+XAgBAvyK0AwAOShdeeKE8Ho9+8YtftOj71a9+pXQ6rUsuuaQfKgMAAPgIoR0AcFAqLS3Vaaedpr/+9a+qrKzMttu2rV/84hcaOXKkTjnllH6sEAAAgNAOADiIXXzxxcpkMnr44Yezbc8995w2b96sr3zlK3K59v1v8uGHH5ZhGHrxxRf11a9+VUOGDFFeXp7OP/987d27t8W49913n6ZOnSq/36/y8nJ9/etfV319fbNrTjjhBBmGkf1TXFysU089Ve+8806L8X73u99p9uzZCgaDKioq0rnnnqvt27e3GG/atGktHtvaMvMxY8botNNO68Rn6iNjxozRl7/85WZtf/jDH2QYhsaMGZNt27JliwzD0O23395ijGnTpumEE05o1rZ69WrNmjVLoVBI5513nhKJhCTpnXfe0YwZMxQOh3X++ecrFou1GO/GG29s9rnc/+eTdT7//PM69thjVVhY2Oy6yy67rMVYNTU1zR77xhtvyDCMZt8r+1/jx9sk6etf/3qL59//PfTxr4NlWZoxY0arYwAAsJ+nvwsAAKC/nHzyyRo5cqR++ctf6pprrpFhGHrooYfkcrl00UUXtbj+sssuU0FBgW688UatX79e999/v7Zu3arly5fLMAxJ+0LfTTfdpAULFuhrX/ta9rrXX39dr7zyirxeb3a8yZMn67vf/a5s29amTZt055136rOf/ay2bduWveaHP/yhrr/+ep199tn67//+b1VXV+tnP/uZjjvuOL355psqKCjo9c9TezKZjL773e92a4xIJKKTTz5ZwWBQN998s958803dfffdkvZ9zv/nf/5HlZWVuvvuuxUMBvXzn/+81XF++9vfZv9+5ZVXNuurqKjQqaeeqrKyMi1evFhDhw6VJP3Xf/1Xt2r/pI0bN+qhhx7q0LW//e1v9fbbb/fo8wMABh9COwDgoOVyufSVr3xFN910k5YvX65p06bpqaeeyob5T/L5fFq2bFk2eI8ePVrf/va39de//lWf+9znVF1drSVLluikk07SM888k52pnzx5si677DL97ne/04UXXpgdr6SkRP/5n/+Z/TidTuuWW25RdXW1hg4dqq1bt+qGG27QzTffrO985zvZ684880zNmjVL9913X7P2/vDQQw9p27Zt+tSnPqXNmzd3aYxf/vKXqq2t1fvvv69JkyZJkr74xS/q//7v/3Tbbbfp7LPPliSFQiHdeeed+v73v6+SkpLs4zOZjAzDaPa5/N73vtfsOf75z38qHo/r0Ucf1ZFHHplt7+nQ/t3vfleTJk1SQ0NDu9clk0ktXrxYp5xyip555pkerQEAMLiwPB4AcFC76KKL5HK59NBDD+nXv/61UqlUmxvQXXLJJc1myr/2ta/J4/Fkd6B/7rnnlEqldMUVV2QDu7RvGX5eXp7+/ve/NxsvnU6rpqZG1dXVWrFihf785z9rxowZKi4uliT96U9/kmVZOvvss1VTU5P9U1paqgkTJuj5559vNp5pms2uq6mpaXU5+cefu7a2VplMpvOfOEmxWEzf//73ddlll2nUqFFtXvPJmkzTbHbNsmXLdOihh2YDuyTNmzdPkjR37txs25lnnqlkMqmXX3652eNTqZT8fn+7tTY2NkqShgwZ0vEX2EmrV6/WH/7wBy1ZsqTZ17819957r2pra3XDDTf0Wj0AgMGBmXYAwEFt5MiROvnkk/WnP/1Jr776qsrLy3Xqqae2eu2ECROafZyTk6OysrLsfcpbt26VpGbhU9o3Qz9u3Lhs/36vvvpqdpn2/vGffPLJ7FL7DRs2yLbtFs+738ffQJCkDz74oNl47Xn22Wez17rdbs2YMUM/+tGPdNJJJ3Xo8ZJ05513KpFI6Dvf+Y4WLVrU6jU33HBDq8H04zPl27dv1/Dhww/4fPuv+eT9/PX19crJyWn3sfPnz5ckXX311VqyZEmHP0+dce211+rYY4/Vaaed1uw++U9qaGjQLbfcokWLFjX7PAAA0BpCOwDgoHfxxRfr6aef1tatW/Xd735XHk/f/O9xxowZuuOOOyRJ1dXVuvvuu3XCCSdozZo1Ki0tlWVZMgxDzzzzjNxud4vHfzKojhkzpsX91H/4wx/04IMPtnjsvHnzdPPNN0uSdu3apVtvvVVf+MIX9O677zbbUK4tNTU1+vGPf6zrrrtORUVFbV53ySWX6Itf/GKztosvvrjZx/s3neuoeDze7OPKykqVlpa2+5ijjjpKP/7xj3XTTTfp0EMP7dTzdcSzzz6r5557TitWrDjgtbfeeqtcLpeuvvpq1dbW9ngtAIDBhdAOADjonXbaaRo2bJiqq6v13//9321et2HDBn3qU5/KftzU1KTdu3frs5/9rKR997hL0vr16zVu3LjsdalUShUVFVqwYEGz8QoLC5u1nXDCCSovL9evf/1rXXfddTrkkENk27bGjh2riRMnHvB1hMPhFs+xdu3aVq8tLi5udu348eN19NFH68UXX+xQaL/55puVm5uryy+/vN3rJkyY0KKmcDjc7OOysjLt2rXrgM+5c+dOSVJ5eXmz9vfee0+HH374AR9/1VVXacOGDfrjH/+o3/zmN/L5fDrxxBMP+LgDsW1b1157rb7whS80u1++Nbt27dJPf/pTLVmyRLm5uYR2AMABcU87AOCgV1dXp4aGBp144ontBtYHH3xQ6XQ6+/H999+vTCaTPc99wYIF8vl8uvvuu2Xbdva6X/7yl2poaGhz2f1++2eQk8mkpH33cLvdbt10003NxpP2BcWeDHyWZUlSqzP6n7Rlyxbdf//9uvHGGxUMBrv93Mcdd5zeffddffjhh9m2lStXSpJWrVqVbXvyySclSccee2y27Y033tCmTZv06U9/+oDP89e//lUPPvigfvGLX+izn/1sizcTuurxxx/XunXrtGTJkgNee9NNN6mkpET/8z//0yPPDQAY/JhpBwActNatW6f/+7//0zPPPKNkMtnufcjSvhnzz3zmMzr77LO1fv163XfffTrmmGP0uc99TpI0dOhQXXfddbrpppt08skn63Of+1z2uiOOOKLZ7uaSVFVVpd/97neS9i03//nPfy6Px5M9Q/2QQw7RzTffrOuuu05btmzRGWecodzcXFVUVOjPf/6zLrnkEl111VVdeu3V1dVaunSpJGn37t269dZblZ+f32wlQVteeOEFTZkypdlO+N1x6aWX6mc/+5lOPPFEXXHFFVqzZo1efPFFSfvuE6+oqMge+XbuuedmVzF8//vf109/+lONGzdO559/frvPUVlZqYsuukj//d//rTPOOOOANf3rX/9SXl5e9uMNGzZIkt5++229/fbbmj59erbv2Wef1cUXX9xiL4PWPPvss3r00Ufl8/kOeC0AABKhHQBwEFuzZo1uueUWlZeXa/HixTr99NPbvf6ee+7Ro48+qsWLFyudTuu8887T3Xffnd04Ttp3TvvQoUN1zz336Morr1RRUZEuueQS3XLLLa1uHLf/yLGCggJNnTpVd955p+bMmZO95tprr9XEiRP1k5/8RDfddJOkfZvnnXTSSdk3C7pi1apV2RU
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(12, 8))\n",
"ax = sns.scatterplot(x='exchange_rate', y='oil prices', hue='inflationrate', data=df)\n",
"plt.title('Уровень инфляции')\n",
"plt.xlabel('Валютный курс')\n",
"plt.ylabel('Цены на нефть')\n",
"plt.legend(title='inflationrate')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/0AAAK9CAYAAAB2EAy4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC4X0lEQVR4nOzdd5xcZb0/8M8p09v2mt1k03tPloQSlEBCR7i0qwIR4aqXq5CLCogJCD8iihgRJIIiqHgFFEEEIhASakgghTTSN9nN9jq9nDnn/P5YMmQyM5vtZfbzfr3mlezzPOc5z5l6vucpR9B1XQcRERERERERpR1xoBtARERERERERH2DQT8RERERERFRmmLQT0RERERERJSmGPQTERERERERpSkG/URERERERERpikE/ERERERERUZpi0E9ERERERESUphj0ExEREREREaUpBv1EREREREREaYpBPxEREREREVGaYtBPRETUDatWrYIgCHjttdcS8urr62E0GnHppZcOQMsIAB566CEIgoAjR44MdFOIiIgGFIN+IiKibli2bBlkWcbvfve7hLynnnoKiqLg5ptvHoCWEREREX2BQT8REVE3FBQU4KKLLsIrr7yCurq6WLqu6/jd736HkpISnH/++QPYQiIiIiIG/URERN120003IRqN4umnn46lvfXWWzh8+DC+8Y1vQBTbf2affvppCIKAd999F//1X/+F7OxsOJ1OXHfddWhtbU2o9ze/+Q2mTJkCk8mEoqIi/Pd//zfa2triypx99tkQBCH2yMnJwYUXXohdu3Yl1PfnP/8Zc+bMgcViQVZWFq655hpUVVUl1Dd16tSEbZMNkx81ahQuuuiiLjxTXxg1ahRuuOGGuLQXXngBgiBg1KhRsbQjR45AEAQ89NBDCXVMnToVZ599dlzali1bMGvWLFitVlx77bUIhUIAgF27dmH69Omw2Wy47rrrEAgEEuq755574p7L44+T27l+/XqceeaZyMzMjCt3yy23JNTV1NQUt+0nn3wCQRDi3ivHj/HENAD47//+74T9H38Pnfg6aJqG6dOnJ62DiIjoOHmgG0BERDRULV26FCUlJfj973+PH/7whxAEAU8++SREUcSNN96YUP6WW25BRkYG7rnnHuzbtw+PP/44jh49ig0bNkAQBADtQeO9996LxYsX49vf/nas3Mcff4wPPvgABoMhVt/EiRPxox/9CLqu49ChQ3j44YdxwQUXoLKyMlbm//2//4cf//jHuOqqq/DNb34TjY2N+PWvf42zzjoL27ZtQ0ZGRp8/Tx2JRqP40Y9+1KM6PB4Pli5dCovFgvvvvx/btm3DI488AqD9Of/Wt76Furo6PPLII7BYLPjtb3+btJ4//elPsf/fdtttcXkVFRW48MILUVhYiBUrViA3NxcA8PWvf71HbT/ZwYMH8eSTT3aq7J/+9Cfs3LmzV/dPRETph0E/ERFRN4miiG984xu49957sWHDBkydOhUvv/xy7GLAyYxGI9atWxcL3EeOHIkf/OAHeOWVV3DJJZegsbERq1atwnnnnYfXX389NlJg4sSJuOWWW/DnP/8Zy5Yti9WXn5+Pr33ta7G/FUXBAw88gMbGRuTm5uLo0aNYuXIl7r//ftx1112xcpdffjlmzZqF3/zmN3HpA+HJJ59EZWUlvvSlL+Hw4cPdquP3v/89mpub8dlnn2HChAkAgCuvvBJ/+9vf8LOf/QxXXXUVAMBqteLhhx/GT37yE+Tn58e2j0ajEAQh7rm8++674/bx5ptvIhgM4tlnn8Vpp50WS+/toP9HP/oRJkyYALfb3WG5cDiMFStW4Pzzz8frr7/eq20gIqL0wuH9REREPXDjjTdCFEU8+eST+MMf/oBIJJJyAb+bb745rqf+29/+NmRZjt0B4K233kIkEsGtt94aC/iB9mkETqcTr776alx9iqKgqakJjY2N2LhxI/7xj39g+vTpyMnJAQC8+OKL0DQNV111FZqammKPgoICjBs3DuvXr4+rT1XVuHJNTU1Jh8OfuO/m5mZEo9GuP3EAAoEAfvKTn+CWW25BaWlpyjInt0lV1bgy69atw+TJk2MBPwCUl5cDAObPnx9Lu/zyyxEOh/H+++/HbR+JRGAymTpsq9frBQBkZ2d3/gC7aMuWLXjhhRewatWquNc/mcceewzNzc1YuXJln7WHiIjSA3v6iYiIeqCkpARLly7Fiy++iA8//BBFRUW48MILk5YdN25c3N92ux2FhYWxedpHjx4FgLjgFWgfITB69OhY/nEffvhhbJj58fpfeuml2FSBAwcOQNf1hP0ed+IFCADYu3dvXH0deeONN2JlJUnC9OnT8dOf/hTnnXdep7YHgIcffhihUAh33XUXli9fnrTMypUrkwa2J/bUV1VVobi4+JT7O17m5PUM2traYLfbO9x2wYIFAIDvf//7WLVqVaefp6644447cOaZZ+Kiiy6KWyfgZG63Gw888ACWL18e9zwQERElw6CfiIioh2666Sa89tprOHr0KH70ox9Blvvn53X69On4xS9+AQBobGzEI488grPPPhtbt25FQUEBNE2DIAh4/fXXIUlSwvYnB7qjRo1KmE/+wgsv4IknnkjYtry8HPfffz8AoKamBg8++CC+8pWvYPfu3XEL8qXS1NSEn//857jzzjuRlZWVstzNN9+MK6+8Mi7tpptuivv7+KJ9nRUMBuP+rqurQ0FBQYfbLFy4ED//+c9x7733YvLkyV3aX2e88cYbeOutt7Bx48ZTln3wwQchiiK+//3vo7m5udfbQkRE6YVBPxERUQ9ddNFFyMvLQ2NjI775zW+mLHfgwAF86Utfiv3t8/lQW1uLCy64AED7HH8A2LdvH0aPHh0rF4lEUFFRgcWLF8fVl5mZGZd29tlno6ioCH/4wx9w5513YsyYMdB1HWVlZRg/fvwpj8NmsyXsY/v27UnL5uTkxJUdO3YsTj/9dLz77rudCvrvv/9+OBwOfO973+uw3Lhx4xLaZLPZ4v4uLCxETU3NKfdZXV0NACgqKopL37NnD2bPnn3K7W+//XYcOHAAf//73/HHP/4RRqMR55577im3OxVd13HHHXfgK1/5Stx6AcnU1NTgV7/6FVatWgWHw8Ggn4iITolz+omIiHqopaUFbrcb5557bocB7xNPPAFFUWJ/P/7444hGozj//PMBAIsXL4bRaMQjjzwCXddj5X7/+9/D7XannDZw3PEe7HA4DKB9DrskSbj33nvj6gPaA83eDBg1TQOApCMKTnbkyBE8/vjjuOeee2CxWHq877POOgu7d+/G/v37Y2mbNm0CAGzevDmW9tJLLwEAzjzzzFjaJ598gkOHDuHLX/7yKffzyiuv4IknnsDvfvc7XHDBBQkXI7rrr3/9K3bs2IFVq1adsuy9996L/Px8fOtb3+qVfRMRUfpjTz8REVE37dixA3/729/w+uuvIxwOdzgPG2jvsT/nnHNw1VVXYd++ffjNb36DM844A5dccgkAIDc3F3feeSfuvfdeLF26FJdcckms3Lx58+JWlweA+vp6/PnPfwbQPlz+t7/9LWRZxkUXXQQAGDNmDO6//37ceeedOHLkCC677DI4HA5UVFTgH//4B26++Wbcfvvt3Tr2xsZGrF27FgBQW1uLBx98EC6XK24kQyrvvPMOJk2aFHcngp74zne+g1//+tc499xzceutt2Lr1q149913AbTPk6+oqIjdsu+aa66JjaL4yU9+gl/96lcYPXo0rrvuug73UVdXhxtvvBHf/OY3cdlll52yTW+//TacTmfs7wMHDgAAdu7ciZ07d2LatGmxvDfeeAM33XRTwloOybzxxht49tlnYTQaT1mWiIgIYNBPRETUbVu3bsUDDzyAoqIirFixAhdffHGH5R999FE8++yzWLFiBRRFwbXXXotHHnkktvAeANxzzz3Izc3Fo48+ittuuw1ZWVm4+eab8cADDyRdeO/4LeMyMjIwZcoUPPzww5g7d26szB133IHx48fjl7/8Je69914A7YsPnnfeebGLDd2xefPm2AiFnJwczJ49G88880z
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"Q1 = df['oil prices'].quantile(0.25)\n",
"Q3 = df['oil prices'].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"threshold = 1.5 * IQR\n",
"outliers = (df['oil prices'] < (Q1 - threshold)) | (df['oil prices'] > (Q3 + threshold))\n",
"\n",
"median_rating = df['oil prices'].median()\n",
"df.loc[outliers, 'oil prices'] = median_rating\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"ax = sns.scatterplot(x='exchange_rate', y='gdppercent', hue='inflationrate', data=df)\n",
"plt.title('Уровень инфляции')\n",
"plt.xlabel('Валютный курс')\n",
"plt.ylabel('Цены на нефть')\n",
"plt.legend(title='inflationrate')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение данных на выборки и оценка сбалансированности выборки"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 221\n",
"Размер контрольной выборки: 74\n",
"Размер тестовой выборки: 74\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# обучающая и тестовая\n",
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"# обучающая на обучающую и контрольную\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1) Кодирование категориальных признаков"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of stock index year index price log_indexprice inflationrate \\\n",
"0 NASDAQ 1980.0 168.61 2.23 0.14 \n",
"1 NASDAQ 1981.0 203.15 2.31 0.10 \n",
"2 NASDAQ 1982.0 188.98 2.28 0.06 \n",
"3 NASDAQ 1983.0 285.43 2.46 0.03 \n",
"4 NASDAQ 1984.0 248.89 2.40 0.04 \n",
".. ... ... ... ... ... \n",
"364 IEX 35 2016.0 9352.10 3.97 NaN \n",
"365 IEX 35 2017.0 10043.90 4.00 0.02 \n",
"366 IEX 35 2018.0 8539.90 3.93 0.02 \n",
"367 IEX 35 2019.0 9549.20 3.98 0.01 \n",
"368 IEX 35 2020.0 8073.70 3.91 NaN \n",
"\n",
" oil prices exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
"0 21.59 1.00 0.09 12575.0 0.07 \n",
"1 31.77 1.00 0.12 13976.0 0.08 \n",
"2 28.52 1.00 0.04 14434.0 0.10 \n",
"3 26.19 1.00 0.09 15544.0 0.10 \n",
"4 25.88 1.00 0.11 17121.0 0.08 \n",
".. ... ... ... ... ... \n",
"364 51.97 1.11 0.03 26523.0 0.20 \n",
"365 57.88 1.13 0.03 28170.0 0.17 \n",
"366 49.52 1.18 0.02 30389.0 0.15 \n",
"367 59.88 1.12 0.02 29565.0 0.14 \n",
"368 47.02 1.14 -0.11 27057.0 0.16 \n",
"\n",
" ... USTreasury country_China country_France country_Germany \\\n",
"0 ... 0.11 False False False \n",
"1 ... 0.14 False False False \n",
"2 ... 0.13 False False False \n",
"3 ... 0.11 False False False \n",
"4 ... 0.12 False False False \n",
".. ... ... ... ... ... \n",
"364 ... 0.02 False False False \n",
"365 ... 0.02 False False False \n",
"366 ... 0.03 False False False \n",
"367 ... 0.02 False False False \n",
"368 ... 0.01 False False False \n",
"\n",
" country_Hong Kong country_India country_Japan country_Spain \\\n",
"0 False False False False \n",
"1 False False False False \n",
"2 False False False False \n",
"3 False False False False \n",
"4 False False False False \n",
".. ... ... ... ... \n",
"364 False False False True \n",
"365 False False False True \n",
"366 False False False True \n",
"367 False False False True \n",
"368 False False False True \n",
"\n",
" country_United Kingdom country_United States of America \n",
"0 False True \n",
"1 False True \n",
"2 False True \n",
"3 False True \n",
"4 False True \n",
".. ... ... \n",
"364 False False \n",
"365 False False \n",
"366 False False \n",
"367 False False \n",
"368 False False \n",
"\n",
"[369 rows x 22 columns]>\n"
]
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"df = pd.get_dummies(df, columns=['country'])\n",
"print(df.head)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Дискретизация числовых признаков"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"min = 27.0\n",
"max = 65280.0\n",
"10880.0\n"
]
}
],
"source": [
"print(f\"min = {df['percapitaincome'].min()}\")\n",
"print(f\"max = {df['percapitaincome'].max()}\")\n",
"print(df['percapitaincome'].max()/6)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of 0 низкий\n",
"1 низкий\n",
"2 низкий\n",
"3 низкий\n",
"4 низкий\n",
" ... \n",
"364 средний\n",
"365 средний\n",
"366 средний\n",
"367 средний\n",
"368 средний\n",
"Name: percapitaincome_level, Length: 369, dtype: category\n",
"Categories (5, object): ['незначительный' < 'низкий' < 'средний' < 'высокий' < 'очень высокий']>\n"
]
}
],
"source": [
"from sklearn.preprocessing import KBinsDiscretizer\n",
"\n",
"bins = [0, 11000, 22000, 33000, 44000, float('inf')]\n",
"labels = ['незначительный', 'низкий', 'средний', 'высокий', 'очень высокий']\n",
"\n",
"df['percapitaincome_level'] = pd.cut(df['percapitaincome'], bins=bins, labels=labels)\n",
"print(df['percapitaincome_level'].head)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"3) Ручной синтез признаков"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"e:\\AIM1.5\\Scripts\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
" agg_primitives: ['max', 'mean', 'min', 'std', 'sum']\n",
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Сгенерированные признаки:\n",
" stock index year index price log_indexprice inflationrate \\\n",
"index \n",
"0 NASDAQ 1980.0 168.61 2.23 0.14 \n",
"1 NASDAQ 1981.0 203.15 2.31 0.10 \n",
"2 NASDAQ 1982.0 188.98 2.28 0.06 \n",
"3 NASDAQ 1983.0 285.43 2.46 0.03 \n",
"4 NASDAQ 1984.0 248.89 2.40 0.04 \n",
"\n",
" oil prices exchange_rate gdppercent percapitaincome \\\n",
"index \n",
"0 21.59 1.0 0.09 12575 \n",
"1 31.77 1.0 0.12 13976 \n",
"2 28.52 1.0 0.04 14434 \n",
"3 26.19 1.0 0.09 15544 \n",
"4 25.88 1.0 0.11 17121 \n",
"\n",
" unemploymentrate ... oil prices * year percapitaincome * USTreasury \\\n",
"index ... \n",
"0 0.07 ... 42748.20 1383.25 \n",
"1 0.08 ... 62936.37 1956.64 \n",
"2 0.10 ... 56526.64 1876.42 \n",
"3 0.10 ... 51934.77 1709.84 \n",
"4 0.08 ... 51345.92 2054.52 \n",
"\n",
" percapitaincome * tradebalance percapitaincome * unemploymentrate \\\n",
"index \n",
"0 -164229.50 880.25 \n",
"1 -174979.52 1118.08 \n",
"2 -288246.98 1443.40 \n",
"3 -802692.16 1554.40 \n",
"4 -1758840.33 1369.68 \n",
"\n",
" percapitaincome * year tradebalance * USTreasury \\\n",
"index \n",
"0 24898500.0 -1.4366 \n",
"1 27686456.0 -1.7528 \n",
"2 28608188.0 -2.5961 \n",
"3 30823752.0 -5.6804 \n",
"4 33968064.0 -12.3276 \n",
"\n",
" tradebalance * unemploymentrate tradebalance * year \\\n",
"index \n",
"0 -0.9142 -25858.80 \n",
"1 -1.0016 -24802.12 \n",
"2 -1.9970 -39580.54 \n",
"3 -5.1640 -102402.12 \n",
"4 -8.2184 -203816.32 \n",
"\n",
" unemploymentrate * USTreasury unemploymentrate * year \n",
"index \n",
"0 0.0077 138.60 \n",
"1 0.0112 158.48 \n",
"2 0.0130 198.20 \n",
"3 0.0110 198.30 \n",
"4 0.0096 158.72 \n",
"\n",
"[5 rows x 207 columns]\n",
"\n",
"Описание:\n",
"[<Feature: stock index>, <Feature: year>, <Feature: index price>, <Feature: log_indexprice>, <Feature: inflationrate>, <Feature: oil prices>, <Feature: exchange_rate>, <Feature: gdppercent>, <Feature: percapitaincome>, <Feature: unemploymentrate>, <Feature: manufacturingoutput>, <Feature: tradebalance>, <Feature: USTreasury>, <Feature: country_China>, <Feature: country_France>, <Feature: country_Germany>, <Feature: country_Hong Kong>, <Feature: country_India>, <Feature: country_Japan>, <Feature: country_Spain>, <Feature: country_United Kingdom>, <Feature: country_United States of America>, <Feature: percapitaincome_level>, <Feature: index price_scaled>, <Feature: log_indexprice_scaled>, <Feature: USTreasury + year>, <Feature: exchange_rate + USTreasury>, <Feature: exchange_rate + gdppercent>, <Feature: exchange_rate + index price>, <Feature: exchange_rate + index price_scaled>, <Feature: exchange_rate + inflationrate>, <Feature: exchange_rate + log_indexprice>, <Feature: exchange_rate + log_indexprice_scaled>, <Feature: exchange_rate + manufacturingoutput>, <Feature: exchange_rate + oil prices>, <Feature: exchange_rate + percapitaincome>, <Feature: exchange_rate + tradebalance>, <Feature: exchange_rate + unemploymentrate>, <Feature: exchange_rate + year>, <Feature: gdppercent + USTreasury>, <Feature: gdppercent + index price>, <Feature: gdppercent + index price_scaled>, <Feature: gdppercent + inflationrate>, <Feature: gdppercent + log_indexprice>, <Feature: gdppercent + log_indexprice_scaled>, <Feature: gdppercent + manufacturingoutput>, <Feature: gdppercent + oil prices>, <Feature: gdppercent + percapitaincome>, <Feature: gdppercent + tradebalance>, <Feature: gdppercent + unemploymentrate>, <Feature: gdppercent + year>, <Feature: index price + USTreasury>, <Feature: index price + index price_scaled>, <Feature: index price + inflationrate>, <Feature: index price + log_indexprice>, <Feature: index price + log_indexprice_scaled>, <Feature: index price + manufacturingoutput>, <Feature: index price + oil prices>, <Feature: index price + percapitaincome>, <Feature: index price + tradebalance>, <Feature: index price + unemploymentrate>, <Feature: index price + year>, <Feature: index price_scaled + USTreasury>, <Feature: index price_scaled + inflationrate>, <Feature: index price_scaled + log_indexprice>, <Feature: index price_scaled + log_indexprice_scaled>, <Feature: index price_scaled + manufacturingoutput>, <Feature: index price_scaled + oil prices>, <Feature: index price_scaled + percapitaincome>, <Feature: index price_scaled + tradebalance>, <Feature: index price_scaled + unemploymentrate>, <Feature: index price_scaled + year>, <Feature: inflationrate + USTreasury>, <Feature: inflationrate + log_indexprice>, <Feature: inflationrate + log_indexprice_scaled>, <Feature: inflationrate + manufacturingoutput>, <Feature: inflationrate + oil prices>, <Feature: inflationrate + percapitaincome>, <Feature: inflationrate + tradebalance>, <Feature: inflationrate + unemploymentrate>, <Feature: inflationrate + year>, <Feature: log_indexprice + USTreasury>, <Feature: log_indexprice + log_indexprice_scaled>, <Feature: log_indexprice + manufacturingoutput>, <Feature: log_indexprice + oil prices>, <Feature: log_indexprice + percapitaincome>, <Feature: log_indexprice + tradebalance>, <Feature: log_indexprice + unemploymentrate>, <Feature: log_indexprice + year>, <Feature: log_indexprice_scaled + USTreasury>, <Feature: log_indexprice_scaled + manufacturingoutput>, <Feature: log_indexprice_scaled + oil prices>, <Feature: log_indexprice_scaled + percapitaincome>, <Feature: log_indexprice_scaled + tradebalance>, <Feature: log_indexprice_scaled + unemploymentrate>, <Feature: log_indexprice_scaled + year>, <Feature: manufacturingoutput + USTreasury>, <Feature: manufacturingoutput + oil prices>, <Feature: manufacturingoutput + percapitaincome>, <Feature: manufacturingoutput + tradebalance>, <Feature: manufacturingoutput + unemploymentrate>, <Feature: manufacturingoutput + year>, <Feature: oil prices + USTreasury>, <Feature: oil prices + perc
]
}
],
"source": [
"# pip install featuretools\n",
"import featuretools as ft\n",
"\n",
"es = ft.EntitySet(id='economy_data')\n",
"es.add_dataframe(\n",
" dataframe=df,\n",
" dataframe_name='economy',\n",
" index='index',\n",
" make_index=True\n",
")\n",
"\n",
"# Автоматическое конструирование\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=es,\n",
" target_dataframe_name='economy',\n",
" agg_primitives=['mean', 'sum', 'max', 'min', 'std'],\n",
" trans_primitives=['add_numeric', 'multiply_numeric'],\n",
" max_depth=2 \n",
")\n",
"\n",
"print(\"Сгенерированные признаки:\")\n",
"print(feature_matrix.head())\n",
"print(\"\\nОписание:\")\n",
"print(feature_defs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"4. Масштабирование"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"scaler_minmax = MinMaxScaler()\n",
"df[['index price_scaled', 'log_indexprice_scaled']] = scaler_minmax.fit_transform(df[['index price', 'log_indexprice']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка качества наборов признаков:\n",
"Набор данных достаточно полный, но требует предварительной обработки (заполнение пропусков, удаление выбросов, нормализация). После обработки он может быть использован для анализа и построения моделей."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Scripts",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}