896 lines
172 KiB
Plaintext
Raw Normal View History

2024-12-16 21:15:20 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ПОПУЛЯЦИЯ ЧЕЛОВЕЧЕСТВА В СТРАНАХ"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 235 entries, 1 to 235\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Country (or dependency) 235 non-null object \n",
" 1 Population2020 235 non-null int64 \n",
" 2 Yearly Change 235 non-null float64\n",
" 3 NetChange 235 non-null object \n",
" 4 Density (P/Km²) 235 non-null object \n",
" 5 LandArea 235 non-null int64 \n",
" 6 Migrants (net) 201 non-null object \n",
" 7 Fert. Rate 235 non-null object \n",
" 8 Med. Age 235 non-null object \n",
" 9 Urban Pop % 235 non-null object \n",
" 10 World Share 235 non-null object \n",
" 11 Net Change 235 non-null int64 \n",
"dtypes: float64(1), int64(3), object(8)\n",
"memory usage: 23.9+ KB\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(\n",
" lambda x: int(\"\".join(x.split(\",\")))\n",
")\n",
"df[\"Net Change\"] = df[\"NetChange\"].apply(\n",
" lambda x: int(\"\".join(x.split(\",\")))\n",
")\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(\n",
" lambda x: int(\"\".join(x.split(\",\")))\n",
")\n",
"\n",
"df.info()\n",
"# print(df['date'].head)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разделим на 3 выборки\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 150\n",
"Размер контрольной выборки: 38\n",
"Размер тестовой выборки: 47\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки (80% - обучение, 20% - тест)\n",
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (80% - обучение, 20% - контроль)\n",
"train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_data))\n",
"print(\"Размер контрольной выборки:\", len(val_data))\n",
"print(\"Размер тестовой выборки:\", len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABVB0lEQVR4nO3deVhUZf8G8HuGZUCRVdkUAUHE3aJUXHJDkTQ1fd0yt9wqNJfSpFJcw9RcUtQ0Qy2XxC2tXjVRcwk3EpdSREVRYMCNVdlmnt8fvsyPkQGBgJmj9+e6zqVzznOe833mzAz3nDlnRiaEECAiIiKSILm+CyAiIiIqLwYZIiIikiwGGSIiIpIsBhkiIiKSLAYZIiIikiwGGSIiIpIsBhkiIiKSLAYZIiIikiwGGSIiA5Sfn4+UlBTEx8fruxSqYLm5uVAqlUhMTNR3KS8EBhkiIgMRGxuLMWPGwMnJCaampnBwcICvry/4Beyl8+OPP+LWrVua2xs2bEBCQoL+Cirk3LlzeOedd1CzZk0oFAo4OTmhX79++i7rhWCs7wJedBs2bMDIkSM1txUKBerWrYtu3bphxowZcHBw0GN1RGQoTp06hYCAANja2mL69Olo1KgRZDIZrKysIJPJ9F2eJBw/fhx79+7FwoULERMTg8DAQFy/fl3fZeHnn3/GwIED4e3tjfnz58PDwwMAYG9vr+fKXgwy/tZS5SoIMnPmzIG7uzuys7Nx4sQJ/PDDD3B1dcXly5dRrVo1fZdJRHqUm5uL5s2bw9LSEgcPHoSVlZW+S5Kkq1evomPHjkhOTgYATJkyBV9//bVea3r48CEaNGiANm3aIDw8HKampnqt50XEIzJVJCAgAK+99hoAYPTo0bCzs8OSJUvw888/Y/DgwXqujoj0ad++fYiJicHVq1cZYv4Fb29v3LhxA5cvX0bNmjU1Rz70KSwsDNnZ2diwYQNDTCXhOTJ60rlzZwBAXFwcgKep/ZNPPkHTpk1hYWEBS0tLBAQE4MKFC0XWzc7OxqxZs+Dl5QUzMzM4OTmhb9++uHHjBgDg1q1bkMlkxU4dO3bU9HX06FHIZDL89NNP+Oyzz+Do6Ijq1aujV69euHPnTpFtnz59Gt27d4eVlRWqVauGDh064OTJkzrH2LFjR53bnzVrVpG2P/74I3x8fGBubg5bW1sMGjRI5/ZLGltharUay5YtQ+PGjWFmZgYHBweMGzcOjx490mrn5uaGnj17FtnO+PHji/Spq/ZFixYVuU8BICcnB8HBwfD09IRCoYCLiwumTZuGnJwcnfdVYcXdbwVT4XMAAGDVqlVo3LgxFAoFnJ2dERgYiNTU1HL1OXz4cNSsWRN5eXlF6urWrRsaNGigNW/Dhg3PfYwBwPnz59G9e3fUqlVLq13h+76gr3Pnzmmte//+fZ33fWn3R8Fj/OjRo1pte/ToUezjsbCC9QsmhUIBLy8vhISElOrclZSUFIwaNQoODg4wMzND8+bNsXHjRq02p06dgru7O3bu3AkPDw+Ympqibt26mDZtGp48eaJpV9r9U9yYR4wYATc3N615ixcvRps2bWBnZwdzc3P4+Phgx44dRfp3c3PDiBEjNLczMjIwfvx41K5dGwqFAvXr18eCBQugVqu11pPJZBg/frzWvJ49exapY8eOHTprTk1NxaRJk+Di4gKFQgFPT0989dVXWtspeF3YsGEDqlevjlatWsHDwwOBgYGQyWRadevy7OuKiYkJ3NzcMHXqVOTm5mraFfcYLaxjx45aj79Tp06hRYsW+PLLLzVjKO6+ys/Px9y5c+Hh4QGFQgE3Nzd89tlnRV43Cl63Dh48iBYtWsDMzAyNGjXCrl27tNoV1Fv4NePvv/+GjY0Nevbsifz8/DLdz4aKR2T0pCB02NnZAQBu3ryJPXv2oH///nB3d0dycjK+/fZbdOjQAf/88w+cnZ0BACqVCj179kRERAQGDRqEiRMnIiMjA7///jsuX76s9Q5k8ODBePPNN7W2GxQUpLOe+fPnQyaT4dNPP0VKSgqWLVsGPz8/REdHw9zcHABw+PBhBAQEwMfHB8HBwZDL5QgLC0Pnzp1x/PhxtGzZski/derUQUhICAAgMzMTH3zwgc5tz5gxAwMGDMDo0aNx7949rFixAm+88QbOnz8Pa2vrIuuMHTsW7du3BwDs2rULu3fv1lo+btw4zcd6H330EeLi4rBy5UqcP38eJ0+ehImJic77oSxSU1M1YytMrVajV69eOHHiBMaOHYuGDRvi0qVLWLp0Ka5du4Y9e/Y8t+/C91uB3377DVu3btWaN2vWLMyePRt+fn744IMPEBMTg9WrV+Ps2bNFxlmaPocOHYpNmzbhwIEDWiFDqVTi8OHDCA4O1lnv0qVLUbNmTQBP92dhaWlpCAgIgBACU6ZMgYuLCwBg8uTJz70fyqK4/aHLsWPH8Ntvv5Wp/88++wwNGzbEkydPNMHf3t4eo0aNKnadJ0+eoGPHjrh+/TrGjx8Pd3d3hIeHY8SIEUhNTcXEiRMBAA8ePMDNmzfx2WefoW/fvvj4449x7tw5LFq0CJcvX8avv/4KmUxW7v1TkuXLl6NXr14YMmQIcnNzsW3bNvTv3x+//PILevToUex6/fr1w++//45hw4ahZcuWOHLkCIKCgnDr1i2sWbOmzHXo8vjxY3To0AEJCQkYN24c6tatiz///BNBQUFISkrCsmXLil33+vXrWLduXZm2V/C6kpOTgwMHDmDx4sUwMzPD3Llzyz2GBw8e4MSJEzhx4gTee+89+Pj4ICIiQud9NXr0aGzcuBH/+c9/8PHHH+P06dMICQnBlStXirzGxcbGYuDAgXj//fcxfPhwhIWFoX///ti/fz+6du2qs5Y7d+6ge/fu8Pb2xvbt22Fs/DQC/Jv72SAIqlRhYWECgDh06JC4d++euHPnjti2bZuws7MT5ubm4u7du0IIIbKzs4VKpdJaNy4uTigUCjFnzhzNvO+//14AEEuWLCmyLbVarVkPgFi0aFGRNo0bNxYdOnTQ3D5y5IgAIGrXri3S09M187dv3y4AiOXLl2v6rl+/vvD399dsRwghHj9+LNzd3UXXrl2LbKtNmzaiSZMmmtv37t0TAERwcLBm3q1bt4SRkZGYP3++1rqXLl0SxsbGRebHxsYKAGLjxo2aecHBwaLwQ/n48eMCgNi8ebPWuvv37y8y39XVVfTo0aNI7YGBgeLZp8eztU+bNk3Y29sLHx8frfv0hx9+EHK5XBw/flxr/TVr1ggA4uTJk0W2V1iHDh1E48aNi8xftGiRACDi4uKEEEKkpKQIU1NT0a1bN63HzsqVKwUA8f3335e5T5VKJerUqSMGDhyo1W7JkiVCJpOJmzdvas1ft26dACBu376tta3C98eBAwcEALF161atdZ+97wueK2fPntVqp+txI0Tp90fBY/zIkSOaea1atRIBAQE6+32WrvWzs7OFXC4XH374YYnrLlu2TAAQP/74o2Zebm6u8PX1FRYWFprn3PDhwwUAMWLECK31Cx7b+/btE0KUfv/88ccfAoA4fPiwVrvhw4cLV1dXrXmPHz/Wup2bmyuaNGkiOnfurDXf1dVVDB8+XAghxL59+wQAMX36dK02I0aMEADEpUuXNPMAiMDAQK12PXr0KFJHeHh4kft57ty5onr16uLatWtabadPny6MjIxEfHy8EOL/X/PCwsI0bQYMGCCaNGkiXFxcNHUXR9f6Qgjh7Ows3nzzTc3t4h6jhT37+O/QoYMAIGbNmqXV7tn7Kjo6WgAQo0eP1mr3ySefFNmXrq6uAoDYuXOnZl5aWppwcnISr7zySpF64+LixMOHD0WjRo1EgwYNxP3797W2Udr72VDxo6Uq4ufnh1q1asHFxQWDBg2ChYUFdu/ejdq1awN4ejWTXP50d6hUKjx48AAWFhZo0KAB/vrrL00/O3fuRM2aNTFhwoQi2/g3VzYMGzYMNWrU0Nz+z3/+AycnJ8271ujoaMTGxuKdd97BgwcPcP/+fdy/fx9ZWVno0qULjh07VuQQZHZ2NszMzErc7q5du6BWqzFgwABNn/f
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABhCklEQVR4nO3deVxU1f8/8NedGRh2kB0UBVFw18QlXBITRbLcSsss0Vz6lpZmttDiVv20rNTStPrklppmprYYLuRSiRuKiimigqCyquzbMHN+fyATI4uAwAzM6/l43GzuPffM+84MM6+599w7khBCgIiIiMiIyPRdABEREVFDYwAiIiIio8MAREREREaHAYiIiIiMDgMQERERGR0GICIiIjI6DEBERERkdBiAiIiIyOgwABERETVi2dnZiI+PR25urr5LaVQYgIiIiO6Rk5ODZcuWaW9nZGRg5cqV+iuoDCEEvvnmGzz88MOwsLCAjY0NvLy8sHHjRn2X1qgwAOnRunXrIEmSdjIzM4OPjw9mzJiBlJQUfZdHRGS0zM3N8d5772HTpk1ITEzE/Pnz8euvv+q7LADAs88+i//7v/9D+/bt8f3332Pfvn3Yv38/Ro8ere/SGhWFvgsgYOHChfDy8kJBQQH+/vtvrFq1Crt370Z0dDQsLCz0XR4RkdGRy+VYsGABJkyYAI1GAxsbG/z+++/6LgsbNmzA1q1bsXHjRjz77LP6LqdRk/hjqPqzbt06TJo0CSdOnECPHj20819//XV8/vnn2Lx5M8aNG6fHComIjNv169eRmJiI9u3bw87OTt/loHPnzujSpQs2bdqk71IaPR4CM0CPPvooACAuLg4AcPv2bcyZMwedO3eGlZUVbGxsEBwcjDNnzpRbt6CgAPPnz4ePjw/MzMzg5uaG0aNH48qVKwCA+Ph4ncNu904BAQHavg4ePAhJkrB161a88847cHV1haWlJYYPH47ExMRy933s2DEMHToUtra2sLCwwIABA/DPP/9UuI0BAQEV3v/8+fPLtd24cSP8/Pxgbm4Oe3t7PPPMMxXef1XbVpZGo8GyZcvQsWNHmJmZwcXFBS+++CLu3Lmj087T0xOPP/54ufuZMWNGuT4rqn3JkiXlHlMAKCwsxLx589CmTRsolUp4eHjgzTffRGFhYYWPVVmVPW6lU3x8vE77r776Ch07doRSqYS7uzumT5+OjIyMWvUZEhICR0dHqFSqcnUNGTIEvr6+OvPuPcRb0WsMAE6fPo2hQ4fCyclJp13Zx760r5MnT+qsm56eXuFjX93no/Q1fvDgQZ22w4YNq/T1WFbp+qWTUqmEj48PFi1ahPt9t6zovm/evAlPT0/06NEDOTk52vmpqamYPHkyXFxcYGZmhq5du2L9+vU6/ZW+/j/99NNy99WpUyftdt9bc1V/h/Pnz4ckSbh48SLGjh0LGxsbODg4YObMmSgoKNC5j+LiYnzwwQfw9vaGUqmEp6cn3nnnnQpf15XV4OnpWa7NTz/9VOXjWFpjenq6zvyTJ09CkiSsW7dOZ/6ff/6J/v37w9LSEnZ2dhgxYgQuXLhQYZ8A0KJFC/j7+0OhUMDV1bXC10tlNZVO1tbW6NWrF3bu3KnTLiAgAJ06daq0n9LntHQbcnNzER0dDQ8PDwwbNgw2NjawtLREQEAA/vrrr3LrX716FWPGjIG9vT0sLCzw8MMPl9uLVZP3+YCAgHJ/vx999BFkMhk2b96sM78mnwf6wkNgBqg0rDg4OAAoeRHv3LkTY8aMgZeXF1JSUvD1119jwIAB+Pfff+Hu7g4AUKvVePzxxxEeHo5nnnkGM2fORHZ2Nvbt24fo6Gh4e3tr72PcuHF47LHHdO43NDS0wno++ugjSJKEt956C6mpqVi2bBkCAwMRFRUFc3NzACVvKsHBwfDz88O8efMgk8mwdu1aPProo/jrr7/Qq1evcv22aNECixYtAlAy4PCll16q8L7ff/99jB07FlOmTEFaWhq+/PJLPPLIIzh9+nSF38imTZuG/v37AwB+/vln7NixQ2f5iy++qN379uqrryIuLg4rVqzA6dOn8c8//8DExKTCx6EmMjIytNtWlkajwfDhw/H3339j2rRpaN++Pc6dO4elS5fi0qVL5d4gK1L2cSu1e/du/PDDDzrz5s+fjwULFiAwMBAvvfQSYmJisGrVKpw4caLcdlanz+effx4bNmzAnj17dMJJcnIy/vzzT8ybN6/CepcuXQpHR0cAJc9nWZmZmQgODoYQArNnz4aHhwcA4LXXXrvv41ATlT0fFTl8+DB2795do/7feecdtG/fHvn5+doPEmdnZ0yePLnafZQ+FiYmJti9ezesrKwAAPn5+QgICMDly5cxY8YMeHl5Ydu2bZg4cSIyMjIwc+bMGtVaOm6k1DfffIMLFy5g6dKl2nldunTRWWfs2LHw9PTEokWLcPToUXzxxRe4c+cONmzYoG0zZcoUrF+/Hk899RRef/11HDt2DIsWLcKFCxfK/Q2WKn3cSutISEio0bbU1P79+xEcHIzWrVtj/vz5yM/Px5dffom+ffvi1KlTOgHsXp999lmNx2aWPs7p6en46quvMGbMGERHR5f7slBdt27dAgB8/PHHcHV1xRtvvAEzMzN8++23CAwMxL59+/DII48AAFJSUtCnTx/k5eXh1VdfhYODA9avX4/hw4fjp59+wqhRo3T6rs77/L3Wrl2L9957D5999pnO4bjafB7ohSC9Wbt2rQAg9u/fL9LS0kRiYqLYsmWLcHBwEObm5uL69etCCCEKCgqEWq3WWTcuLk4olUqxcOFC7bw1a9YIAOLzzz8vd18ajUa7HgCxZMmScm06duwoBgwYoL194MABAUA0b95cZGVlaef/+OOPAoBYvny5tu+2bduKoKAg7f0IIUReXp7w8vISgwcPLndfffr0EZ06ddLeTktLEwDEvHnztPPi4+OFXC4XH330kc66586dEwqFotz82NhYAUCsX79eO2/evHmi7Mv8r7/+EgDEpk2bdNYNCwsrN79Vq1Zi2LBh5WqfPn26uPdP597a33zzTeHs7Cz8/Px0HtPvv/9eyGQy8ddff+msv3r1agFA/PPPP+Xur6wBAwaIjh07lpu/ZMkSAUDExcUJIYRITU0VpqamYsiQITqvnRUrVggAYs2aNTXuU61WixYtWoinn35ap93nn38uJEkSV69e1Zn/7bffCgDi2rVrOvdV9vHYs2ePACB++OEHnXXvfexL/1ZOnDih066i140Q1X8+Sl/jBw4c0M7r3bu3CA4OrrDfe1W0fkFBgZDJZOLll1+u9roFBQUiICBAODs7i8uXL+u0W7ZsmQAgNm7cqJ1XVFQk/P39hZWVlfZvsyZ/22WFhISIVq1aVbis9O9n+PDhOvNffvllAUCcOXNGCCFEVFSUACCmTJmi027OnDkCgPjzzz915u/bt08AEIcOHaq0jtLHZ9u2bRXWdm+NaWlpOvNPnDghAIi1a9dq53Xr1k04OzuLW7duaeedOXNGyGQyMWHChHJ9lkpNTRXW1tba10XZ57uqmsrau3evACB+/PFH7bzK/vZKlT6npdtQetvU1FRcunRJ2y4tLU04ODgIPz8/7bxZs2YJADrvNdnZ2cLLy0t4enpq3xeq+z5fWm/p6+j3338XCoVCvP766zo11+bzQF94CMwABAYGwsnJCR4eHnjmmWdgZWWFHTt2oHnz5gAApVIJmazkqVKr1bh16xasrKzg6+uLU6dOafvZvn07HB0d8corr5S7j3sP2dTEhAkTYG1trb391FNPwc3NTfstOSoqCrGxsXj22Wdx69YtpKenIz09Hbm5uRg0aBAOHz4MjUaj02dBQQHMzMyqvN+ff/4ZGo0GY8eO1faZnp4OV1dXtG3bFgcOHNBpX1RUBKDk8arMtm3bYGtri8GDB+v06efnBysrq3J9qlQqnXbp6enldv3f68aNG/jyyy/x/vvva7/Fl73/9u3bo127djp9lh72vPf+a2v//v0oKirCrFmztK8dAJg6dWqtB3PKZDKMHz8ev/zyC7Kzs7XzN23ahD59+sDLy0u
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYFUlEQVR4nO3deVwU5R8H8M/usrvcIDcoIop4pVmoiPeB4lFqXmnllWkHWmpmUXlmaWoeFWlWXnnlXZk/Tc0rwwvFW0SFQOUQlfvefX5/IBsrh4DA7ujn/XJfuHM8892Z3eXDzDMzMiGEABEREZEEyQ1dABEREVFFMcgQERGRZDHIEBERkWQxyBAREZFkMcgQERGRZDHIEBERkWQxyBAREZFkMcgQERGRZDHIEBERPab09HTExMTg/v37hi7lqcMgQ0RERuuLL76AVqsFAGi1WsyZM8fAFf1n8+bN6Nq1K6ysrGBpaYnatWtj3rx5hi7rqcMgYyRWrVoFmUyme5iamsLb2xvjxo1DfHy8ocsjIjKI1atXY8GCBbh58ya++uorrF692tAlAQA++ugjDB48GFZWVvjhhx+wd+9e7Nu3D++8846hS3vqmBi6ANI3a9YseHp6IisrC3///TeWLl2KXbt24cKFCzA3Nzd0eURE1WrWrFkYPnw4PvzwQ6jVaqxdu9bQJeHQoUP48ssvMWfOHHz00UeGLuepJ+NNI43DqlWrMGrUKJw8eRItWrTQDX///fexcOFCrF+/HkOHDjVghUREhpGQkIBr166hfv36cHR0NHQ5ePHFF3Hv3j0cPXrU0KUQeGjJ6HXp0gUAEBkZCQC4d+8eJk+ejKZNm8LS0hLW1tbo2bMnzp49W2TerKwszJgxA97e3jA1NYWrqyv69++P69evAwCioqL0Dmc9/OjUqZOurYMHD0Imk+GXX37Bxx9/DBcXF1hYWKBPnz6IiYkpsuzjx4+jR48esLGxgbm5OTp27Fjih75Tp07FLn/GjBlFpl27di18fHxgZmYGOzs7DBkypNjll/baCtNqtVi8eDGaNGkCU1NTODs748033yzSYa9OnTp44YUXiixn3LhxRdosrvb58+cXWacAkJ2djenTp8PLywtqtRru7u6YMmUKsrOzi11XhZW03goeUVFRetN/9913aNKkCdRqNdzc3BAYGIikpKQKtTlixAg4ODggNze3SF3du3dHgwYN9IY9fOi0uPcYAJw5cwY9evSAo6Oj3nSF131BW6dOndKbNzExsdh1X9btUfAeP3jwoN60vXv3LvH9WFjB/AUPtVoNb29vzJkzB6X9vfjwfI/6LNy6dQuvv/46nJ2doVar0aRJE6xYsaJIu6V9/h/12ZfJZBg5cqSurRs3bmDQoEGws7ODubk5WrdujT/++KPCr//MmTPo2bMnrK2tYWlpia5du+LYsWN60xRs56ioKDg5OaFNmzawt7dHs2bNIJPJsGrVqlK3x8PvOXNzczRt2hQ//vij3nQjR46EpaVlqW09vA2OHTuGZ555BkOGDIGdnR3MzMzQsmVL7Nixo8i8CQkJGD16NJydnWFqaopnn322yOGxgu2xYMECLFq0CB4eHjAzM0PHjh1x4cKFIvXWqVNHb9jatWshl8sxd+5cveFXrlzBwIEDYWdnB1NTU7Ro0QK//fZbqa9VinhoycgVhA57e3sA+V8oO3bswKBBg+Dp6Yn4+Hh8//336NixIy5dugQ3NzcAgEajwQsvvID9+/djyJAheO+995Camoq9e/fiwoULqFevnm4ZQ4cORa9evfSWGxQUVGw9n3/+OWQyGT788EMkJCRg8eLF8Pf3R1hYGMzMzAAAf/31F3r27AkfHx9Mnz4dcrkcK1euRJcuXXDkyBG0atWqSLu1atXSdeJLS0vD22+/Xeyyp06disGDB+ONN97AnTt38M0336BDhw44c+YMbG1ti8wzduxYtG/fHgCwbds2bN++XW/8m2++qdsb9u677yIyMhLffvstzpw5g6NHj0KpVBa7HsojKSmp2A6KWq0Wffr0wd9//42xY8eiUaNGOH/+PBYtWoSrV68W+6X4sMLrrcCuXbuwYcMGvWEzZszAzJkz4e/vj7fffhvh4eFYunQpTp48WeR1lqXNYcOGYc2aNdizZ49eyIiLi8Nff/2F6dOnF1vvokWL4ODgACB/exaWnJyMnj17QgiBSZMmwd3dHQAwceLER66H8ihpexTn8OHD2LVrV7na//jjj9GoUSNkZmbqgr+TkxNGjx5d7PSNGjXCzz//rHu+fPlyXL58GYsWLdINa9asGQAgPj4erVu3hkwmw7hx4+Do6Ij//e9/GD16NFJSUjBhwgQAj/78+/v76y2z4LNReFjBd0R8fDzatGmDjIwMvPvuu7C3t8fq1avRp08fbNmyBS+99FK5Xv/FixfRvn17WFtbY8qUKVAqlfj+++/RqVMnHDp0CL6+viWu259//hnnz58vy2bQKXjPpaSkYMWKFRgzZgzq1KkDf3//crVT2N27d7F8+XJYWlri3XffhaOjI9auXYv+/ftj3bp1ur3nmZmZ6NSpE65du4Zx48bB09MTmzdvxsiRI5GUlIT33ntPr901a9YgNTUVgYGByMrKwpIlS9ClSxecP38ezs7Oxdby559/4vXXX8e4ceP0DnNdvHgRbdu2Rc2aNfHRRx/BwsICmzZtQr9+/bB169Yi203SBBmFlStXCgBi37594s6dOyImJkZs3LhR2NvbCzMzM3Hz5k0hhBBZWVlCo9HozRsZGSnUarWYNWuWbtiKFSsEALFw4cIiy9Jqtbr5AIj58+cXmaZJkyaiY8eOuucHDhwQAETNmjVFSkqKbvimTZsEALFkyRJd2/Xr1xcBAQG65QghREZGhvD09BTdunUrsqw2bdqIZ555Rvf8zp07AoCYPn26blhUVJRQKBTi888/15v3/PnzwsTEpMjwiIgIAUCsXr1aN2z69Omi8Fv+yJEjAoBYt26d3ry7d+8uMtzDw0P07t27SO2BgYHi4Y/Rw7VPmTJFODk5CR8fH711+vPPPwu5XC6OHDmiN/+yZcsEAHH06NEiyyusY8eOokmTJkWGz58/XwAQkZGRQgghEhIShEqlEt27d9d773z77bcCgFixYkW529RoNKJWrVri5Zdf1ptu4cKFQiaTiRs3bugN/+GHHwQA8e+//+otq/D62LNnjwAgNmzYoDfvw+u+4LNy8uRJvemKe98IUfbtUfAeP3DggG6Yr6+v6NmzZ7HtPqy4+bOysoRcLhfvvPNOqfMWNmLECOHh4VHsuNGjRwtXV1eRmJioN3zIkCHCxsZGZGRkCCHK9vkv7OHPRmETJkwQAPTep6mpqcLT01PUqVNH954q6+vv16+fUKlU4vr167pht2/fFlZWVqJDhw66YQXbueA9l5WVJWrXrq3bHitXriy23pLmF0KIq1evCgBi3rx5umEjRowQFhYWpbb18PYHIACIgwcP6oZlZGSIRo0aCRcXF5GTkyOEEGLx4sUCgFi7dq1uupycHOHn5ycsLS1136UF38WFv+uFEOL48eMCgJg4caJevQXvj1OnTglLS0sxaNCgIr8XunbtKpo2bSqysrJ0w7RarWjTpo2oX79+qa9Xanhoycj4+/vD0dER7u7uGDJkCCwtLbF9+3bUrFkTAKBWqyGX5282jUaDu3fvwtLSEg0aNMDp06d17WzduhUODg4YP358kWU8fCikPIYPHw4rKyvd84EDB8LV1VX3V2tYWBgiIiLwyiuv4O7du0hMTERiYiLS09PRtWtXHD58WHcqZYGsrCyYmpqWutxt27ZBq9Vi8ODBujYTExPh4uKC+vXr48CBA3rT5+TkAMhfXyXZvHkzbGxs0K1bN702fXx8YGlpWaTN3NxcvekSExORlZVVat23bt3CN998g6lTpxbZfb1582Y0atQIDRs21Guz4HDiw8uvqH379iEnJwcTJkzQvXcAYMyYMbC2ti5yiKAs5HI5Xn31Vfz2229ITU3VDV+3bh3atGkDT09PvenLsj0K2inY+1gVStseD9u2bRtOnjxZZHf9oyQnJyMxMRH
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Гистограмма распределения объема в обучающей выборке\n",
"sns.histplot(train_data[\"Population2020\"], kde=True)\n",
"plt.title('Распределение популяции в обучающей выборке')\n",
"plt.show()\n",
"\n",
"# Гистограмма распределения объема в контрольной выборке\n",
"sns.histplot(val_data[\"Population2020\"], kde=True)\n",
"plt.title('Распределение популяции в контрольной выборке')\n",
"plt.show()\n",
"\n",
"# Гистограмма распределения объема в тестовой выборке\n",
"sns.histplot(test_data[\"Population2020\"], kde=True)\n",
"plt.title(\"Распределение популяции в тестовой выборке\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Процесс конструирования признаков\n",
"\n",
"\n",
"\n",
"### Унитарное кодирование категориальных признаков (one-hot encoding)\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"categorical_features = [\n",
"]\n",
"\n",
"# Применение one-hot encoding\n",
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Дискретизация числовых признаков "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>216</th>\n",
" <td>53</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>217</th>\n",
" <td>160</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>218</th>\n",
" <td>60</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219</th>\n",
" <td>10</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220</th>\n",
" <td>150</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>221</th>\n",
" <td>328</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>222</th>\n",
" <td>460</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>223</th>\n",
" <td>240</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224</th>\n",
" <td>90</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>225</th>\n",
" <td>30</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>226</th>\n",
" <td>140</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>227</th>\n",
" <td>20</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>228</th>\n",
" <td>21</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229</th>\n",
" <td>390</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>230</th>\n",
" <td>230</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>231</th>\n",
" <td>100</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>12170</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>233</th>\n",
" <td>260</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>10</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"216 53 (0.0, 5458956.667]\n",
"217 160 (0.0, 5458956.667]\n",
"218 60 (0.0, 5458956.667]\n",
"219 10 (0.0, 5458956.667]\n",
"220 150 (0.0, 5458956.667]\n",
"221 328 (0.0, 5458956.667]\n",
"222 460 (0.0, 5458956.667]\n",
"223 240 (0.0, 5458956.667]\n",
"224 90 (0.0, 5458956.667]\n",
"225 30 (0.0, 5458956.667]\n",
"226 140 (0.0, 5458956.667]\n",
"227 20 (0.0, 5458956.667]\n",
"228 21 (0.0, 5458956.667]\n",
"229 390 (0.0, 5458956.667]\n",
"230 230 (0.0, 5458956.667]\n",
"231 100 (0.0, 5458956.667]\n",
"232 12170 (0.0, 5458956.667]\n",
"233 260 (0.0, 5458956.667]\n",
"234 10 (0.0, 5458956.667]\n",
"235 0 NaN"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"import numpy as np\n",
"\n",
"\n",
"labels = [\"small country\", \"medium country\", \"big country\"]\n",
"num_bins = 3\n",
"\n",
"hist1, bins1 = np.histogram(\n",
" df[\"LandArea\"].fillna(df[\"LandArea\"].median()), bins=num_bins\n",
")\n",
"bins1, hist1\n",
"\n",
"pd.concat([df[\"LandArea\"], pd.cut(df[\"LandArea\"], list(bins1))], axis=1).tail(20)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>medium country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>medium country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>medium country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>big country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>small country</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 medium country\n",
"2 2973190 small country\n",
"3 9147420 medium country\n",
"4 1811570 small country\n",
"5 770880 small country\n",
"6 8358140 medium country\n",
"7 910770 small country\n",
"8 130170 small country\n",
"9 16376870 big country\n",
"10 1943950 small country\n",
"11 364555 small country\n",
"12 1000000 small country\n",
"13 298170 small country\n",
"14 995450 small country\n",
"15 310070 small country\n",
"16 2267050 small country\n",
"17 769630 small country\n",
"18 1628550 small country\n",
"19 348560 small country\n",
"20 510890 small country"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat(\n",
" [df[\"LandArea\"], pd.cut(df[\"LandArea\"], list(bins1), labels=labels)], axis=1\n",
").head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ручной синтез"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Пример синтеза признака площади страны в кв км на душу человека\n",
"train_data_encoded[\"squareforman\"] = (\n",
" train_data_encoded[\"LandArea\"] / train_data_encoded[\"Population2020\"]\n",
")\n",
"val_data_encoded[\"squareforman\"] = (\n",
" val_data_encoded[\"LandArea\"] / val_data_encoded[\"Population2020\"]\n",
")\n",
"test_data_encoded[\"squareforman\"] = (\n",
" test_data_encoded[\"LandArea\"] / test_data_encoded[\"Population2020\"]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"\n",
"# Пример масштабирования числовых признаков\n",
"numerical_features = [\"Population2020\", \"Yearly Change\"]\n",
"\n",
"scaler = StandardScaler()\n",
"train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])\n",
"val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n",
"test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Конструирование признаков с применением фреймворка Featuretools"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
]
}
],
"source": [
"import featuretools as ft\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='pop')\n",
"\n",
"es = es.add_dataframe(dataframe_name='dop', dataframe=train_data_encoded, index='id')\n",
"\n",
"\n",
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=es, target_dataframe_name=\"dop\", max_depth=2\n",
")\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Оценка качества каждого набора признаков\n",
"Предсказательная способность\n",
"Метрики: RMSE, MAE, R²\n",
"\n",
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
"\n",
"Скорость вычисления\n",
"Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
"\n",
"Надежность\n",
"Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
"\n",
"Корреляция\n",
"Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
"\n",
"Цельность\n",
"Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
]
}
],
"source": [
"import featuretools as ft\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='pop')\n",
"es = es.add_dataframe(\n",
" dataframe_name=\"dop\", dataframe=train_data_encoded, index=\"id\"\n",
")\n",
"\n",
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=es, target_dataframe_name=\"dop\", max_depth=2\n",
")\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 0.03387638150888035\n",
"R²: 0.9814104214666138\n",
"MAE: 0.015316153649943631\n",
"Cross-validated RMSE: 0.7480546038440666\n",
"Train RMSE: 0.24468562210527503\n",
"Train R²: 0.9401289463349545\n",
"Train MAE: 0.049477658845671284\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1kAAAIjCAYAAADxz9EgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACE7klEQVR4nOzdd3hU1drG4WcyqaRSkgAhEJoUQZAq1QYERZSiIEhXECV0pSi9N+koFhSPBwVBUFAUFEFFokgVpHcEEkJJ75n9/cHHHEdaApNMyu++rlwy794zeSY7SN6stdcyGYZhCAAAAABgF06ODgAAAAAA+QlNFgAAAADYEU0WAAAAANgRTRYAAAAA2BFNFgAAAADYEU0WAAAAANgRTRYAAAAA2BFNFgAAAADYEU0WAAAAANgRTRYAFEAmk0njxo1zdAyHe+SRR/TII49YH586dUomk0lLly51WKZ/+3fGvGrLli0ymUzasmWLXV+X72UAuRFNFgDco7ffflsmk0n169e/69c4f/68xo0bpz179tgvWC53/Yfu6x8uLi4qV66cunXrphMnTjg6XpZs27ZN48aNU3R0tMMyhISE2Hw9AwIC1KRJE61Zs8Zhmexl/fr1NFIA8hSaLAC4R8uWLVNISIi2b9+uY8eO3dVrnD9/XuPHjy9QTdZ1AwYM0CeffKL33ntPrVq10ooVK1S3bl2dP38+x7OUKVNGSUlJ6tq1a5aet23bNo0fP96hTZYk1axZU5988ok++eQTvfbaazp//rzatWunxYsXOzTXvVq/fr3Gjx9/02NJSUkaNWpUDicCgNujyQKAe3Dy5Elt27ZNs2fPlr+/v5YtW+boSHlOkyZN1KVLF/Xs2VMLFizQrFmzdOXKFX388ce3fE5CQkK2ZDGZTHJ3d5fZbM6W189uQUFB6tKli7p06aJhw4bp119/laenp+bMmePoaNnG3d1dzs7Ojo4BADZosgDgHixbtkyFCxdWq1at9Oyzz96yyYqOjtbgwYMVEhIiNzc3lSpVSt26ddOlS5e0ZcsW1a1bV5LUs2dP63Sv6/cFhYSEqEePHje85r/v1UlNTdWYMWNUu3Zt+fr6ytPTU02aNNHmzZuz/L4iIyPl7Ox809GDw4cPy2QyaeHChZKktLQ0jR8/XhUrVpS7u7uKFi2qxo0b6/vvv8/y55Wkxx57TNK1BlaSxo0bJ5PJpAMHDqhz584qXLiwGjdubD3/v//9r2rXri0PDw8VKVJEzz//vM6ePXvD67733nsqX768PDw8VK9ePf3yyy83nHOre7IOHTqkDh06yN/fXx4eHqpUqZLefPNNa77XX39dklS2bFnr9Tt16lS2ZMyK4sWLq0qVKtavpSTt3r1bTzzxhHx8fOTl5aXHH39cv/32m83zli5dKpPJpJ9//lkvv/yyihYtKh8fH3Xr1k1Xr161OfdW90Td6vv2n3755Rc999xzKl26tNzc3BQcHKzBgwcrKSnJek6PHj20aNEi6+e6/nG7z5+V9/jrr79qyJAh8vf3l6enp9q2bauoqKjb5gaAO+FXPwBwD5YtW6Z27drJ1dVVnTp10jvvvKM//vjD2jRJUnx8vJo0aaKDBw+qV69eqlWrli5duqS1a9fq77//VpUqVTRhwgSNGTNGffr0UZMmTSRJDRs2zFKW2NhYffDBB+rUqZN69+6tuLg4LVmyRKGhodq+fbtq1qyZ6dcKDAzUww8/rM8//1xjx461ObZixQqZzWY999xzkq41GVOnTtVLL72kevXqKTY2Vjt27NCuXbvUvHnzLL0HSTp+/LgkqWjRojb15557ThUrVtSUKVNkGIYkafLkyRo9erQ6dOigl156SVFRUVqwYIGaNm2q3bt3y8/PT5K0ZMkSvfzyy2rYsKEGDRqkEydO6Omnn1aRIkUUHBx82zx//vmnmjRpIhcXF/Xp00chISE6fvy41q1bp8mTJ6tdu3Y6cuSIPvvsM82ZM0fFihWTJPn7++dYxltJS0vT2bNnrV/Lv/76S02aNJGPj4+GDRsmFxcXvfvuu3rkkUf0008/3XBfYVhYmPz8/DRu3DgdPnxY77zzjk6fPm29n+5erVy5UomJiXrllVdUtGhRbd++XQsWLNDff/+tlStXSpJefvllnT9/Xt9//70++eSTO75mVt9j//79VbhwYY0dO1anTp3S3LlzFRYWphUrVtzz+wNQgBkAgLuyY8cOQ5Lx/fffG4ZhGBaLxShVqpQxcOBAm/PGjBljSDJWr159w2tYLBbDMAzjjz/+MCQZH3300Q3nlClTxujevfsN9Ycffth4+OGHrY/T09ONlJQUm3OuXr1qBAYGGr169bKpSzLGjh172/f37rvvGpKMffv22dSrVq1qPPbYY9bHNWrUMFq1anXb17qZzZs3G5KMDz/80IiKijLOnz9vfPPNN0ZISIhhMpmMP/74wzAMwxg7dqwhyejUqZPN80+dOmWYzWZj8uTJNvV9+/YZzs7O1npqaqoREBBg1KxZ0+br89577xmSbL6GJ0+evOE6NG3a1PD29jZOnz5t83muXzvDMIyZM2cakoyTJ09me8ZbKVOmjNGiRQsjKirKiIqKMvbu3Ws8//zzhiSjf//+hmEYRps2bQxXV1fj+PHj1uedP3/e8Pb2Npo2bWqtffTRR4Yko3bt2kZqaqq1PmPGDEOS8dVXX1lrt/pe+vf37fXrvXnzZmstMTHxhudNnTrVMJlMNl/vfv36Gbf6keXfnz+r77FZs2Y213Lw4MGG2Ww2oqOjb/r5ACAzmC4IAHdp2bJlCgwM1KOPPirp2rSljh07avny5crIyLCe98UXX6hGjRpq27btDa9hj9GA68xms1xdXSVJFotFV65cUXp6uurUqaNdu3Zl+fXatWsnZ2dnm9/o79+/XwcOHFDHjh2tNT8/P/311186evToXeXu1auX/P39VbJkSbVq1UoJCQn6+OOPVadOHZvz+vbta/N49erVslgs6tChgy5dumT9KF68uCpWrGidJrljxw5dvHhRffv2tX59pGvT0Hx9fW+bLSoqSj///LN69eql0qVL2xzLzLXLiYz/tHHjRvn7+8vf3181atTQypUr1bVrV02fPl0ZGRnauHGj2rRpo3LlylmfU6JECXXu3Flbt25VbGyszev16dNHLi4u1sevvPKKnJ2dtX79+kxnuh0PDw/rnxMSEnTp0iU1bNhQhmFo9+7dWX69u32P/7yWTZo0UUZGhk6fPn0X7wgArmG6IADchYyMDC1fvlyPPvqozf0u9evX11tvvaVNmzapRYsWkq5Nf2vfvn2O5Pr444/11ltv6dChQ0pLS7PWy5Ytm+XXKlasmB5//HF9/vnnmjhxoqRrUwWdnZ3Vrl0763kTJkzQM888o/vuu0/VqlVTy5Yt1bVrVz3wwAOZ+jxjxoxRkyZNZDabVaxYMVWpUuWmCxn8+z0cPXpUhmGoYsWKN33d683B9R+W/33e9SXjb+f6UvLVqlXL1Hv5t5zI+E/169fXpEmTZDKZVKhQIVWpUsU6HTEiIkKJiYmqVKnSDc+rUqWKLBaLzp49q/vvv99a/3ceLy8vlShRwuZ+s3tx5swZjRkzRmvXrr3hXq+YmJgsv15UVFSW3+O/m+fChQtL0g15ACAraLIA4C78+OOPunDhgpYvX67ly5ffcHzZsmXWJute3WrEJCMjw2YVvP/+97/q0aOH2rRpo9dff10BAQEym82aOnWq9T6nrHr++efVs2dP7dmzRzVr1tTnn3+uxx9/3HrfkSQ1bdpUx48f11dffaWNGzfqgw8+0Jw5c7R48WK99NJLd/wc1atXV7Nmze543j9HPaRro3Umk0nffvvtTVcD9PLyysQ7zF45nbFYsWKZ+lrmhH+O5t7qePPmzXXlyhUNHz5clStXlqenp86dO6cePXrIYrHkSM5brSRp/P99fwBwN2iyAOAuLFu2TAEBAdZVz/5p9erVWrNmjRYvXiwPDw+VL19e+/fvv+3r3W7qWeHChW+6/9Lp06dtRjlWrVqlcuXKafXq1Tav9++FK7KiTZs2evnll61TBo8cOaKRI0fecF6
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"from sklearn.model_selection import cross_val_score\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Удаление строк с NaN\n",
"feature_matrix = feature_matrix.dropna()\n",
"val_feature_matrix = val_feature_matrix.dropna()\n",
"test_feature_matrix = test_feature_matrix.dropna()\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train = feature_matrix.drop(\"Population2020\", axis=1)\n",
"y_train = feature_matrix[\"Population2020\"]\n",
"X_val = val_feature_matrix.drop(\"Population2020\", axis=1)\n",
"y_val = val_feature_matrix[\"Population2020\"]\n",
"X_test = test_feature_matrix.drop(\"Population2020\", axis=1)\n",
"y_test = test_feature_matrix[\"Population2020\"]\n",
"\n",
"# Выбор модели\n",
"model = RandomForestRegressor(random_state=42)\n",
"\n",
"# Обучение модели\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Предсказание и оценка\n",
"y_pred = model.predict(X_test)\n",
"\n",
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
"r2 = r2_score(y_test, y_pred)\n",
"mae = mean_absolute_error(y_test, y_pred)\n",
"\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"print(f\"MAE: {mae}\")\n",
"\n",
"# Кросс-валидация\n",
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
"rmse_cv = (-scores.mean())**0.5\n",
"print(f\"Cross-validated RMSE: {rmse_cv}\")\n",
"\n",
"# Анализ важности признаков\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train.columns\n",
"\n",
"\n",
"# Проверка на переобучение\n",
"y_train_pred = model.predict(X_train)\n",
"\n",
"rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n",
"r2_train = r2_score(y_train, y_train_pred)\n",
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
"\n",
"print(f\"Train RMSE: {rmse_train}\")\n",
"print(f\"Train R²: {r2_train}\")\n",
"print(f\"Train MAE: {mae_train}\")\n",
"\n",
"# Визуализация результатов\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
"plt.xlabel(\"Actual Population\")\n",
"plt.ylabel(\"Predicted Population\")\n",
"plt.title(\"Actual vs Predicted Population\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Точность предсказаний: Модель показывает довольно высокий R² (0.98), что указывает на хорошее объяснение вариации. Значения RMSE и MAE довольно низки, что говорит о том, что модель достаточно точно предсказывает цены.\n",
"\n",
"Переобучение: Разница между RMSE на обучающей и тестовой выборках не очень большая, что указывает на то, что переобучение не является критическим. Однако, стоит быть осторожным и продолжать мониторинг этого показателя.\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aisenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}