896 lines
172 KiB
Plaintext
896 lines
172 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"ПОПУЛЯЦИЯ ЧЕЛОВЕЧЕСТВА В СТРАНАХ"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"Index: 235 entries, 1 to 235\n",
|
|||
|
"Data columns (total 12 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Country (or dependency) 235 non-null object \n",
|
|||
|
" 1 Population2020 235 non-null int64 \n",
|
|||
|
" 2 Yearly Change 235 non-null float64\n",
|
|||
|
" 3 NetChange 235 non-null object \n",
|
|||
|
" 4 Density (P/Km²) 235 non-null object \n",
|
|||
|
" 5 LandArea 235 non-null int64 \n",
|
|||
|
" 6 Migrants (net) 201 non-null object \n",
|
|||
|
" 7 Fert. Rate 235 non-null object \n",
|
|||
|
" 8 Med. Age 235 non-null object \n",
|
|||
|
" 9 Urban Pop % 235 non-null object \n",
|
|||
|
" 10 World Share 235 non-null object \n",
|
|||
|
" 11 Net Change 235 non-null int64 \n",
|
|||
|
"dtypes: float64(1), int64(3), object(8)\n",
|
|||
|
"memory usage: 23.9+ KB\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\n",
|
|||
|
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"df[\"Population2020\"] = df[\"Population2020\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"df[\"Net Change\"] = df[\"NetChange\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
|
|||
|
"df[\"LandArea\"] = df[\"LandArea\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"df.info()\n",
|
|||
|
"# print(df['date'].head)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разделим на 3 выборки\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 150\n",
|
|||
|
"Размер контрольной выборки: 38\n",
|
|||
|
"Размер тестовой выборки: 47\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки (80% - обучение, 20% - тест)\n",
|
|||
|
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (80% - обучение, 20% - контроль)\n",
|
|||
|
"train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_data))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_data))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_data))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABVB0lEQVR4nO3deVhUZf8G8HuGZUCRVdkUAUHE3aJUXHJDkTQ1fd0yt9wqNJfSpFJcw9RcUtQ0Qy2XxC2tXjVRcwk3EpdSREVRYMCNVdlmnt8fvsyPkQGBgJmj9+e6zqVzznOe833mzAz3nDlnRiaEECAiIiKSILm+CyAiIiIqLwYZIiIikiwGGSIiIpIsBhkiIiKSLAYZIiIikiwGGSIiIpIsBhkiIiKSLAYZIiIikiwGGSIiA5Sfn4+UlBTEx8fruxSqYLm5uVAqlUhMTNR3KS8EBhkiIgMRGxuLMWPGwMnJCaampnBwcICvry/4Beyl8+OPP+LWrVua2xs2bEBCQoL+Cirk3LlzeOedd1CzZk0oFAo4OTmhX79++i7rhWCs7wJedBs2bMDIkSM1txUKBerWrYtu3bphxowZcHBw0GN1RGQoTp06hYCAANja2mL69Olo1KgRZDIZrKysIJPJ9F2eJBw/fhx79+7FwoULERMTg8DAQFy/fl3fZeHnn3/GwIED4e3tjfnz58PDwwMAYG9vr+fKXgwy/tZS5SoIMnPmzIG7uzuys7Nx4sQJ/PDDD3B1dcXly5dRrVo1fZdJRHqUm5uL5s2bw9LSEgcPHoSVlZW+S5Kkq1evomPHjkhOTgYATJkyBV9//bVea3r48CEaNGiANm3aIDw8HKampnqt50XEIzJVJCAgAK+99hoAYPTo0bCzs8OSJUvw888/Y/DgwXqujoj0ad++fYiJicHVq1cZYv4Fb29v3LhxA5cvX0bNmjU1Rz70KSwsDNnZ2diwYQNDTCXhOTJ60rlzZwBAXFwcgKep/ZNPPkHTpk1hYWEBS0tLBAQE4MKFC0XWzc7OxqxZs+Dl5QUzMzM4OTmhb9++uHHjBgDg1q1bkMlkxU4dO3bU9HX06FHIZDL89NNP+Oyzz+Do6Ijq1aujV69euHPnTpFtnz59Gt27d4eVlRWqVauGDh064OTJkzrH2LFjR53bnzVrVpG2P/74I3x8fGBubg5bW1sMGjRI5/ZLGltharUay5YtQ+PGjWFmZgYHBweMGzcOjx490mrn5uaGnj17FtnO+PHji/Spq/ZFixYVuU8BICcnB8HBwfD09IRCoYCLiwumTZuGnJwcnfdVYcXdbwVT4XMAAGDVqlVo3LgxFAoFnJ2dERgYiNTU1HL1OXz4cNSsWRN5eXlF6urWrRsaNGigNW/Dhg3PfYwBwPnz59G9e3fUqlVLq13h+76gr3Pnzmmte//+fZ33fWn3R8Fj/OjRo1pte/ToUezjsbCC9QsmhUIBLy8vhISElOrclZSUFIwaNQoODg4wMzND8+bNsXHjRq02p06dgru7O3bu3AkPDw+Ympqibt26mDZtGp48eaJpV9r9U9yYR4wYATc3N615ixcvRps2bWBnZwdzc3P4+Phgx44dRfp3c3PDiBEjNLczMjIwfvx41K5dGwqFAvXr18eCBQugVqu11pPJZBg/frzWvJ49exapY8eOHTprTk1NxaRJk+Di4gKFQgFPT0989dVXWtspeF3YsGEDqlevjlatWsHDwwOBgYGQyWRadevy7OuKiYkJ3NzcMHXqVOTm5mraFfcYLaxjx45aj79Tp06hRYsW+PLLLzVjKO6+ys/Px9y5c+Hh4QGFQgE3Nzd89tlnRV43Cl63Dh48iBYtWsDMzAyNGjXCrl27tNoV1Fv4NePvv/+GjY0Nevbsifz8/DLdz4aKR2T0pCB02NnZAQBu3ryJPXv2oH///nB3d0dycjK+/fZbdOjQAf/88w+cnZ0BACqVCj179kRERAQGDRqEiRMnIiMjA7///jsuX76s9Q5k8ODBePPNN7W2GxQUpLOe+fPnQyaT4dNPP0VKSgqWLVsGPz8/REdHw9zcHABw+PBhBAQEwMfHB8HBwZDL5QgLC0Pnzp1x/PhxtGzZski/derUQUhICAAgMzMTH3zwgc5tz5gxAwMGDMDo0aNx7949rFixAm+88QbOnz8Pa2vrIuuMHTsW7du3BwDs2rULu3fv1lo+btw4zcd6H330EeLi4rBy5UqcP38eJ0+ehImJic77oSxSU1M1YytMrVajV69eOHHiBMaOHYuGDRvi0qVLWLp0Ka5du4Y9e/Y8t+/C91uB3377DVu3btWaN2vWLMyePRt+fn744IMPEBMTg9WrV+Ps2bNFxlmaPocOHYpNmzbhwIEDWiFDqVTi8OHDCA4O1lnv0qVLUbNmTQBP92dhaWlpCAgIgBACU6ZMgYuLCwBg8uTJz70fyqK4/aHLsWPH8Ntvv5Wp/88++wwNGzbEkydPNMHf3t4eo0aNKnadJ0+eoGPHjrh+/TrGjx8Pd3d3hIeHY8SIEUhNTcXEiRMBAA8ePMDNmzfx2WefoW/fvvj4449x7tw5LFq0CJcvX8avv/4KmUxW7v1TkuXLl6NXr14YMmQIcnNzsW3bNvTv3x+//PILevToUex6/fr1w++//45hw4ahZcuWOHLkCIKCgnDr1i2sWbOmzHXo8vjxY3To0AEJCQkYN24c6tatiz///BNBQUFISkrCsmXLil33+vXrWLduXZm2V/C6kpOTgwMHDmDx4sUwMzPD3Llzyz2GBw8e4MSJEzhx4gTee+89+Pj4ICIiQud9NXr0aGzcuBH/+c9/8PHHH+P06dMICQnBlStXirzGxcbGYuDAgXj//fcxfPhwhIWFoX///ti/fz+6du2qs5Y7d+6ge/fu8Pb2xvbt22Fs/DQC/Jv72SAIqlRhYWECgDh06JC4d++euHPnjti2bZuws7MT5ubm4u7du0IIIbKzs4VKpdJaNy4uTigUCjFnzhzNvO+//14AEEuWLCmyLbVarVkPgFi0aFGRNo0bNxYdOnTQ3D5y5IgAIGrXri3S09M187dv3y4AiOXLl2v6rl+/vvD399dsRwghHj9+LNzd3UXXrl2LbKtNmzaiSZMmmtv37t0TAERwcLBm3q1bt4SRkZGYP3++1rqXLl0SxsbGRebHxsYKAGLjxo2aecHBwaLwQ/n48eMCgNi8ebPWuvv37y8y39XVVfTo0aNI7YGBgeLZp8eztU+bNk3Y29sLHx8frfv0hx9+EHK5XBw/flxr/TVr1ggA4uTJk0W2V1iHDh1E48aNi8xftGiRACDi4uKEEEKkpKQIU1NT0a1bN63HzsqVKwUA8f3335e5T5VKJerUqSMGDhyo1W7JkiVCJpOJmzdvas1ft26dACBu376tta3C98eBAwcEALF161atdZ+97wueK2fPntVqp+txI0Tp90fBY/zIkSOaea1atRIBAQE6+32WrvWzs7OFXC4XH374YYnrLlu2TAAQP/74o2Zebm6u8PX1FRYWFprn3PDhwwUAMWLECK31Cx7b+/btE0KUfv/88ccfAoA4fPiwVrvhw4cLV1dXrXmPHz/Wup2bmyuaNGkiOnfurDXf1dVVDB8+XAghxL59+wQAMX36dK02I0aMEADEpUuXNPMAiMDAQK12PXr0KFJHeHh4kft57ty5onr16uLatWtabadPny6MjIxEfHy8EOL/X/PCwsI0bQYMGCCaNGkiXFxcNHUXR9f6Qgjh7Ows3nzzTc3t4h6jhT37+O/QoYMAIGbNmqXV7tn7Kjo6WgAQo0eP1mr3ySefFNmXrq6uAoDYuXOnZl5aWppwcnISr7zySpF64+LixMOHD0WjRo1EgwYNxP3797W2Udr72VDxo6Uq4ufnh1q1asHFxQWDBg2ChYUFdu/ejdq1awN4ejWTXP50d6hUKjx48AAWFhZo0KAB/vrrL00/O3fuRM2aNTFhwoQi2/g3VzYMGzYMNWrU0Nz+z3/+AycnJ8271ujoaMTGxuKdd97BgwcPcP/+fdy/fx9ZWVno0qULjh07VuQQZHZ2NszMzErc7q5du6BWqzFgwABNn/f
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABhCklEQVR4nO3deVxU1f8/8NedGRh2kB0UBVFw18QlXBITRbLcSsss0Vz6lpZmttDiVv20rNTStPrklppmprYYLuRSiRuKiimigqCyquzbMHN+fyATI4uAwAzM6/l43GzuPffM+84MM6+599w7khBCgIiIiMiIyPRdABEREVFDYwAiIiIio8MAREREREaHAYiIiIiMDgMQERERGR0GICIiIjI6DEBERERkdBiAiIiIyOgwABERETVi2dnZiI+PR25urr5LaVQYgIiIiO6Rk5ODZcuWaW9nZGRg5cqV+iuoDCEEvvnmGzz88MOwsLCAjY0NvLy8sHHjRn2X1qgwAOnRunXrIEmSdjIzM4OPjw9mzJiBlJQUfZdHRGS0zM3N8d5772HTpk1ITEzE/Pnz8euvv+q7LADAs88+i//7v/9D+/bt8f3332Pfvn3Yv38/Ro8ere/SGhWFvgsgYOHChfDy8kJBQQH+/vtvrFq1Crt370Z0dDQsLCz0XR4RkdGRy+VYsGABJkyYAI1GAxsbG/z+++/6LgsbNmzA1q1bsXHjRjz77LP6LqdRk/hjqPqzbt06TJo0CSdOnECPHj20819//XV8/vnn2Lx5M8aNG6fHComIjNv169eRmJiI9u3bw87OTt/loHPnzujSpQs2bdqk71IaPR4CM0CPPvooACAuLg4AcPv2bcyZMwedO3eGlZUVbGxsEBwcjDNnzpRbt6CgAPPnz4ePjw/MzMzg5uaG0aNH48qVKwCA+Ph4ncNu904BAQHavg4ePAhJkrB161a88847cHV1haWlJYYPH47ExMRy933s2DEMHToUtra2sLCwwIABA/DPP/9UuI0BAQEV3v/8+fPLtd24cSP8/Pxgbm4Oe3t7PPPMMxXef1XbVpZGo8GyZcvQsWNHmJmZwcXFBS+++CLu3Lmj087T0xOPP/54ufuZMWNGuT4rqn3JkiXlHlMAKCwsxLx589CmTRsolUp4eHjgzTffRGFhYYWPVVmVPW6lU3x8vE77r776Ch07doRSqYS7uzumT5+OjIyMWvUZEhICR0dHqFSqcnUNGTIEvr6+OvPuPcRb0WsMAE6fPo2hQ4fCyclJp13Zx760r5MnT+qsm56eXuFjX93no/Q1fvDgQZ22w4YNq/T1WFbp+qWTUqmEj48PFi1ahPt9t6zovm/evAlPT0/06NEDOTk52vmpqamYPHkyXFxcYGZmhq5du2L9+vU6/ZW+/j/99NNy99WpUyftdt9bc1V/h/Pnz4ckSbh48SLGjh0LGxsbODg4YObMmSgoKNC5j+LiYnzwwQfw9vaGUqmEp6cn3nnnnQpf15XV4OnpWa7NTz/9VOXjWFpjenq6zvyTJ09CkiSsW7dOZ/6ff/6J/v37w9LSEnZ2dhgxYgQuXLhQYZ8A0KJFC/j7+0OhUMDV1bXC10tlNZVO1tbW6NWrF3bu3KnTLiAgAJ06daq0n9LntHQbcnNzER0dDQ8PDwwbNgw2NjawtLREQEAA/vrrr3LrX716FWPGjIG9vT0sLCzw8MMPl9uLVZP3+YCAgHJ/vx999BFkMhk2b96sM78mnwf6wkNgBqg0rDg4OAAoeRHv3LkTY8aMgZeXF1JSUvD1119jwIAB+Pfff+Hu7g4AUKvVePzxxxEeHo5nnnkGM2fORHZ2Nvbt24fo6Gh4e3tr72PcuHF47LHHdO43NDS0wno++ugjSJKEt956C6mpqVi2bBkCAwMRFRUFc3NzACVvKsHBwfDz88O8efMgk8mwdu1aPProo/jrr7/Qq1evcv22aNECixYtAlAy4PCll16q8L7ff/99jB07FlOmTEFaWhq+/PJLPPLIIzh9+nSF38imTZuG/v37AwB+/vln7NixQ2f5iy++qN379uqrryIuLg4rVqzA6dOn8c8//8DExKTCx6EmMjIytNtWlkajwfDhw/H3339j2rRpaN++Pc6dO4elS5fi0qVL5d4gK1L2cSu1e/du/PDDDzrz5s+fjwULFiAwMBAvvfQSYmJisGrVKpw4caLcdlanz+effx4bNmzAnj17dMJJcnIy/vzzT8ybN6/CepcuXQpHR0cAJc9nWZmZmQgODoYQArNnz4aHhwcA4LXXXrvv41ATlT0fFTl8+DB2795do/7feecdtG/fHvn5+doPEmdnZ0yePLnafZQ+FiYmJti9ezesrKwAAPn5+QgICMDly5cxY8YMeHl5Ydu2bZg4cSIyMjIwc+bMGtVaOm6k1DfffIMLFy5g6dKl2nldunTRWWfs2LHw9PTEokWLcPToUXzxxRe4c+cONmzYoG0zZcoUrF+/Hk899RRef/11HDt2DIsWLcKFCxfK/Q2WKn3cSutISEio0bbU1P79+xEcHIzWrVtj/vz5yM/Px5dffom+ffvi1KlTOgHsXp999lmNx2aWPs7p6en46quvMGbMGERHR5f7slBdt27dAgB8/PHHcHV1xRtvvAEzMzN8++23CAwMxL59+/DII48AAFJSUtCnTx/k5eXh1VdfhYODA9avX4/hw4fjp59+wqhRo3T6rs77/L3Wrl2L9957D5999pnO4bjafB7ohSC9Wbt2rQAg9u/fL9LS0kRiYqLYsmWLcHBwEObm5uL69etCCCEKCgqEWq3WWTcuLk4olUqxcOFC7bw1a9YIAOLzzz8vd18ajUa7HgCxZMmScm06duwoBgwYoL194MABAUA0b95cZGVlaef/+OOPAoBYvny5tu+2bduKoKAg7f0IIUReXp7w8vISgwcPLndfffr0EZ06ddLeTktLEwDEvHnztPPi4+OFXC4XH330kc66586dEwqFotz82NhYAUCsX79eO2/evHmi7Mv8r7/+EgDEpk2bdNYNCwsrN79Vq1Zi2LBh5WqfPn26uPdP597a33zzTeHs7Cz8/Px0HtPvv/9eyGQy8ddff+msv3r1agFA/PPPP+Xur6wBAwaIjh07lpu/ZMkSAUDExcUJIYRITU0VpqamYsiQITqvnRUrVggAYs2aNTXuU61WixYtWoinn35ap93nn38uJEkSV69e1Zn/7bffCgDi2rVrOvdV9vHYs2ePACB++OEHnXXvfexL/1ZOnDih066i140Q1X8+Sl/jBw4c0M7r3bu3CA4OrrDfe1W0fkFBgZDJZOLll1+u9roFBQUiICBAODs7i8uXL+u0W7ZsmQAgNm7cqJ1XVFQk/P39hZWVlfZvsyZ/22WFhISIVq1aVbis9O9n+PDhOvNffvllAUCcOXNGCCFEVFSUACCmTJmi027OnDkCgPjzzz915u/bt08AEIcOHaq0jtLHZ9u2bRXWdm+NaWlpOvNPnDghAIi1a9dq53Xr1k04OzuLW7duaeedOXNGyGQyMWHChHJ9lkpNTRXW1tba10XZ57uqmsrau3evACB+/PFH7bzK/vZKlT6npdtQetvU1FRcunRJ2y4tLU04ODgIPz8/7bxZs2YJADrvNdnZ2cLLy0t4enpq3xeq+z5fWm/p6+j3338XCoVCvP766zo11+bzQF94CMwABAYGwsnJCR4eHnjmmWdgZWWFHTt2oHnz5gAApVIJmazkqVKr1bh16xasrKzg6+uLU6dOafvZvn07HB0d8corr5S7j3sP2dTEhAkTYG1trb391FNPwc3NTfstOSoqCrGxsXj22Wdx69YtpKenIz09Hbm5uRg0aBAOHz4MjUaj02dBQQHMzMyqvN+ff/4ZGo0GY8eO1faZnp4OV1dXtG3bFgcOHNBpX1RUBKDk8arMtm3bYGtri8GDB+v06efnBysrq3J9qlQqnXbp6enldv3f68aNG/jyyy/x/vvva7/Fl73/9u3bo127djp9lh72vPf+a2v//v0oKirCrFmztK8dAJg6dWqtB3PKZDKMHz8ev/zyC7Kzs7XzN23ahD59+sDLy0u
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYFUlEQVR4nO3deVwU5R8H8M/usrvcIDcoIop4pVmoiPeB4lFqXmnllWkHWmpmUXlmaWoeFWlWXnnlXZk/Tc0rwwvFW0SFQOUQlfvefX5/IBsrh4DA7ujn/XJfuHM8892Z3eXDzDMzMiGEABEREZEEyQ1dABEREVFFMcgQERGRZDHIEBERkWQxyBAREZFkMcgQERGRZDHIEBERkWQxyBAREZFkMcgQERGRZDHIEBERPab09HTExMTg/v37hi7lqcMgQ0RERuuLL76AVqsFAGi1WsyZM8fAFf1n8+bN6Nq1K6ysrGBpaYnatWtj3rx5hi7rqcMgYyRWrVoFmUyme5iamsLb2xvjxo1DfHy8ocsjIjKI1atXY8GCBbh58ya++uorrF692tAlAQA++ugjDB48GFZWVvjhhx+wd+9e7Nu3D++8846hS3vqmBi6ANI3a9YseHp6IisrC3///TeWLl2KXbt24cKFCzA3Nzd0eURE1WrWrFkYPnw4PvzwQ6jVaqxdu9bQJeHQoUP48ssvMWfOHHz00UeGLuepJ+NNI43DqlWrMGrUKJw8eRItWrTQDX///fexcOFCrF+/HkOHDjVghUREhpGQkIBr166hfv36cHR0NHQ5ePHFF3Hv3j0cPXrU0KUQeGjJ6HXp0gUAEBkZCQC4d+8eJk+ejKZNm8LS0hLW1tbo2bMnzp49W2TerKwszJgxA97e3jA1NYWrqyv69++P69evAwCioqL0Dmc9/OjUqZOurYMHD0Imk+GXX37Bxx9/DBcXF1hYWKBPnz6IiYkpsuzjx4+jR48esLGxgbm5OTp27Fjih75Tp07FLn/GjBlFpl27di18fHxgZmYGOzs7DBkypNjll/baCtNqtVi8eDGaNGkCU1NTODs748033yzSYa9OnTp44YUXiixn3LhxRdosrvb58+cXWacAkJ2djenTp8PLywtqtRru7u6YMmUKsrOzi11XhZW03goeUVFRetN/9913aNKkCdRqNdzc3BAYGIikpKQKtTlixAg4ODggNze3SF3du3dHgwYN9IY9fOi0uPcYAJw5cwY9evSAo6Oj3nSF131BW6dOndKbNzExsdh1X9btUfAeP3jwoN60vXv3LvH9WFjB/AUPtVoNb29vzJkzB6X9vfjwfI/6LNy6dQuvv/46nJ2doVar0aRJE6xYsaJIu6V9/h/12ZfJZBg5cqSurRs3bmDQoEGws7ODubk5WrdujT/++KPCr//MmTPo2bMnrK2tYWlpia5du+LYsWN60xRs56ioKDg5OaFNmzawt7dHs2bNIJPJsGrVqlK3x8PvOXNzczRt2hQ//vij3nQjR46EpaVlqW09vA2OHTuGZ555BkOGDIGdnR3MzMzQsmVL7Nixo8i8CQkJGD16NJydnWFqaopnn322yOGxgu2xYMECLFq0CB4eHjAzM0PHjh1x4cKFIvXWqVNHb9jatWshl8sxd+5cveFXrlzBwIEDYWdnB1NTU7Ro0QK//fZbqa9VinhoycgVhA57e3sA+V8oO3bswKBBg+Dp6Yn4+Hh8//336NixIy5dugQ3NzcAgEajwQsvvID9+/djyJAheO+995Camoq9e/fiwoULqFevnm4ZQ4cORa9evfSWGxQUVGw9n3/+OWQyGT788EMkJCRg8eLF8Pf3R1hYGMzMzAAAf/31F3r27AkfHx9Mnz4dcrkcK1euRJcuXXDkyBG0atWqSLu1atXSdeJLS0vD22+/Xeyyp06disGDB+ONN97AnTt38M0336BDhw44c+YMbG1ti8wzduxYtG/fHgCwbds2bN++XW/8m2++qdsb9u677yIyMhLffvstzpw5g6NHj0KpVBa7HsojKSmp2A6KWq0Wffr0wd9//42xY8eiUaNGOH/+PBYtWoSrV68W+6X4sMLrrcCuXbuwYcMGvWEzZszAzJkz4e/vj7fffhvh4eFYunQpTp48WeR1lqXNYcOGYc2aNdizZ49eyIiLi8Nff/2F6dOnF1vvokWL4ODgACB/exaWnJyMnj17QgiBSZMmwd3dHQAwceLER66H8ihpexTn8OHD2LVrV7na//jjj9GoUSNkZmbqgr+TkxNGjx5d7PSNGjXCzz//rHu+fPlyXL58GYsWLdINa9asGQAgPj4erVu3hkwmw7hx4+Do6Ij//e9/GD16NFJSUjBhwgQAj/78+/v76y2z4LNReFjBd0R8fDzatGmDjIwMvPvuu7C3t8fq1avRp08fbNmyBS+99FK5Xv/FixfRvn17WFtbY8qUKVAqlfj+++/RqVMnHDp0CL6+viWu259//hnnz58vy2bQKXjPpaSkYMWKFRgzZgzq1KkDf3//crVT2N27d7F8+XJYWlri3XffhaOjI9auXYv+/ftj3bp1ur3nmZmZ6NSpE65du4Zx48bB09MTmzdvxsiRI5GUlIT33ntPr901a9YgNTUVgYGByMrKwpIlS9ClSxecP38ezs7Oxdby559/4vXXX8e4ceP0DnNdvHgRbdu2Rc2aNfHRRx/BwsICmzZtQr9+/bB169Yi203SBBmFlStXCgBi37594s6dOyImJkZs3LhR2NvbCzMzM3Hz5k0hhBBZWVlCo9HozRsZGSnUarWYNWuWbtiKFSsEALFw4cIiy9Jqtbr5AIj58+cXmaZJkyaiY8eOuucHDhwQAETNmjVFSkqKbvimTZsEALFkyRJd2/Xr1xcBAQG65QghREZGhvD09BTdunUrsqw2bdqIZ555Rvf8zp07AoCYPn26blhUVJRQKBTi888/15v3/PnzwsTEpMjwiIgIAUCsXr1aN2z69Omi8Fv+yJEjAoBYt26d3ry7d+8uMtzDw0P07t27SO2BgYHi4Y/Rw7VPmTJFODk5CR8fH711+vPPPwu5XC6OHDmiN/+yZcsEAHH06NEiyyusY8eOokmTJkWGz58/XwAQkZGRQgghEhIShEqlEt27d9d773z77bcCgFixYkW529RoNKJWrVri5Zdf1ptu4cKFQiaTiRs3bugN/+GHHwQA8e+//+otq/D62LNnjwAgNmzYoDfvw+u+4LNy8uRJvemKe98IUfbtUfAeP3DggG6Yr6+v6NmzZ7HtPqy4+bOysoRcLhfvvPNOqfMWNmLECOHh4VHsuNGjRwtXV1eRmJioN3zIkCHCxsZGZGRkCCHK9vkv7OHPRmETJkwQAPTep6mpqcLT01PUqVNH954q6+vv16+fUKlU4vr167pht2/fFlZWVqJDhw66YQXbueA9l5WVJWrXrq3bHitXriy23pLmF0KIq1evCgBi3rx5umEjRowQFhYWpbb18PYHIACIgwcP6oZlZGSIRo0aCRcXF5GTkyOEEGLx4sUCgFi7dq1uupycHOHn5ycsLS1136UF38WFv+uFEOL48eMCgJg4caJevQXvj1OnTglLS0sxaNCgIr8XunbtKpo2bSqysrJ0w7RarWjTpo2oX79+qa9Xanhoycj4+/vD0dER7u7uGDJkCCwtLbF9+3bUrFkTAKBWqyGX5282jUaDu3fvwtLSEg0aNMDp06d17WzduhUODg4YP358kWU8fCikPIYPHw4rKyvd84EDB8LV1VX3V2tYWBgiIiLwyiuv4O7du0hMTERiYiLS09PRtWtXHD58WHcqZYGsrCyYmpqWutxt27ZBq9Vi8ODBujYTExPh4uKC+vXr48CBA3rT5+TkAMhfXyXZvHkzbGxs0K1bN702fXx8YGlpWaTN3NxcvekSExORlZVVat23bt3CN998g6lTpxbZfb1582Y0atQIDRs21Guz4HDiw8uvqH379iEnJwcTJkzQvXcAYMyYMbC2ti5yiKAs5HI5Xn31Vfz2229ITU3VDV+3bh3atGkDT09PvenLsj0K2inY+1gVStseD9u2bRtOnjxZZHf9oyQnJyMxMRH
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Гистограмма распределения объема в обучающей выборке\n",
|
|||
|
"sns.histplot(train_data[\"Population2020\"], kde=True)\n",
|
|||
|
"plt.title('Распределение популяции в обучающей выборке')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Гистограмма распределения объема в контрольной выборке\n",
|
|||
|
"sns.histplot(val_data[\"Population2020\"], kde=True)\n",
|
|||
|
"plt.title('Распределение популяции в контрольной выборке')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Гистограмма распределения объема в тестовой выборке\n",
|
|||
|
"sns.histplot(test_data[\"Population2020\"], kde=True)\n",
|
|||
|
"plt.title(\"Распределение популяции в тестовой выборке\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Процесс конструирования признаков\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"### Унитарное кодирование категориальных признаков (one-hot encoding)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding\n",
|
|||
|
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
|
|||
|
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
|
|||
|
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Дискретизация числовых признаков "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>LandArea</th>\n",
|
|||
|
" <th>LandArea</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>no</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>216</th>\n",
|
|||
|
" <td>53</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>217</th>\n",
|
|||
|
" <td>160</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>218</th>\n",
|
|||
|
" <td>60</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>219</th>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>220</th>\n",
|
|||
|
" <td>150</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>221</th>\n",
|
|||
|
" <td>328</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>222</th>\n",
|
|||
|
" <td>460</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>223</th>\n",
|
|||
|
" <td>240</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>224</th>\n",
|
|||
|
" <td>90</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>225</th>\n",
|
|||
|
" <td>30</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>226</th>\n",
|
|||
|
" <td>140</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>227</th>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>228</th>\n",
|
|||
|
" <td>21</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>229</th>\n",
|
|||
|
" <td>390</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>230</th>\n",
|
|||
|
" <td>230</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>231</th>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>232</th>\n",
|
|||
|
" <td>12170</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>233</th>\n",
|
|||
|
" <td>260</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>234</th>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>(0.0, 5458956.667]</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>235</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" LandArea LandArea\n",
|
|||
|
"no \n",
|
|||
|
"216 53 (0.0, 5458956.667]\n",
|
|||
|
"217 160 (0.0, 5458956.667]\n",
|
|||
|
"218 60 (0.0, 5458956.667]\n",
|
|||
|
"219 10 (0.0, 5458956.667]\n",
|
|||
|
"220 150 (0.0, 5458956.667]\n",
|
|||
|
"221 328 (0.0, 5458956.667]\n",
|
|||
|
"222 460 (0.0, 5458956.667]\n",
|
|||
|
"223 240 (0.0, 5458956.667]\n",
|
|||
|
"224 90 (0.0, 5458956.667]\n",
|
|||
|
"225 30 (0.0, 5458956.667]\n",
|
|||
|
"226 140 (0.0, 5458956.667]\n",
|
|||
|
"227 20 (0.0, 5458956.667]\n",
|
|||
|
"228 21 (0.0, 5458956.667]\n",
|
|||
|
"229 390 (0.0, 5458956.667]\n",
|
|||
|
"230 230 (0.0, 5458956.667]\n",
|
|||
|
"231 100 (0.0, 5458956.667]\n",
|
|||
|
"232 12170 (0.0, 5458956.667]\n",
|
|||
|
"233 260 (0.0, 5458956.667]\n",
|
|||
|
"234 10 (0.0, 5458956.667]\n",
|
|||
|
"235 0 NaN"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"labels = [\"small country\", \"medium country\", \"big country\"]\n",
|
|||
|
"num_bins = 3\n",
|
|||
|
"\n",
|
|||
|
"hist1, bins1 = np.histogram(\n",
|
|||
|
" df[\"LandArea\"].fillna(df[\"LandArea\"].median()), bins=num_bins\n",
|
|||
|
")\n",
|
|||
|
"bins1, hist1\n",
|
|||
|
"\n",
|
|||
|
"pd.concat([df[\"LandArea\"], pd.cut(df[\"LandArea\"], list(bins1))], axis=1).tail(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>LandArea</th>\n",
|
|||
|
" <th>LandArea</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>no</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>9388211</td>\n",
|
|||
|
" <td>medium country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2973190</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>9147420</td>\n",
|
|||
|
" <td>medium country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1811570</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>770880</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>8358140</td>\n",
|
|||
|
" <td>medium country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>910770</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>130170</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>16376870</td>\n",
|
|||
|
" <td>big country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>1943950</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>364555</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>1000000</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>298170</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>995450</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>310070</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>2267050</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>769630</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>1628550</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>348560</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>20</th>\n",
|
|||
|
" <td>510890</td>\n",
|
|||
|
" <td>small country</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" LandArea LandArea\n",
|
|||
|
"no \n",
|
|||
|
"1 9388211 medium country\n",
|
|||
|
"2 2973190 small country\n",
|
|||
|
"3 9147420 medium country\n",
|
|||
|
"4 1811570 small country\n",
|
|||
|
"5 770880 small country\n",
|
|||
|
"6 8358140 medium country\n",
|
|||
|
"7 910770 small country\n",
|
|||
|
"8 130170 small country\n",
|
|||
|
"9 16376870 big country\n",
|
|||
|
"10 1943950 small country\n",
|
|||
|
"11 364555 small country\n",
|
|||
|
"12 1000000 small country\n",
|
|||
|
"13 298170 small country\n",
|
|||
|
"14 995450 small country\n",
|
|||
|
"15 310070 small country\n",
|
|||
|
"16 2267050 small country\n",
|
|||
|
"17 769630 small country\n",
|
|||
|
"18 1628550 small country\n",
|
|||
|
"19 348560 small country\n",
|
|||
|
"20 510890 small country"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [df[\"LandArea\"], pd.cut(df[\"LandArea\"], list(bins1), labels=labels)], axis=1\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Ручной синтез"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Пример синтеза признака площади страны в кв км на душу человека\n",
|
|||
|
"train_data_encoded[\"squareforman\"] = (\n",
|
|||
|
" train_data_encoded[\"LandArea\"] / train_data_encoded[\"Population2020\"]\n",
|
|||
|
")\n",
|
|||
|
"val_data_encoded[\"squareforman\"] = (\n",
|
|||
|
" val_data_encoded[\"LandArea\"] / val_data_encoded[\"Population2020\"]\n",
|
|||
|
")\n",
|
|||
|
"test_data_encoded[\"squareforman\"] = (\n",
|
|||
|
" test_data_encoded[\"LandArea\"] / test_data_encoded[\"Population2020\"]\n",
|
|||
|
")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
|
|||
|
"\n",
|
|||
|
"# Пример масштабирования числовых признаков\n",
|
|||
|
"numerical_features = [\"Population2020\", \"Yearly Change\"]\n",
|
|||
|
"\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])\n",
|
|||
|
"val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n",
|
|||
|
"test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Конструирование признаков с применением фреймворка Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='pop')\n",
|
|||
|
"\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='dop', dataframe=train_data_encoded, index='id')\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es, target_dataframe_name=\"dop\", max_depth=2\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Оценка качества каждого набора признаков\n",
|
|||
|
"Предсказательная способность\n",
|
|||
|
"Метрики: RMSE, MAE, R²\n",
|
|||
|
"\n",
|
|||
|
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
|||
|
"\n",
|
|||
|
"Скорость вычисления\n",
|
|||
|
"Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
|||
|
"\n",
|
|||
|
"Надежность\n",
|
|||
|
"Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
|||
|
"\n",
|
|||
|
"Корреляция\n",
|
|||
|
"Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
|||
|
"\n",
|
|||
|
"Цельность\n",
|
|||
|
"Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='pop')\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name=\"dop\", dataframe=train_data_encoded, index=\"id\"\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es, target_dataframe_name=\"dop\", max_depth=2\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"RMSE: 0.03387638150888035\n",
|
|||
|
"R²: 0.9814104214666138\n",
|
|||
|
"MAE: 0.015316153649943631\n",
|
|||
|
"Cross-validated RMSE: 0.7480546038440666\n",
|
|||
|
"Train RMSE: 0.24468562210527503\n",
|
|||
|
"Train R²: 0.9401289463349545\n",
|
|||
|
"Train MAE: 0.049477658845671284\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1kAAAIjCAYAAADxz9EgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACE7klEQVR4nOzdd3hU1drG4WcyqaRSkgAhEJoUQZAq1QYERZSiIEhXECV0pSi9N+koFhSPBwVBUFAUFEFFokgVpHcEEkJJ75n9/cHHHEdaApNMyu++rlwy794zeSY7SN6stdcyGYZhCAAAAABgF06ODgAAAAAA+QlNFgAAAADYEU0WAAAAANgRTRYAAAAA2BFNFgAAAADYEU0WAAAAANgRTRYAAAAA2BFNFgAAAADYEU0WAAAAANgRTRYAFEAmk0njxo1zdAyHe+SRR/TII49YH586dUomk0lLly51WKZ/+3fGvGrLli0ymUzasmWLXV+X72UAuRFNFgDco7ffflsmk0n169e/69c4f/68xo0bpz179tgvWC53/Yfu6x8uLi4qV66cunXrphMnTjg6XpZs27ZN48aNU3R0tMMyhISE2Hw9AwIC1KRJE61Zs8Zhmexl/fr1NFIA8hSaLAC4R8uWLVNISIi2b9+uY8eO3dVrnD9/XuPHjy9QTdZ1AwYM0CeffKL33ntPrVq10ooVK1S3bl2dP38+x7OUKVNGSUlJ6tq1a5aet23bNo0fP96hTZYk1axZU5988ok++eQTvfbaazp//rzatWunxYsXOzTXvVq/fr3Gjx9/02NJSUkaNWpUDicCgNujyQKAe3Dy5Elt27ZNs2fPlr+/v5YtW+boSHlOkyZN1KVLF/Xs2VMLFizQrFmzdOXKFX388ce3fE5CQkK2ZDGZTHJ3d5fZbM6W189uQUFB6tKli7p06aJhw4bp119/laenp+bMmePoaNnG3d1dzs7Ojo4BADZosgDgHixbtkyFCxdWq1at9Oyzz96yyYqOjtbgwYMVEhIiNzc3lSpVSt26ddOlS5e0ZcsW1a1bV5LUs2dP63Sv6/cFhYSEqEePHje85r/v1UlNTdWYMWNUu3Zt+fr6ytPTU02aNNHmzZuz/L4iIyPl7Ox809GDw4cPy2QyaeHChZKktLQ0jR8/XhUrVpS7u7uKFi2qxo0b6/vvv8/y55Wkxx57TNK1BlaSxo0bJ5PJpAMHDqhz584qXLiwGjdubD3/v//9r2rXri0PDw8VKVJEzz//vM6ePXvD67733nsqX768PDw8VK9ePf3yyy83nHOre7IOHTqkDh06yN/fXx4eHqpUqZLefPNNa77XX39dklS2bFnr9Tt16lS2ZMyK4sWLq0qVKtavpSTt3r1bTzzxhHx8fOTl5aXHH39cv/32m83zli5dKpPJpJ9//lkvv/yyihYtKh8fH3Xr1k1Xr161OfdW90Td6vv2n3755Rc999xzKl26tNzc3BQcHKzBgwcrKSnJek6PHj20aNEi6+e6/nG7z5+V9/jrr79qyJAh8vf3l6enp9q2bauoqKjb5gaAO+FXPwBwD5YtW6Z27drJ1dVVnTp10jvvvKM//vjD2jRJUnx8vJo0aaKDBw+qV69eqlWrli5duqS1a9fq77//VpUqVTRhwgSNGTNGffr0UZMmTSRJDRs2zFKW2NhYffDBB+rUqZN69+6tuLg4LVmyRKGhodq+fbtq1qyZ6dcKDAzUww8/rM8//1xjx461ObZixQqZzWY999xzkq41GVOnTtVLL72kevXqKTY2Vjt27NCuXbvUvHnzLL0HSTp+/LgkqWjRojb15557ThUrVtSUKVNkGIYkafLkyRo9erQ6dOigl156SVFRUVqwYIGaNm2q3bt3y8/PT5K0ZMkSvfzyy2rYsKEGDRqkEydO6Omnn1aRIkUUHBx82zx//vmnmjRpIhcXF/Xp00chISE6fvy41q1bp8mTJ6tdu3Y6cuSIPvvsM82ZM0fFihWTJPn7++dYxltJS0vT2bNnrV/Lv/76S02aNJGPj4+GDRsmFxcXvfvuu3rkkUf0008/3XBfYVhYmPz8/DRu3DgdPnxY77zzjk6fPm29n+5erVy5UomJiXrllVdUtGhRbd++XQsWLNDff/+tlStXSpJefvllnT9/Xt9//70++eSTO75mVt9j//79VbhwYY0dO1anTp3S3LlzFRYWphUrVtzz+wNQgBkAgLuyY8cOQ5Lx/fffG4ZhGBaLxShVqpQxcOBAm/PGjBljSDJWr159w2tYLBbDMAzjjz/+MCQZH3300Q3nlClTxujevfsN9Ycffth4+OGHrY/T09ONlJQUm3OuXr1qBAYGGr169bKpSzLGjh172/f37rvvGpKMffv22dSrVq1qPPbYY9bHNWrUMFq1anXb17qZzZs3G5KMDz/80IiKijLOnz9vfPPNN0ZISIhhMpmMP/74wzAMwxg7dqwhyejUqZPN80+dOmWYzWZj8uTJNvV9+/YZzs7O1npqaqoREBBg1KxZ0+br89577xmSbL6GJ0+evOE6NG3a1PD29jZOnz5t83muXzvDMIyZM2cakoyTJ09me8ZbKVOmjNGiRQsjKirKiIqKMvbu3Ws8//zzhiSjf//+hmEYRps2bQxXV1fj+PHj1uedP3/e8Pb2Npo2bWqtffTRR4Yko3bt2kZqaqq1PmPGDEOS8dVXX1lrt/pe+vf37fXrvXnzZmstMTHxhudNnTrVMJlMNl/vfv36Gbf6keXfnz+r77FZs2Y213Lw4MGG2Ww2oqOjb/r5ACAzmC4IAHdp2bJlCgwM1KOPPirp2rSljh07avny5crIyLCe98UXX6hGjRpq27btDa9hj9GA68xms1xdXSVJFotFV65cUXp6uurUqaNdu3Zl+fXatWsnZ2dnm9/o79+/XwcOHFDHjh2tNT8/P/311186evToXeXu1auX/P39VbJkSbVq1UoJCQn6+OOPVadOHZvz+vbta/N49erVslgs6tChgy5dumT9KF68uCpWrGidJrljxw5dvHhRffv2tX59pGvT0Hx9fW+bLSoqSj///LN69eql0qVL2xzLzLXLiYz/tHHjRvn7+8vf3181atTQypUr1bVrV02fPl0ZGRnauHGj2rRpo3LlylmfU6JECXXu3Flbt25VbGyszev16dNHLi4u1sevvPKKnJ2dtX79+kxnuh0PDw/rnxMSEnTp0iU1bNhQhmFo9+7dWX69u32P/7yWTZo0UUZGhk6fPn0X7wgArmG6IADchYyMDC1fvlyPPvqozf0u9evX11tvvaVNmzapRYsWkq5Nf2vfvn2O5Pr444/11ltv6dChQ0pLS7PWy5Ytm+XXKlasmB5//HF9/vnnmjhxoqRrUwWdnZ3Vrl0763kTJkzQM888o/vuu0/VqlVTy5Yt1bVrVz3wwAOZ+jxjxoxRkyZNZDabVaxYMVWpUuWmCxn8+z0cPXpUhmGoYsWKN33d683B9R+W/33e9SXjb+f6UvLVqlXL1Hv5t5zI+E/169fXpEmTZDKZVKhQIVWpUsU6HTEiIkKJiYmqVKnSDc+rUqWKLBaLzp49q/vvv99a/3ceLy8vlShRwuZ+s3tx5swZjRkzRmvXrr3hXq+YmJgsv15UVFSW3+O/m+fChQtL0g15ACAraLIA4C78+OOPunDhgpYvX67ly5ffcHzZsmXWJute3WrEJCMjw2YVvP/+97/q0aOH2rRpo9dff10BAQEym82aOnWq9T6nrHr++efVs2dP7dmzRzVr1tTnn3+uxx9/3HrfkSQ1bdpUx48f11dffaWNGzfqgw8+0Jw5c7R48WK99NJLd/wc1atXV7Nmze543j9HPaRro3Umk0nffvvtTVcD9PLyysQ7zF45nbFYsWKZ+lrmhH+O5t7qePPmzXXlyhUNHz5clStXlqenp86dO6cePXrIYrHkSM5brSRp/P99fwBwN2iyAOAuLFu2TAEBAdZVz/5p9erVWrNmjRYvXiwPDw+VL19e+/fvv+3r3W7qWeHChW+6/9Lp06dtRjlWrVqlcuXKafXq1Tav9++FK7KiTZs2evnll61TBo8cOaKRI0fecF6
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|||
|
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строк с NaN\n",
|
|||
|
"feature_matrix = feature_matrix.dropna()\n",
|
|||
|
"val_feature_matrix = val_feature_matrix.dropna()\n",
|
|||
|
"test_feature_matrix = test_feature_matrix.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train = feature_matrix.drop(\"Population2020\", axis=1)\n",
|
|||
|
"y_train = feature_matrix[\"Population2020\"]\n",
|
|||
|
"X_val = val_feature_matrix.drop(\"Population2020\", axis=1)\n",
|
|||
|
"y_val = val_feature_matrix[\"Population2020\"]\n",
|
|||
|
"X_test = test_feature_matrix.drop(\"Population2020\", axis=1)\n",
|
|||
|
"y_test = test_feature_matrix[\"Population2020\"]\n",
|
|||
|
"\n",
|
|||
|
"# Выбор модели\n",
|
|||
|
"model = RandomForestRegressor(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Предсказание и оценка\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
|
|||
|
"r2 = r2_score(y_test, y_pred)\n",
|
|||
|
"mae = mean_absolute_error(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"RMSE: {rmse}\")\n",
|
|||
|
"print(f\"R²: {r2}\")\n",
|
|||
|
"print(f\"MAE: {mae}\")\n",
|
|||
|
"\n",
|
|||
|
"# Кросс-валидация\n",
|
|||
|
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
|
|||
|
"rmse_cv = (-scores.mean())**0.5\n",
|
|||
|
"print(f\"Cross-validated RMSE: {rmse_cv}\")\n",
|
|||
|
"\n",
|
|||
|
"# Анализ важности признаков\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train.columns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на переобучение\n",
|
|||
|
"y_train_pred = model.predict(X_train)\n",
|
|||
|
"\n",
|
|||
|
"rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n",
|
|||
|
"r2_train = r2_score(y_train, y_train_pred)\n",
|
|||
|
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train RMSE: {rmse_train}\")\n",
|
|||
|
"print(f\"Train R²: {r2_train}\")\n",
|
|||
|
"print(f\"Train MAE: {mae_train}\")\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
|
|||
|
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
|
|||
|
"plt.xlabel(\"Actual Population\")\n",
|
|||
|
"plt.ylabel(\"Predicted Population\")\n",
|
|||
|
"plt.title(\"Actual vs Predicted Population\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Точность предсказаний: Модель показывает довольно высокий R² (0.98), что указывает на хорошее объяснение вариации. Значения RMSE и MAE довольно низки, что говорит о том, что модель достаточно точно предсказывает цены.\n",
|
|||
|
"\n",
|
|||
|
"Переобучение: Разница между RMSE на обучающей и тестовой выборках не очень большая, что указывает на то, что переобучение не является критическим. Однако, стоит быть осторожным и продолжать мониторинг этого показателя.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aisenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|