diff --git a/lec3.ipynb b/lec3.ipynb index 40f4903..9c7cf09 100644 --- a/lec3.ipynb +++ b/lec3.ipynb @@ -18,9 +18,268 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Population 2020Yearly ChangeNet ChangeDensity(P/Km²)Land Area (Km²)Migrants (net)Fert. RateMedAgeUrban Pop %World Share
no
1China14393237760.3955400901539388211-348,3991.73861%18.47%
2India13800043850.99135866314642973190-532,6872.22835%17.70%
3United States3310026510.591937734369147420954,8061.83883%4.25%
4Indonesia2735236151.0728980471511811570-98,9552.33056%3.51%
5Pakistan2208923402.004327022287770880-233,3793.62335%2.83%
....................................
231Montserrat49920.06350100NaNN.A.N.A.10%0.00%
232Falkland Islands34803.05103012170NaNN.A.N.A.66%0.00%
233Niue16260.68116260NaNN.A.N.A.46%0.00%
234Tokelau13571.271713610NaNN.A.N.A.0%0.00%
235Holy See8010.2522,0030NaNN.A.N.A.N.A.0.00%
\n", + "

235 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Population 2020 Yearly Change Net Change \\\n", + "no \n", + "1 China 1439323776 0.39 5540090 \n", + "2 India 1380004385 0.99 13586631 \n", + "3 United States 331002651 0.59 1937734 \n", + "4 Indonesia 273523615 1.07 2898047 \n", + "5 Pakistan 220892340 2.00 4327022 \n", + ".. ... ... ... ... \n", + "231 Montserrat 4992 0.06 3 \n", + "232 Falkland Islands 3480 3.05 103 \n", + "233 Niue 1626 0.68 11 \n", + "234 Tokelau 1357 1.27 17 \n", + "235 Holy See 801 0.25 2 \n", + "\n", + " Density(P/Km²) Land Area (Km²) Migrants (net) Fert. Rate MedAge \\\n", + "no \n", + "1 153 9388211 -348,399 1.7 38 \n", + "2 464 2973190 -532,687 2.2 28 \n", + "3 36 9147420 954,806 1.8 38 \n", + "4 151 1811570 -98,955 2.3 30 \n", + "5 287 770880 -233,379 3.6 23 \n", + ".. ... ... ... ... ... \n", + "231 50 100 NaN N.A. N.A. \n", + "232 0 12170 NaN N.A. N.A. \n", + "233 6 260 NaN N.A. N.A. \n", + "234 136 10 NaN N.A. N.A. \n", + "235 2,003 0 NaN N.A. N.A. \n", + "\n", + " Urban Pop % World Share \n", + "no \n", + "1 61% 18.47% \n", + "2 35% 17.70% \n", + "3 83% 4.25% \n", + "4 56% 3.51% \n", + "5 35% 2.83% \n", + ".. ... ... \n", + "231 10% 0.00% \n", + "232 66% 0.00% \n", + "233 46% 0.00% \n", + "234 0% 0.00% \n", + "235 N.A. 
0.00% \n", + "\n", + "[235 rows x 11 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -59,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -86,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -121,9 +380,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0. , 5458956.66666667, 10917913.33333333,\n", + " 16376870. ]),\n", + " array([229, 5, 1]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "hist1, bins1 = np.histogram(\n", " countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()), bins=num_bins\n", @@ -133,9 +405,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211(5458956.667, 10917913.333]
2India2973190(0.0, 5458956.667]
3United States9147420(5458956.667, 10917913.333]
4Indonesia1811570(0.0, 5458956.667]
5Pakistan770880(0.0, 5458956.667]
6Brazil8358140(5458956.667, 10917913.333]
7Nigeria910770(0.0, 5458956.667]
8Bangladesh130170(0.0, 5458956.667]
9Russia16376870(10917913.333, 16376870.0]
10Mexico1943950(0.0, 5458956.667]
11Japan364555(0.0, 5458956.667]
12Ethiopia1000000(0.0, 5458956.667]
13Philippines298170(0.0, 5458956.667]
14Egypt995450(0.0, 5458956.667]
15Vietnam310070(0.0, 5458956.667]
16DR Congo2267050(0.0, 5458956.667]
17Turkey769630(0.0, 5458956.667]
18Iran1628550(0.0, 5458956.667]
19Germany348560(0.0, 5458956.667]
20Thailand510890(0.0, 5458956.667]
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 (5458956.667, 10917913.333]\n", + "2 India 2973190 (0.0, 5458956.667]\n", + "3 United States 9147420 (5458956.667, 10917913.333]\n", + "4 Indonesia 1811570 (0.0, 5458956.667]\n", + "5 Pakistan 770880 (0.0, 5458956.667]\n", + "6 Brazil 8358140 (5458956.667, 10917913.333]\n", + "7 Nigeria 910770 (0.0, 5458956.667]\n", + "8 Bangladesh 130170 (0.0, 5458956.667]\n", + "9 Russia 16376870 (10917913.333, 16376870.0]\n", + "10 Mexico 1943950 (0.0, 5458956.667]\n", + "11 Japan 364555 (0.0, 5458956.667]\n", + "12 Ethiopia 1000000 (0.0, 5458956.667]\n", + "13 Philippines 298170 (0.0, 5458956.667]\n", + "14 Egypt 995450 (0.0, 5458956.667]\n", + "15 Vietnam 310070 (0.0, 5458956.667]\n", + "16 DR Congo 2267050 (0.0, 5458956.667]\n", + "17 Turkey 769630 (0.0, 5458956.667]\n", + "18 Iran 1628550 (0.0, 5458956.667]\n", + "19 Germany 348560 (0.0, 5458956.667]\n", + "20 Thailand 510890 (0.0, 5458956.667]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -149,9 +608,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Middle
2India2973190Small
3United States9147420Middle
4Indonesia1811570Small
5Pakistan770880Small
6Brazil8358140Middle
7Nigeria910770Small
8Bangladesh130170Small
9Russia16376870Big
10Mexico1943950Small
11Japan364555Small
12Ethiopia1000000Small
13Philippines298170Small
14Egypt995450Small
15Vietnam310070Small
16DR Congo2267050Small
17Turkey769630Small
18Iran1628550Small
19Germany348560Small
20Thailand510890Small
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Middle\n", + "2 India 2973190 Small\n", + "3 United States 9147420 Middle\n", + "4 Indonesia 1811570 Small\n", + "5 Pakistan 770880 Small\n", + "6 Brazil 8358140 Middle\n", + "7 Nigeria 910770 Small\n", + "8 Bangladesh 130170 Small\n", + "9 Russia 16376870 Big\n", + "10 Mexico 1943950 Small\n", + "11 Japan 364555 Small\n", + "12 Ethiopia 1000000 Small\n", + "13 Philippines 298170 Small\n", + "14 Egypt 995450 Small\n", + "15 Vietnam 310070 Small\n", + "16 DR Congo 2267050 Small\n", + "17 Turkey 769630 Small\n", + "18 Iran 1628550 Small\n", + "19 Germany 348560 Small\n", + "20 Thailand 510890 Small" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -167,14 +813,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)" + "Равномерное разделение данных на 4 группы c установкой собственной границы диапазона значений (от 0 до 12000000) просто ставим нименьшее и наибольшее и ставим колво групп" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0., 4000000., 8000000., 12000000.]),\n", + " array([229, 1, 4, 1]))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "labels = [\"Small\", \"Middle\", \"Big\"]\n", "bins2 = np.linspace(0, 12000000, 4)\n", @@ -190,9 +848,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211(8000000.0, 12000000.0]
2India2973190(0.0, 4000000.0]
3United States9147420(8000000.0, 12000000.0]
4Indonesia1811570(0.0, 4000000.0]
5Pakistan770880(0.0, 4000000.0]
6Brazil8358140(8000000.0, 12000000.0]
7Nigeria910770(0.0, 4000000.0]
8Bangladesh130170(0.0, 4000000.0]
9Russia16376870NaN
10Mexico1943950(0.0, 4000000.0]
11Japan364555(0.0, 4000000.0]
12Ethiopia1000000(0.0, 4000000.0]
13Philippines298170(0.0, 4000000.0]
14Egypt995450(0.0, 4000000.0]
15Vietnam310070(0.0, 4000000.0]
16DR Congo2267050(0.0, 4000000.0]
17Turkey769630(0.0, 4000000.0]
18Iran1628550(0.0, 4000000.0]
19Germany348560(0.0, 4000000.0]
20Thailand510890(0.0, 4000000.0]
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 (8000000.0, 12000000.0]\n", + "2 India 2973190 (0.0, 4000000.0]\n", + "3 United States 9147420 (8000000.0, 12000000.0]\n", + "4 Indonesia 1811570 (0.0, 4000000.0]\n", + "5 Pakistan 770880 (0.0, 4000000.0]\n", + "6 Brazil 8358140 (8000000.0, 12000000.0]\n", + "7 Nigeria 910770 (0.0, 4000000.0]\n", + "8 Bangladesh 130170 (0.0, 4000000.0]\n", + "9 Russia 16376870 NaN\n", + "10 Mexico 1943950 (0.0, 4000000.0]\n", + "11 Japan 364555 (0.0, 4000000.0]\n", + "12 Ethiopia 1000000 (0.0, 4000000.0]\n", + "13 Philippines 298170 (0.0, 4000000.0]\n", + "14 Egypt 995450 (0.0, 4000000.0]\n", + "15 Vietnam 310070 (0.0, 4000000.0]\n", + "16 DR Congo 2267050 (0.0, 4000000.0]\n", + "17 Turkey 769630 (0.0, 4000000.0]\n", + "18 Iran 1628550 (0.0, 4000000.0]\n", + "19 Germany 348560 (0.0, 4000000.0]\n", + "20 Thailand 510890 (0.0, 4000000.0]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -206,9 +1051,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Big
2India2973190Small
3United States9147420Big
4Indonesia1811570Small
5Pakistan770880Small
6Brazil8358140Big
7Nigeria910770Small
8Bangladesh130170Small
9Russia16376870NaN
10Mexico1943950Small
11Japan364555Small
12Ethiopia1000000Small
13Philippines298170Small
14Egypt995450Small
15Vietnam310070Small
16DR Congo2267050Small
17Turkey769630Small
18Iran1628550Small
19Germany348560Small
20Thailand510890Small
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Big\n", + "2 India 2973190 Small\n", + "3 United States 9147420 Big\n", + "4 Indonesia 1811570 Small\n", + "5 Pakistan 770880 Small\n", + "6 Brazil 8358140 Big\n", + "7 Nigeria 910770 Small\n", + "8 Bangladesh 130170 Small\n", + "9 Russia 16376870 NaN\n", + "10 Mexico 1943950 Small\n", + "11 Japan 364555 Small\n", + "12 Ethiopia 1000000 Small\n", + "13 Philippines 298170 Small\n", + "14 Egypt 995450 Small\n", + "15 Vietnam 310070 Small\n", + "16 DR Congo 2267050 Small\n", + "17 Turkey 769630 Small\n", + "18 Iran 1628550 Small\n", + "19 Germany 348560 Small\n", + "20 Thailand 510890 Small" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -224,14 +1256,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Равномерное разделение данных на 3 группы c установкой собственных интервалов (0 - 39, 40 - 60, 61 - 100)" + "Равномерное разделение данных на 5 групп c установкой собственных интервалов (0 - 1000, 1000 - 100000, 100000 - 500000, 500000 - 3000000, 3000000 И БОЛЕЕ)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n", + " array([52, 77, 56, 44, 6]))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n", "hist3, bins3 = np.histogram(\n", @@ -245,9 +1289,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211(3000000.0, inf]
2India2973190(500000.0, 3000000.0]
3United States9147420(3000000.0, inf]
4Indonesia1811570(500000.0, 3000000.0]
5Pakistan770880(500000.0, 3000000.0]
6Brazil8358140(3000000.0, inf]
7Nigeria910770(500000.0, 3000000.0]
8Bangladesh130170(100000.0, 500000.0]
9Russia16376870(3000000.0, inf]
10Mexico1943950(500000.0, 3000000.0]
11Japan364555(100000.0, 500000.0]
12Ethiopia1000000(500000.0, 3000000.0]
13Philippines298170(100000.0, 500000.0]
14Egypt995450(500000.0, 3000000.0]
15Vietnam310070(100000.0, 500000.0]
16DR Congo2267050(500000.0, 3000000.0]
17Turkey769630(500000.0, 3000000.0]
18Iran1628550(500000.0, 3000000.0]
19Germany348560(100000.0, 500000.0]
20Thailand510890(500000.0, 3000000.0]
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 (3000000.0, inf]\n", + "2 India 2973190 (500000.0, 3000000.0]\n", + "3 United States 9147420 (3000000.0, inf]\n", + "4 Indonesia 1811570 (500000.0, 3000000.0]\n", + "5 Pakistan 770880 (500000.0, 3000000.0]\n", + "6 Brazil 8358140 (3000000.0, inf]\n", + "7 Nigeria 910770 (500000.0, 3000000.0]\n", + "8 Bangladesh 130170 (100000.0, 500000.0]\n", + "9 Russia 16376870 (3000000.0, inf]\n", + "10 Mexico 1943950 (500000.0, 3000000.0]\n", + "11 Japan 364555 (100000.0, 500000.0]\n", + "12 Ethiopia 1000000 (500000.0, 3000000.0]\n", + "13 Philippines 298170 (100000.0, 500000.0]\n", + "14 Egypt 995450 (500000.0, 3000000.0]\n", + "15 Vietnam 310070 (100000.0, 500000.0]\n", + "16 DR Congo 2267050 (500000.0, 3000000.0]\n", + "17 Turkey 769630 (500000.0, 3000000.0]\n", + "18 Iran 1628550 (500000.0, 3000000.0]\n", + "19 Germany 348560 (100000.0, 500000.0]\n", + "20 Thailand 510890 (500000.0, 3000000.0]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -261,9 +1492,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Giant
2India2973190Big
3United States9147420Giant
4Indonesia1811570Big
5Pakistan770880Big
6Brazil8358140Giant
7Nigeria910770Big
8Bangladesh130170Middle
9Russia16376870Giant
10Mexico1943950Big
11Japan364555Middle
12Ethiopia1000000Big
13Philippines298170Middle
14Egypt995450Big
15Vietnam310070Middle
16DR Congo2267050Big
17Turkey769630Big
18Iran1628550Big
19Germany348560Middle
20Thailand510890Big
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Giant\n", + "2 India 2973190 Big\n", + "3 United States 9147420 Giant\n", + "4 Indonesia 1811570 Big\n", + "5 Pakistan 770880 Big\n", + "6 Brazil 8358140 Giant\n", + "7 Nigeria 910770 Big\n", + "8 Bangladesh 130170 Middle\n", + "9 Russia 16376870 Giant\n", + "10 Mexico 1943950 Big\n", + "11 Japan 364555 Middle\n", + "12 Ethiopia 1000000 Big\n", + "13 Philippines 298170 Middle\n", + "14 Egypt 995450 Big\n", + "15 Vietnam 310070 Middle\n", + "16 DR Congo 2267050 Big\n", + "17 Turkey 769630 Big\n", + "18 Iran 1628550 Big\n", + "19 Germany 348560 Middle\n", + "20 Thailand 510890 Big" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -284,9 +1702,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China93882114
2India29731904
3United States91474204
4Indonesia18115704
5Pakistan7708804
6Brazil83581404
7Nigeria9107704
8Bangladesh1301702
9Russia163768704
10Mexico19439504
11Japan3645553
12Ethiopia10000004
13Philippines2981703
14Egypt9954504
15Vietnam3100703
16DR Congo22670504
17Turkey7696304
18Iran16285504
19Germany3485603
20Thailand5108903
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 4\n", + "2 India 2973190 4\n", + "3 United States 9147420 4\n", + "4 Indonesia 1811570 4\n", + "5 Pakistan 770880 4\n", + "6 Brazil 8358140 4\n", + "7 Nigeria 910770 4\n", + "8 Bangladesh 130170 2\n", + "9 Russia 16376870 4\n", + "10 Mexico 1943950 4\n", + "11 Japan 364555 3\n", + "12 Ethiopia 1000000 4\n", + "13 Philippines 298170 3\n", + "14 Egypt 995450 4\n", + "15 Vietnam 310070 3\n", + "16 DR Congo 2267050 4\n", + "17 Turkey 769630 4\n", + "18 Iran 1628550 4\n", + "19 Germany 348560 3\n", + "20 Thailand 510890 3" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -300,9 +1905,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Giant
2India2973190Giant
3United States9147420Giant
4Indonesia1811570Giant
5Pakistan770880Giant
6Brazil8358140Giant
7Nigeria910770Giant
8Bangladesh130170Middle
9Russia16376870Giant
10Mexico1943950Giant
11Japan364555Big
12Ethiopia1000000Giant
13Philippines298170Big
14Egypt995450Giant
15Vietnam310070Big
16DR Congo2267050Giant
17Turkey769630Giant
18Iran1628550Giant
19Germany348560Big
20Thailand510890Big
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Giant\n", + "2 India 2973190 Giant\n", + "3 United States 9147420 Giant\n", + "4 Indonesia 1811570 Giant\n", + "5 Pakistan 770880 Giant\n", + "6 Brazil 8358140 Giant\n", + "7 Nigeria 910770 Giant\n", + "8 Bangladesh 130170 Middle\n", + "9 Russia 16376870 Giant\n", + "10 Mexico 1943950 Giant\n", + "11 Japan 364555 Big\n", + "12 Ethiopia 1000000 Giant\n", + "13 Philippines 298170 Big\n", + "14 Egypt 995450 Giant\n", + "15 Vietnam 310070 Big\n", + "16 DR Congo 2267050 Giant\n", + "17 Turkey 769630 Giant\n", + "18 Iran 1628550 Giant\n", + "19 Germany 348560 Big\n", + "20 Thailand 510890 Big" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -329,7 +2121,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -364,16 +2156,12 @@ "source": [ "#### Загрузка данных\n", "\n", - "За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n", - "\n", - "Используется только 100 первых заказов и связанные с ними объекты\n", - "\n", - "https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis" + "приведение даннык к нормальному виду\n" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -430,7 +2218,7 @@ " [234 rows x 3 columns])" ] }, - "execution_count": 32, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -481,7 +2269,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -506,7 +2294,7 @@ " No relationships" ] }, - "execution_count": 34, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -558,7 +2346,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 20, "metadata": {}, 
"outputs": [ { @@ -573,7 +2361,7 @@ " countries.Country (or dependency) -> capitals.Country/Territory" ] }, - "execution_count": 35, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -599,7 +2387,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -790,7 +2578,7 @@ "[235 rows x 7 columns]" ] }, - "execution_count": 36, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -816,7 +2604,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -831,7 +2619,7 @@ " ]" ] }, - "execution_count": 37, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -856,7 +2644,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -865,7 +2653,7 @@ "" ] }, - "execution_count": 38, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, @@ -893,7 +2681,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1141,16 +2929,16 @@ "29 Colombia 50882891 50000000" ] }, - "execution_count": 40, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries_norm = countries.copy()\n", - "\n", + "#заменяем все значения больше 50000000 на 50000000\n", "countries_norm[\"Population Clip\"] = countries_norm[\"Population 2020\"].clip(0, 50000000);\n", - "\n", + "#проверка результата\n", "countries_norm[countries_norm[\"Population 2020\"] > 50000000][\n", " [\"Country (or dependency)\", \"Population 2020\", \"Population Clip\"]\n", "]" @@ -1160,12 +2948,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Винсоризация признака Возраст" + "Винсоризация \n", + " * `winsorize()`: Функция для обработки выбросов с помощью Winsorization. 
\n", + " * `countries_norm[\"Population 2020\"].fillna(countries_norm[\"Population 2020\"].mean())`: Заменяет пропущенные значения в столбце \"Population 2020\" средним значением этого столбца.\n", + " * `(0, 0.05)`: Указывает, что нужно обработать как нижние, так и верхние выбросы. 0.05 означает, что 5% самых маленьких и 5% самых больших значений в столбце \"Population 2020\" будут заменены на значения 5-го и 95-го процентилей соответственно. \n", + " * `inplace=False`: Указывает, что `winsorize` не должен модифицировать исходный датафрейм `countries_norm` напрямую, а создать новый столбец с обработанными данными." ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1420,7 +3212,7 @@ "29 Colombia 50882891 50882891" ] }, - "execution_count": 41, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1450,7 +3242,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1628,7 +3420,7 @@ "[235 rows x 6 columns]" ] }, - "execution_count": 43, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1677,7 +3469,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1842,7 +3634,7 @@ "[235 rows x 5 columns]" ] }, - "execution_count": 44, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } diff --git a/lec4.ipynb b/lec4.ipynb new file mode 100644 index 0000000..1eeba17 --- /dev/null +++ b/lec4.ipynb @@ -0,0 +1,2524 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Загрузка набора данных" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyManufacturerModelProd_yearCategoryLeather_interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
ID
45654403133281399LEXUSRX 4502010JeepYesHybrid3.5186005 km6.0Automatic4x404-MayLeft wheelSilver12
44731507166211018CHEVROLETEquinox2011JeepNoPetrol3192000 km6.0Tiptronic4x404-MayLeft wheelBlack8
457744198467-HONDAFIT2006HatchbackNoPetrol1.3200000 km4.0VariatorFront04-MayRight-hand driveBlack2
457691853607862FORDEscape2011JeepYesHybrid2.5168966 km4.0Automatic4x404-MayLeft wheelWhite0
4580926311726446HONDAFIT2014HatchbackYesPetrol1.391901 km4.0AutomaticFront04-MayLeft wheelSilver4
......................................................
457983558467-MERCEDES-BENZCLK 2001999CoupeYesCNG2.0 Turbo300000 km4.0ManualRear02-MarLeft wheelSilver5
4577885615681831HYUNDAISonata2011SedanYesPetrol2.4161600 km4.0TiptronicFront04-MayLeft wheelRed8
4580499726108836HYUNDAITucson2010JeepYesDiesel2116365 km4.0AutomaticFront04-MayLeft wheelGrey4
4579352653311288CHEVROLETCaptiva2007JeepYesDiesel251258 km4.0AutomaticFront04-MayLeft wheelBlack4
45813273470753HYUNDAISonata2012SedanYesHybrid2.4186923 km4.0AutomaticFront04-MayLeft wheelWhite12
\n", + "

19237 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Manufacturer Model Prod_year Category \\\n", + "ID \n", + "45654403 13328 1399 LEXUS RX 450 2010 Jeep \n", + "44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n", + "45774419 8467 - HONDA FIT 2006 Hatchback \n", + "45769185 3607 862 FORD Escape 2011 Jeep \n", + "45809263 11726 446 HONDA FIT 2014 Hatchback \n", + "... ... ... ... ... ... ... \n", + "45798355 8467 - MERCEDES-BENZ CLK 200 1999 Coupe \n", + "45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n", + "45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n", + "45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n", + "45813273 470 753 HYUNDAI Sonata 2012 Sedan \n", + "\n", + " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n", + "ID \n", + "45654403 Yes Hybrid 3.5 186005 km 6.0 \n", + "44731507 No Petrol 3 192000 km 6.0 \n", + "45774419 No Petrol 1.3 200000 km 4.0 \n", + "45769185 Yes Hybrid 2.5 168966 km 4.0 \n", + "45809263 Yes Petrol 1.3 91901 km 4.0 \n", + "... ... ... ... ... ... \n", + "45798355 Yes CNG 2.0 Turbo 300000 km 4.0 \n", + "45778856 Yes Petrol 2.4 161600 km 4.0 \n", + "45804997 Yes Diesel 2 116365 km 4.0 \n", + "45793526 Yes Diesel 2 51258 km 4.0 \n", + "45813273 Yes Hybrid 2.4 186923 km 4.0 \n", + "\n", + " Gear box type Drive wheels Doors Wheel Color Airbags \n", + "ID \n", + "45654403 Automatic 4x4 04-May Left wheel Silver 12 \n", + "44731507 Tiptronic 4x4 04-May Left wheel Black 8 \n", + "45774419 Variator Front 04-May Right-hand drive Black 2 \n", + "45769185 Automatic 4x4 04-May Left wheel White 0 \n", + "45809263 Automatic Front 04-May Left wheel Silver 4 \n", + "... ... ... ... ... ... ... 
\n", + "45798355 Manual Rear 02-Mar Left wheel Silver 5 \n", + "45778856 Tiptronic Front 04-May Left wheel Red 8 \n", + "45804997 Automatic Front 04-May Left wheel Grey 4 \n", + "45793526 Automatic Front 04-May Left wheel Black 4 \n", + "45813273 Automatic Front 04-May Left wheel White 12 \n", + "\n", + "[19237 rows x 17 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "from sklearn import set_config\n", + "\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "random_state=9\n", + "\n", + "df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации\n", + "\n", + "Целевой признак -- gear box type - коробка переключения передач. x - полная выборка, y - gear box столбец\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyManufacturerModelProd_yearCategoryLeather_interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
ID
457581531333289FORDEscape2008JeepYesHybrid0.4349288 km4.0AutomaticFront04-MayLeft wheelBlue0
4569993017249-FORDEscape Hybrid2008JeepNoHybrid2.3147000 km4.0Variator4x404-MayLeft wheelWhite8
4564656213331053LEXUSES 3502014SedanYesPetrol3.5179358 km6.0AutomaticFront04-MayLeft wheelRed12
4565692398791018MERCEDES-BENZML 3502011JeepYesDiesel3275862 km6.0Automatic4x404-MayLeft wheelSilver12
45815887109761275HYUNDAISonata2019SedanYesPetrol2.429419 km4.0AutomaticFront04-MayLeft wheelBlue12
......................................................
45802363218051024HYUNDAIH12010MinivanYesDiesel2.558958 km4.0AutomaticFront04-MayLeft wheelBlack4
458127772201327TOYOTACamry2018SedanYesPetrol2.547688 km4.0AutomaticFront04-MayLeft wheelBlue12
4410441715210-TOYOTAAqua2014HatchbackNoHybrid1.5139000 km4.0VariatorFront04-MayRight-hand driveWhite2
457934063136-OPELCorsa1995HatchbackNoPetrol1.4100000 km4.0ManualFront02-MarLeft wheelGrey2
4570070018817-TOYOTACamry2007SedanYesHybrid2.4151000 km4.0VariatorFront04-MayLeft wheelBlack10
\n", + "

15389 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Manufacturer Model Prod_year Category \\\n", + "ID \n", + "45758153 1333 289 FORD Escape 2008 Jeep \n", + "45699930 17249 - FORD Escape Hybrid 2008 Jeep \n", + "45646562 1333 1053 LEXUS ES 350 2014 Sedan \n", + "45656923 9879 1018 MERCEDES-BENZ ML 350 2011 Jeep \n", + "45815887 10976 1275 HYUNDAI Sonata 2019 Sedan \n", + "... ... ... ... ... ... ... \n", + "45802363 21805 1024 HYUNDAI H1 2010 Minivan \n", + "45812777 220 1327 TOYOTA Camry 2018 Sedan \n", + "44104417 15210 - TOYOTA Aqua 2014 Hatchback \n", + "45793406 3136 - OPEL Corsa 1995 Hatchback \n", + "45700700 18817 - TOYOTA Camry 2007 Sedan \n", + "\n", + " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n", + "ID \n", + "45758153 Yes Hybrid 0.4 349288 km 4.0 \n", + "45699930 No Hybrid 2.3 147000 km 4.0 \n", + "45646562 Yes Petrol 3.5 179358 km 6.0 \n", + "45656923 Yes Diesel 3 275862 km 6.0 \n", + "45815887 Yes Petrol 2.4 29419 km 4.0 \n", + "... ... ... ... ... ... \n", + "45802363 Yes Diesel 2.5 58958 km 4.0 \n", + "45812777 Yes Petrol 2.5 47688 km 4.0 \n", + "44104417 No Hybrid 1.5 139000 km 4.0 \n", + "45793406 No Petrol 1.4 100000 km 4.0 \n", + "45700700 Yes Hybrid 2.4 151000 km 4.0 \n", + "\n", + " Gear box type Drive wheels Doors Wheel Color Airbags \n", + "ID \n", + "45758153 Automatic Front 04-May Left wheel Blue 0 \n", + "45699930 Variator 4x4 04-May Left wheel White 8 \n", + "45646562 Automatic Front 04-May Left wheel Red 12 \n", + "45656923 Automatic 4x4 04-May Left wheel Silver 12 \n", + "45815887 Automatic Front 04-May Left wheel Blue 12 \n", + "... ... ... ... ... ... ... 
\n", + "45802363 Automatic Front 04-May Left wheel Black 4 \n", + "45812777 Automatic Front 04-May Left wheel Blue 12 \n", + "44104417 Variator Front 04-May Right-hand drive White 2 \n", + "45793406 Manual Front 02-Mar Left wheel Grey 2 \n", + "45700700 Variator Front 04-May Left wheel Black 10 \n", + "\n", + "[15389 rows x 17 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Gear box type
ID
45758153Automatic
45699930Variator
45646562Automatic
45656923Automatic
45815887Automatic
......
45802363Automatic
45812777Automatic
44104417Variator
45793406Manual
45700700Variator
\n", + "

15389 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Gear box type\n", + "ID \n", + "45758153 Automatic\n", + "45699930 Variator\n", + "45646562 Automatic\n", + "45656923 Automatic\n", + "45815887 Automatic\n", + "... ...\n", + "45802363 Automatic\n", + "45812777 Automatic\n", + "44104417 Variator\n", + "45793406 Manual\n", + "45700700 Variator\n", + "\n", + "[15389 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyManufacturerModelProd_yearCategoryLeather_interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
ID
45813151220919MERCEDES-BENZML 3502012JeepYesDiesel3209072 km6.0Automatic4x404-MayLeft wheelGrey12
4578374411000-JEEPLiberty2001JeepYesLPG3.7137582 km6.0Automatic4x404-MayRight-hand driveSilver6
4580585010976-TOYOTARAV 42002JeepYesCNG2200000 km4.0Automatic4x404-MayLeft wheelWhite4
458164091568753HYUNDAISonata2012SedanYesPetrol2.4246230 km4.0AutomaticFront04-MayLeft wheelBlack12
452812428938843TOYOTAPrius2008SedanNoHybrid1.5133016 km4.0AutomaticFront04-MayLeft wheelBeige8
......................................................
4579847813172639FORDFocus2014SedanYesPetrol2134400 km4.0TiptronicFront04-MayLeft wheelRed8
4532190916621-TOYOTAPrius2010HatchbackNoHybrid1.8154000 km4.0VariatorFront04-MayLeft wheelWhite6
45758118156811811LEXUSGX 4602010JeepYesPetrol4.6275240 km8.0Automatic4x404-MayLeft wheelSilver0
457581376476-NISSANNote2008HatchbackNoCNG1.5999999999 km4.0Automatic4x404-MayRight-hand driveBlack0
457204113697VOLKSWAGENJetta2015SedanYesPetrol1.8 Turbo65000 km4.0AutomaticFront04-MayLeft wheelGrey12
\n", + "

3848 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Manufacturer Model Prod_year Category \\\n", + "ID \n", + "45813151 220 919 MERCEDES-BENZ ML 350 2012 Jeep \n", + "45783744 11000 - JEEP Liberty 2001 Jeep \n", + "45805850 10976 - TOYOTA RAV 4 2002 Jeep \n", + "45816409 1568 753 HYUNDAI Sonata 2012 Sedan \n", + "45281242 8938 843 TOYOTA Prius 2008 Sedan \n", + "... ... ... ... ... ... ... \n", + "45798478 13172 639 FORD Focus 2014 Sedan \n", + "45321909 16621 - TOYOTA Prius 2010 Hatchback \n", + "45758118 15681 1811 LEXUS GX 460 2010 Jeep \n", + "45758137 6476 - NISSAN Note 2008 Hatchback \n", + "45720411 3 697 VOLKSWAGEN Jetta 2015 Sedan \n", + "\n", + " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n", + "ID \n", + "45813151 Yes Diesel 3 209072 km 6.0 \n", + "45783744 Yes LPG 3.7 137582 km 6.0 \n", + "45805850 Yes CNG 2 200000 km 4.0 \n", + "45816409 Yes Petrol 2.4 246230 km 4.0 \n", + "45281242 No Hybrid 1.5 133016 km 4.0 \n", + "... ... ... ... ... ... \n", + "45798478 Yes Petrol 2 134400 km 4.0 \n", + "45321909 No Hybrid 1.8 154000 km 4.0 \n", + "45758118 Yes Petrol 4.6 275240 km 8.0 \n", + "45758137 No CNG 1.5 999999999 km 4.0 \n", + "45720411 Yes Petrol 1.8 Turbo 65000 km 4.0 \n", + "\n", + " Gear box type Drive wheels Doors Wheel Color Airbags \n", + "ID \n", + "45813151 Automatic 4x4 04-May Left wheel Grey 12 \n", + "45783744 Automatic 4x4 04-May Right-hand drive Silver 6 \n", + "45805850 Automatic 4x4 04-May Left wheel White 4 \n", + "45816409 Automatic Front 04-May Left wheel Black 12 \n", + "45281242 Automatic Front 04-May Left wheel Beige 8 \n", + "... ... ... ... ... ... ... 
\n", + "45798478 Tiptronic Front 04-May Left wheel Red 8 \n", + "45321909 Variator Front 04-May Left wheel White 6 \n", + "45758118 Automatic 4x4 04-May Left wheel Silver 0 \n", + "45758137 Automatic 4x4 04-May Right-hand drive Black 0 \n", + "45720411 Automatic Front 04-May Left wheel Grey 12 \n", + "\n", + "[3848 rows x 17 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Gear box type
ID
45813151Automatic
45783744Automatic
45805850Automatic
45816409Automatic
45281242Automatic
......
45798478Tiptronic
45321909Variator
45758118Automatic
45758137Automatic
45720411Automatic
\n", + "

3848 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Gear box type\n", + "ID \n", + "45813151 Automatic\n", + "45783744 Automatic\n", + "45805850 Automatic\n", + "45816409 Automatic\n", + "45281242 Automatic\n", + "... ...\n", + "45798478 Tiptronic\n", + "45321909 Variator\n", + "45758118 Automatic\n", + "45758137 Automatic\n", + "45720411 Automatic\n", + "\n", + "[3848 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from utils import split_stratified_into_train_val_test\n", + "\n", + "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", + " df, stratify_colname=\"Gear box type\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state\n", + ")\n", + "\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В итоге, этот код выполняет следующие действия:\n", + "\n", + "* Заполняет пропущенные значения: В числовых столбцах медианой, в категориальных - значением \"unknown\".\n", + "* Стандартизирует числовые данные: приводит их к нулевому среднему и единичному стандартному отклонению.\n", + "* Преобразует категориальные данные: использует one-hot-кодирование.\n", + "* Удаляет ненужные столбцы: из списка `columns_to_drop`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование конвейера для классификации данных\n", + "\n", + "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", + "\n", + "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", + "\n", + "features_preprocessing -- трансформер для предобработки признаков\n", + "\n", + "features_engineering -- трансформер для конструирования признаков\n", + "\n", + "drop_columns -- трансформер для 
удаления колонок\n", + "\n", + "features_postprocessing -- трансформер для унитарного кодирования новых признаков\n", + "\n", + "pipeline_end -- основной конвейер предобработки данных и конструирования признаков\n", + "\n", + "Конвейер выполняется последовательно.\n", + "\n", + "Трансформер выполняет параллельно для указанного набора колонок.\n", + "\n", + "Документация: \n", + "\n", + "https://scikit-learn.org/1.5/api/sklearn.pipeline.html\n", + "\n", + "https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.discriminant_analysis import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "from transformers import TitanicFeatures\n", + "\n", + "\n", + "#columns_to_drop = [\"Survived\", \"Name\", \"Cabin\", \"Ticket\", \"Embarked\", \"Parch\", \"Fare\"]\n", + "columns_to_drop = [\"Doors\", \"Color\", \"Gear box type\", \"Prod_year\", \"Mileage\", \"Airbags\", \"Levy\", \"Leather_interior\", \"Fuel type\", \"Drive wheels\"]\n", + "num_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype != \"object\"\n", + "]\n", + "cat_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype == \"object\"\n", + "]\n", + "\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = 
OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, num_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " #(\"prepocessing_features\", cat_imputer, [\"Name\", \"Cabin\"]),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# features_engineering = ColumnTransformer(\n", + "# verbose_feature_names_out=False,\n", + "# transformers=[\n", + "# (\"add_features\", TitanicFeatures(), [\"Name\", \"Cabin\"]),\n", + "# ],\n", + "# remainder=\"passthrough\",\n", + "# )\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# features_postprocessing = ColumnTransformer(\n", + "# verbose_feature_names_out=False,\n", + "# transformers=[\n", + "# (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n", + "# ],\n", + "# remainder=\"passthrough\",\n", + "# )\n", + "\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " # (\"features_engineering\", features_engineering),\n", + " (\"drop_columns\", drop_columns),\n", + " # (\"features_postprocessing\", features_postprocessing),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Демонстрация работы конвейера для предобработки данных при классификации" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceCylindersManufacturer_ALFA ROMEOManufacturer_ASTON MARTINManufacturer_AUDIManufacturer_BENTLEYManufacturer_BMWManufacturer_BUICKManufacturer_CADILLACManufacturer_CHEVROLET...Engine volume_5.7 TurboEngine volume_5.8Engine volume_5.9Engine volume_6Engine volume_6.2Engine volume_6.3Engine volume_6.3 TurboEngine volume_6.7Engine volume_6.8Wheel_Right-hand drive
ID
45758153-0.082497-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45699930-0.007675-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45646562-0.0824971.1870620.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45656923-0.0423221.1870620.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45815887-0.037165-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
458023630.013743-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45812777-0.087729-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
44104417-0.017260-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
45793406-0.074021-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45700700-0.000304-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

15389 rows × 1573 columns

\n", + "
" + ], + "text/plain": [ + " Price Cylinders Manufacturer_ALFA ROMEO \\\n", + "ID \n", + "45758153 -0.082497 -0.485038 0.0 \n", + "45699930 -0.007675 -0.485038 0.0 \n", + "45646562 -0.082497 1.187062 0.0 \n", + "45656923 -0.042322 1.187062 0.0 \n", + "45815887 -0.037165 -0.485038 0.0 \n", + "... ... ... ... \n", + "45802363 0.013743 -0.485038 0.0 \n", + "45812777 -0.087729 -0.485038 0.0 \n", + "44104417 -0.017260 -0.485038 0.0 \n", + "45793406 -0.074021 -0.485038 0.0 \n", + "45700700 -0.000304 -0.485038 0.0 \n", + "\n", + " Manufacturer_ASTON MARTIN Manufacturer_AUDI Manufacturer_BENTLEY \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Manufacturer_BMW Manufacturer_BUICK Manufacturer_CADILLAC \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Manufacturer_CHEVROLET ... Engine volume_5.7 Turbo \\\n", + "ID ... \n", + "45758153 0.0 ... 0.0 \n", + "45699930 0.0 ... 0.0 \n", + "45646562 0.0 ... 0.0 \n", + "45656923 0.0 ... 0.0 \n", + "45815887 0.0 ... 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 ... 0.0 \n", + "45812777 0.0 ... 0.0 \n", + "44104417 0.0 ... 0.0 \n", + "45793406 0.0 ... 0.0 \n", + "45700700 0.0 ... 0.0 \n", + "\n", + " Engine volume_5.8 Engine volume_5.9 Engine volume_6 \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... 
\n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Engine volume_6.2 Engine volume_6.3 Engine volume_6.3 Turbo \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Engine volume_6.7 Engine volume_6.8 Wheel_Right-hand drive \n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 1.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + "[15389 rows x 1573 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "preprocessed_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование набора моделей для классификации\n", + "\n", + "logistic -- логистическая регрессия\n", + "\n", + "ridge -- гребневая регрессия\n", + "\n", + "decision_tree -- дерево решений\n", + "\n", + "knn -- k-ближайших соседей\n", + "\n", + "naive_bayes -- наивный Байесовский классификатор\n", + "\n", + "gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n", + "\n", + "random_forest -- метод случайного леса (набор деревьев решений)\n", + "\n", + "mlp -- многослойный персептрон (нейронная сеть)\n", + "\n", + "Документация: 
https://scikit-learn.org/1.5/supervised_learning.html" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n", + "\n", + "class_models = {\n", + " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n", + " # \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n", + " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n", + " \"decision_tree\": {\n", + " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n", + " },\n", + " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n", + " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n", + " \"gradient_boosting\": {\n", + " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n", + " },\n", + " \"random_forest\": {\n", + " \"model\": ensemble.RandomForestClassifier(\n", + " max_depth=11, class_weight=\"balanced\", random_state=random_state\n", + " )\n", + " },\n", + " \"mlp\": {\n", + " \"model\": neural_network.MLPClassifier(\n", + " hidden_layer_sizes=(7,),\n", + " max_iter=100000,\n", + " early_stopping=True,\n", + " random_state=random_state,\n", + " )\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gear box type object\n", + "dtype: object\n", + "Gear box type object\n", + "dtype: object\n", + "\n", + "Index: 19237 entries, 45654403 to 45813273\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Price 19237 non-null int64 \n", + " 1 Levy 19237 non-null object \n", + " 2 Manufacturer 19237 non-null object \n", + " 3 Model 19237 non-null object \n", + " 4 Prod_year 19237 non-null int64 \n", + " 5 Category 
19237 non-null object \n", + " 6 Leather_interior 19237 non-null object \n", + " 7 Fuel type 19237 non-null object \n", + " 8 Engine volume 19237 non-null object \n", + " 9 Mileage 19237 non-null object \n", + " 10 Cylinders 19237 non-null float64\n", + " 11 Gear box type 19237 non-null object \n", + " 12 Drive wheels 19237 non-null object \n", + " 13 Doors 19237 non-null object \n", + " 14 Wheel 19237 non-null object \n", + " 15 Color 19237 non-null object \n", + " 16 Airbags 19237 non-null int64 \n", + "dtypes: float64(1), int64(3), object(13)\n", + "memory usage: 2.6+ MB\n" + ] + } + ], + "source": [ + "print(y_train.dtypes)\n", + "print(y_test.dtypes)\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Обучение моделей на обучающем наборе данных и оценка на тестовом" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: logistic\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1, 3] during transform. 
These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Mix of label input types (string and number)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[16], line 28\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# class_models[model_name][\"Precision_train\"] = metrics.precision_score(\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# y_train, y_train_predict, average=\"micro\"\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# class_models[model_name][\"Precision_test\"] = metrics.precision_score(\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# y_test, y_test_predict\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 25\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecall_train\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39mrecall_score(\n\u001b[0;32m 26\u001b[0m y_train, y_train_predict, average\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmicro\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 27\u001b[0m )\n\u001b[1;32m---> 28\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecall_test\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecall_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43my_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_test_predict\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 31\u001b[0m 
class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy_train\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39maccuracy_score(\n\u001b[0;32m 32\u001b[0m y_train, y_train_predict\n\u001b[0;32m 33\u001b[0m )\n\u001b[0;32m 34\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy_test\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39maccuracy_score(\n\u001b[0;32m 35\u001b[0m y_test, y_test_predict\n\u001b[0;32m 36\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by 
the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:2385\u001b[0m, in \u001b[0;36mrecall_score\u001b[1;34m(y_true, y_pred, labels, pos_label, average, sample_weight, zero_division)\u001b[0m\n\u001b[0;32m 2217\u001b[0m \u001b[38;5;129m@validate_params\u001b[39m(\n\u001b[0;32m 2218\u001b[0m {\n\u001b[0;32m 2219\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_true\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray-like\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msparse matrix\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2244\u001b[0m zero_division\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwarn\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 2245\u001b[0m ):\n\u001b[0;32m 2246\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Compute the recall.\u001b[39;00m\n\u001b[0;32m 2247\u001b[0m \n\u001b[0;32m 2248\u001b[0m \u001b[38;5;124;03m The recall is the ratio ``tp / (tp + fn)`` where 
``tp`` is the number of\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2383\u001b[0m \u001b[38;5;124;03m array([1. , 1. , 0.5])\u001b[39;00m\n\u001b[0;32m 2384\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2385\u001b[0m _, r, _, _ \u001b[38;5;241m=\u001b[39m \u001b[43mprecision_recall_fscore_support\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2386\u001b[0m \u001b[43m \u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2387\u001b[0m \u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2388\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2389\u001b[0m \u001b[43m \u001b[49m\u001b[43mpos_label\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpos_label\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2390\u001b[0m \u001b[43m \u001b[49m\u001b[43maverage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maverage\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2391\u001b[0m \u001b[43m \u001b[49m\u001b[43mwarn_for\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrecall\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2392\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2393\u001b[0m \u001b[43m \u001b[49m\u001b[43mzero_division\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mzero_division\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2394\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2395\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:186\u001b[0m, in 
\u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 184\u001b[0m global_skip_validation \u001b[38;5;241m=\u001b[39m get_config()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskip_parameter_validation\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m global_skip_validation:\n\u001b[1;32m--> 186\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 188\u001b[0m func_sig \u001b[38;5;241m=\u001b[39m signature(func)\n\u001b[0;32m 190\u001b[0m \u001b[38;5;66;03m# Map *args/**kwargs to the function signature\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1789\u001b[0m, in \u001b[0;36mprecision_recall_fscore_support\u001b[1;34m(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight, zero_division)\u001b[0m\n\u001b[0;32m 1626\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute precision, recall, F-measure and support for each class.\u001b[39;00m\n\u001b[0;32m 1627\u001b[0m \n\u001b[0;32m 1628\u001b[0m \u001b[38;5;124;03mThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1786\u001b[0m \u001b[38;5;124;03m array([2, 2, 2]))\u001b[39;00m\n\u001b[0;32m 1787\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1788\u001b[0m _check_zero_division(zero_division)\n\u001b[1;32m-> 1789\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[43m_check_set_wise_labels\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maverage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpos_label\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1791\u001b[0m \u001b[38;5;66;03m# Calculate tp_sum, pred_sum, true_sum ###\u001b[39;00m\n\u001b[0;32m 1792\u001b[0m samplewise \u001b[38;5;241m=\u001b[39m average \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msamples\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1564\u001b[0m, in \u001b[0;36m_check_set_wise_labels\u001b[1;34m(y_true, y_pred, average, labels, pos_label)\u001b[0m\n\u001b[0;32m 1561\u001b[0m y_type, y_true, y_pred \u001b[38;5;241m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m 1562\u001b[0m \u001b[38;5;66;03m# Convert to Python primitive type to avoid NumPy type / Python str\u001b[39;00m\n\u001b[0;32m 1563\u001b[0m \u001b[38;5;66;03m# comparison. 
See https://github.com/numpy/numpy/issues/6784\u001b[39;00m\n\u001b[1;32m-> 1564\u001b[0m present_labels \u001b[38;5;241m=\u001b[39m \u001b[43munique_labels\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtolist()\n\u001b[0;32m 1565\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m average \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinary\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 1566\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinary\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\multiclass.py:114\u001b[0m, in \u001b[0;36munique_labels\u001b[1;34m(*ys)\u001b[0m\n\u001b[0;32m 112\u001b[0m \u001b[38;5;66;03m# Check that we don't mix string type with number type\u001b[39;00m\n\u001b[0;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mset\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(label, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m label \u001b[38;5;129;01min\u001b[39;00m ys_labels)) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 114\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMix of label input types (string and number)\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m xp\u001b[38;5;241m.\u001b[39masarray(\u001b[38;5;28msorted\u001b[39m(ys_labels))\n", + "\u001b[1;31mValueError\u001b[0m: Mix of label input types (string and number)" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn import metrics\n", + "\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: 
{model_name}\")\n", + " model = class_models[model_name][\"model\"]\n", + "\n", + " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", + " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n", + "\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n", + " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n", + "\n", + " class_models[model_name][\"pipeline\"] = model_pipeline\n", + " class_models[model_name][\"probs\"] = y_test_probs\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n", + " y_train, y_train_predict, average=\"micro\"\n", + " )\n", + " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n", + " y_train, y_train_predict, average=\"micro\"\n", + " )\n", + " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n", + " y_test, y_test_probs\n", + " )\n", + " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n", + " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n", + " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Confusion_matrix\"] = 
metrics.confusion_matrix(\n", + " y_test, y_test_predict\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Сводная таблица оценок качества для использованных моделей классификации\n", + "\n", + "Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Матрица неточностей" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "import matplotlib.pyplot as plt\n", + "\n", + "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n", + "for index, key in enumerate(class_models.keys()):\n", + " c_matrix = class_models[key][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Died\", \"Sirvived\"]\n", + " ).plot(ax=ax.flat[index])\n", + " disp.ax_.set_title(key)\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Точность, полнота, верность (аккуратность), F-мера" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(\n", + " by=\"Accuracy_test\", ascending=False\n", + ").style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " 
high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n", + "\n", + "display(best_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Вывод данных с ошибкой предсказания для оценки" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessing_result = pipeline_end.transform(X_test)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "y_pred = class_models[best_model][\"preds\"]\n", + "\n", + "error_index = y_test[y_test[\"Survived\"] != y_pred].index.tolist()\n", + "display(f\"Error items count: {len(error_index)}\")\n", + "\n", + "error_predicted = pd.Series(y_pred, 
index=y_test.index).loc[error_index]\n", + "error_df = X_test.loc[error_index].copy()\n", + "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n", + "error_df.sort_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Пример использования обученной модели (конвейера) для предсказания" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = class_models[best_model][\"pipeline\"]\n", + "\n", + "example_id = 450\n", + "test = pd.DataFrame(X_test.loc[example_id, :]).T\n", + "test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n", + "display(test)\n", + "display(test_preprocessed)\n", + "result_proba = model.predict_proba(test)[0]\n", + "result = model.predict(test)[0]\n", + "real = int(y_test.loc[example_id].values[0])\n", + "display(f\"predicted: {result} (proba: {result_proba})\")\n", + "display(f\"real: {real}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Подбор гиперпараметров методом поиска по сетке\n", + "\n", + "https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "optimized_model_type = \"random_forest\"\n", + "\n", + "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n", + "\n", + "param_grid = {\n", + " \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n", + " \"model__max_features\": [\"sqrt\", \"log2\", 2],\n", + " \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9 ,10],\n", + " \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n", + "}\n", + "\n", + "gs_optomizer = GridSearchCV(\n", + " estimator=random_forest_model, 
param_grid=param_grid, n_jobs=-1\n", + ")\n", + "gs_optomizer.fit(X_train, y_train.values.ravel())\n", + "gs_optomizer.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучение модели с новыми гиперпараметрами" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_model = ensemble.RandomForestClassifier(\n", + " random_state=random_state,\n", + " criterion=\"gini\",\n", + " max_depth=7,\n", + " max_features=\"sqrt\",\n", + " n_estimators=30,\n", + ")\n", + "\n", + "result = {}\n", + "\n", + "result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n", + "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n", + "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n", + "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n", + "\n", + "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n", + "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n", + "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n", + "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n", + "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n", + "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n", + "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n", + "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n", + "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n", + "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n", + "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n", + "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Формирование данных для оценки старой и новой версии модели" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=class_models[optimized_model_type]\n", + ")\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=result\n", + ")\n", + "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n", + "optimized_metrics = optimized_metrics.set_index(\"Name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оценка параметров старой и новой модели" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", 
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class TitanicFeatures(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the Titanic dataset.

    Adds two derived columns to the input frame:

    * ``Is_married`` -- 1 when the honorific parsed from ``Name``
      (the token between the comma and the period, e.g.
      "Braund, Mr. Owen Harris" -> "Mr") equals "Mrs", else 0.
    * ``Cabin_type`` -- first character of ``Cabin`` (the deck letter),
      or ``"unknown"`` when the cabin value is missing.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``Is_married`` and ``Cabin_type`` added.

        The original code mutated the caller's DataFrame in place, which
        violates the scikit-learn convention that transformers are
        side-effect free; we work on a copy instead.
        """

        def get_title(name) -> str:
            # "Surname, Title. Given names" -> "Title"
            return name.split(",")[1].split(".")[0].strip()

        def get_cabin_type(cabin) -> str:
            # NaN cabins (missing data) are bucketed as "unknown".
            return "unknown" if pd.isna(cabin) else cabin[0]

        X = X.copy()
        X["Is_married"] = [1 if get_title(name) == "Mrs" else 0 for name in X["Name"]]
        X["Cabin_type"] = [get_cabin_type(cabin) for cabin in X["Cabin"]]
        return X

    def get_feature_names_out(self, features_in):
        # Output schema = input feature names plus the two engineered columns.
        return np.append(features_in, ["Is_married", "Cabin_type"], axis=0)
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Split a Pandas dataframe into three stratified subsets (train/val/test).

    Each subset preserves the relative frequency of the values in
    ``stratify_colname``; the split is performed by running
    train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column used for stratification. Usually this
        column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train,
        val, and test data. The values should be expressed as float
        fractions and should sum to 1.0 (checked with a small tolerance).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        The three feature splits and the corresponding
        stratification-column splits (six dataframes in total).  When
        ``frac_val <= 0``, the val parts are returned as empty dataframes.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, or if ``stratify_colname``
        is not a column of ``df_input``.
    """

    # Exact float equality (`!= 1.0`) wrongly rejects valid triples such
    # as (0.6, 0.3, 0.1), whose float sum is 0.9999999999999999; compare
    # with a tolerance instead.
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        # No validation subset requested: "temp" is the whole test split.
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes, rescaling
    # the test fraction relative to the remaining (val + test) mass.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test