MII_Salin_Oleg_PIbd-33/lec3.ipynb

3468 lines
123 KiB
Plaintext
Raw Permalink Normal View History

2024-10-23 13:43:55 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Унитарное кодирование\n",
"\n",
"Преобразование категориального признака в несколько бинарных признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-08 22:37:34 +04:00
"#### Загрузка набора данных World Population"
2024-10-23 13:43:55 +04:00
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 1,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country</th>\n",
" <th>Population2020</th>\n",
" <th>Yearly</th>\n",
" <th>NetChange</th>\n",
" <th>Density</th>\n",
" <th>LandArea</th>\n",
" <th>Migrants</th>\n",
" <th>FertRate</th>\n",
" <th>MedAge</th>\n",
" <th>UrbanPop</th>\n",
" <th>WorldShare</th>\n",
" <th>Net Change</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>China</td>\n",
" <td>1439323776</td>\n",
" <td>0.39</td>\n",
" <td>5,540,090</td>\n",
" <td>153</td>\n",
" <td>9388211</td>\n",
" <td>-348,399</td>\n",
" <td>1.7</td>\n",
" <td>38</td>\n",
" <td>61%</td>\n",
" <td>18.47%</td>\n",
" <td>5540090</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>India</td>\n",
" <td>1380004385</td>\n",
" <td>0.99</td>\n",
" <td>13,586,631</td>\n",
" <td>464</td>\n",
" <td>2973190</td>\n",
" <td>-532,687</td>\n",
" <td>2.2</td>\n",
" <td>28</td>\n",
" <td>35%</td>\n",
" <td>17.70%</td>\n",
" <td>13586631</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>United States</td>\n",
" <td>331002651</td>\n",
" <td>0.59</td>\n",
" <td>1,937,734</td>\n",
" <td>36</td>\n",
" <td>9147420</td>\n",
" <td>954,806</td>\n",
" <td>1.8</td>\n",
" <td>38</td>\n",
" <td>83%</td>\n",
" <td>4.25%</td>\n",
" <td>1937734</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Indonesia</td>\n",
" <td>273523615</td>\n",
" <td>1.07</td>\n",
" <td>2,898,047</td>\n",
" <td>151</td>\n",
" <td>1811570</td>\n",
" <td>-98,955</td>\n",
" <td>2.3</td>\n",
" <td>30</td>\n",
" <td>56%</td>\n",
" <td>3.51%</td>\n",
" <td>2898047</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Pakistan</td>\n",
" <td>220892340</td>\n",
" <td>2.00</td>\n",
" <td>4,327,022</td>\n",
" <td>287</td>\n",
" <td>770880</td>\n",
" <td>-233,379</td>\n",
" <td>3.6</td>\n",
" <td>23</td>\n",
" <td>35%</td>\n",
" <td>2.83%</td>\n",
" <td>4327022</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>231</th>\n",
" <td>Montserrat</td>\n",
" <td>4992</td>\n",
" <td>0.06</td>\n",
" <td>3</td>\n",
" <td>50</td>\n",
" <td>100</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>10%</td>\n",
" <td>0.00%</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>Falkland Islands</td>\n",
" <td>3480</td>\n",
" <td>3.05</td>\n",
" <td>103</td>\n",
" <td>0</td>\n",
" <td>12170</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>66%</td>\n",
" <td>0.00%</td>\n",
" <td>103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>233</th>\n",
" <td>Niue</td>\n",
" <td>1626</td>\n",
" <td>0.68</td>\n",
" <td>11</td>\n",
" <td>6</td>\n",
" <td>260</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>46%</td>\n",
" <td>0.00%</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>Tokelau</td>\n",
" <td>1357</td>\n",
" <td>1.27</td>\n",
" <td>17</td>\n",
" <td>136</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>0%</td>\n",
" <td>0.00%</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>Holy See</td>\n",
" <td>801</td>\n",
" <td>0.25</td>\n",
" <td>2</td>\n",
" <td>2,003</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>0.00%</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>235 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" Country Population2020 Yearly NetChange Density LandArea \\\n",
"no \n",
"1 China 1439323776 0.39 5,540,090 153 9388211 \n",
"2 India 1380004385 0.99 13,586,631 464 2973190 \n",
"3 United States 331002651 0.59 1,937,734 36 9147420 \n",
"4 Indonesia 273523615 1.07 2,898,047 151 1811570 \n",
"5 Pakistan 220892340 2.00 4,327,022 287 770880 \n",
".. ... ... ... ... ... ... \n",
"231 Montserrat 4992 0.06 3 50 100 \n",
"232 Falkland Islands 3480 3.05 103 0 12170 \n",
"233 Niue 1626 0.68 11 6 260 \n",
"234 Tokelau 1357 1.27 17 136 10 \n",
"235 Holy See 801 0.25 2 2,003 0 \n",
"\n",
" Migrants FertRate MedAge UrbanPop WorldShare Net Change \n",
"no \n",
"1 -348,399 1.7 38 61% 18.47% 5540090 \n",
"2 -532,687 2.2 28 35% 17.70% 13586631 \n",
"3 954,806 1.8 38 83% 4.25% 1937734 \n",
"4 -98,955 2.3 30 56% 3.51% 2898047 \n",
"5 -233,379 3.6 23 35% 2.83% 4327022 \n",
".. ... ... ... ... ... ... \n",
"231 NaN N.A. N.A. 10% 0.00% 3 \n",
"232 NaN N.A. N.A. 66% 0.00% 103 \n",
"233 NaN N.A. N.A. 46% 0.00% 11 \n",
"234 NaN N.A. N.A. 0% 0.00% 17 \n",
"235 NaN N.A. N.A. N.A. 0.00% 2 \n",
"\n",
"[235 rows x 12 columns]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 1,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the World Population 2020 table; the \"no\" column becomes the row index.\n",
"countries = pd.read_csv(\n",
"    \"data/world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"# Integer columns are parsed from text by dropping the \",\" separators.\n",
"# The parsed NetChange is written to a separate \"Net Change\" column,\n",
"# so the original text column stays in the frame.\n",
"for parsed_name, raw_name in (\n",
"    (\"Population2020\", \"Population2020\"),\n",
"    (\"Net Change\", \"NetChange\"),\n",
"    (\"LandArea\", \"LandArea\"),\n",
"):\n",
"    countries[parsed_name] = countries[raw_name].map(\n",
"        lambda text: int(text.replace(\",\", \"\"))\n",
"    )\n",
"\n",
"# Yearly is parsed as a float after removing the trailing \"%\" sign.\n",
"countries[\"Yearly\"] = countries[\"Yearly\"].map(lambda text: float(text.rstrip(\"%\")))\n",
"countries"
]
},
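{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked) на примере набора Titanic (код ниже закомментирован: в наборе World Population таких признаков нет)"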
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кодирование"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 2,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"import numpy as np\n",
"\n",
"# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
"\n",
"# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
"\n",
"# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
"\n",
"# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
"\n",
"# encoded_values_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Добавление признаков в исходный Dataframe"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 3,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [],
"source": [
"# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
"\n",
"# titanic"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Дискретизация признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Равномерное разделение данных на 3 группы"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 4,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [],
"source": [
"# Category names for the three-way discretization of LandArea.\n",
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
"# One bin per label, derived from the list so the two cannot drift apart.\n",
"num_bins = len(labels)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 5,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 0. , 5458956.66666667, 10917913.33333333,\n",
" 16376870. ]),\n",
" array([229, 5, 1]))"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 5,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# LandArea with any missing values replaced by the column median.\n",
"land_area_filled = countries[\"LandArea\"].fillna(countries[\"LandArea\"].median())\n",
"\n",
"# np.histogram returns (counts, edges) for num_bins equal-width bins.\n",
"hist1, bins1 = np.histogram(land_area_filled, bins=num_bins)\n",
"bins1, hist1"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 6,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>(5458956.667, 10917913.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>(5458956.667, 10917913.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>(5458956.667, 10917913.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>(10917913.333, 16376870.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>(0.0, 5458956.667]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 (5458956.667, 10917913.333]\n",
"2 2973190 (0.0, 5458956.667]\n",
"3 9147420 (5458956.667, 10917913.333]\n",
"4 1811570 (0.0, 5458956.667]\n",
"5 770880 (0.0, 5458956.667]\n",
"6 8358140 (5458956.667, 10917913.333]\n",
"7 910770 (0.0, 5458956.667]\n",
"8 130170 (0.0, 5458956.667]\n",
"9 16376870 (10917913.333, 16376870.0]\n",
"10 1943950 (0.0, 5458956.667]\n",
"11 364555 (0.0, 5458956.667]\n",
"12 1000000 (0.0, 5458956.667]\n",
"13 298170 (0.0, 5458956.667]\n",
"14 995450 (0.0, 5458956.667]\n",
"15 310070 (0.0, 5458956.667]\n",
"16 2267050 (0.0, 5458956.667]\n",
"17 769630 (0.0, 5458956.667]\n",
"18 1628550 (0.0, 5458956.667]\n",
"19 348560 (0.0, 5458956.667]\n",
"20 510890 (0.0, 5458956.667]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 6,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show each LandArea value next to the interval it falls into.\n",
"# include_lowest=True closes the first interval on the left, so rows whose\n",
"# LandArea equals the lowest edge (0) get a bin instead of NaN, in line with\n",
"# the np.histogram counts. pandas displays that first interval with a\n",
"# slightly lowered left edge.\n",
"land_area_bins1 = pd.cut(countries[\"LandArea\"], list(bins1), include_lowest=True)\n",
"pd.concat([countries[\"LandArea\"], land_area_bins1], axis=1).head(20)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 7,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 Middle\n",
"2 2973190 Small\n",
"3 9147420 Middle\n",
"4 1811570 Small\n",
"5 770880 Small\n",
"6 8358140 Middle\n",
"7 910770 Small\n",
"8 130170 Small\n",
"9 16376870 Big\n",
"10 1943950 Small\n",
"11 364555 Small\n",
"12 1000000 Small\n",
"13 298170 Small\n",
"14 995450 Small\n",
"15 310070 Small\n",
"16 2267050 Small\n",
"17 769630 Small\n",
"18 1628550 Small\n",
"19 348560 Small\n",
"20 510890 Small"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 7,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same binning, but each interval is replaced by its name from `labels`.\n",
"# include_lowest=True keeps rows sitting exactly on the lowest edge (0) in the\n",
"# first group instead of turning them into NaN.\n",
"land_area_groups1 = pd.cut(\n",
"    countries[\"LandArea\"], list(bins1), labels=labels, include_lowest=True\n",
")\n",
"pd.concat([countries[\"LandArea\"], land_area_groups1], axis=1).head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Равномерное разделение данных на 3 группы с установкой собственной границы диапазона значений (от 0 до 12 000 000)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 8,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 0., 4000000., 8000000., 12000000.]),\n",
" array([229, 1, 4, 1]))"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 8,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
"# Four equally spaced edges give three intervals over a fixed 0..12,000,000 range.\n",
"bins2 = np.linspace(start=0, stop=12_000_000, num=4)\n",
"\n",
"land_area_filled = countries[\"LandArea\"].fillna(countries[\"LandArea\"].median())\n",
"# np.digitize numbers the intervals from 1; values past the last edge get\n",
"# index len(bins2), so hist2 carries one extra trailing count for them.\n",
"tmp_bins2 = np.digitize(land_area_filled, bins2)\n",
"\n",
"# Shift to 0-based interval numbers before counting occurrences.\n",
"hist2 = np.bincount(tmp_bins2 - 1)\n",
"\n",
"bins2, hist2"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 9,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>(8000000.0, 12000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>(8000000.0, 12000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>(8000000.0, 12000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>(0.0, 4000000.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 (8000000.0, 12000000.0]\n",
"2 2973190 (0.0, 4000000.0]\n",
"3 9147420 (8000000.0, 12000000.0]\n",
"4 1811570 (0.0, 4000000.0]\n",
"5 770880 (0.0, 4000000.0]\n",
"6 8358140 (8000000.0, 12000000.0]\n",
"7 910770 (0.0, 4000000.0]\n",
"8 130170 (0.0, 4000000.0]\n",
"9 16376870 NaN\n",
"10 1943950 (0.0, 4000000.0]\n",
"11 364555 (0.0, 4000000.0]\n",
"12 1000000 (0.0, 4000000.0]\n",
"13 298170 (0.0, 4000000.0]\n",
"14 995450 (0.0, 4000000.0]\n",
"15 310070 (0.0, 4000000.0]\n",
"16 2267050 (0.0, 4000000.0]\n",
"17 769630 (0.0, 4000000.0]\n",
"18 1628550 (0.0, 4000000.0]\n",
"19 348560 (0.0, 4000000.0]\n",
"20 510890 (0.0, 4000000.0]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 9,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Intervals from the fixed-range edges in bins2. Values above the top edge\n",
"# fall outside every interval and stay NaN.\n",
"# include_lowest=True closes the first interval on the left, so LandArea == 0\n",
"# is binned instead of becoming NaN. pandas displays that first interval with\n",
"# a slightly lowered left edge.\n",
"land_area_bins2 = pd.cut(countries[\"LandArea\"], list(bins2), include_lowest=True)\n",
"pd.concat([countries[\"LandArea\"], land_area_bins2], axis=1).head(20)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 10,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>Small</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 Big\n",
"2 2973190 Small\n",
"3 9147420 Big\n",
"4 1811570 Small\n",
"5 770880 Small\n",
"6 8358140 Big\n",
"7 910770 Small\n",
"8 130170 Small\n",
"9 16376870 NaN\n",
"10 1943950 Small\n",
"11 364555 Small\n",
"12 1000000 Small\n",
"13 298170 Small\n",
"14 995450 Small\n",
"15 310070 Small\n",
"16 2267050 Small\n",
"17 769630 Small\n",
"18 1628550 Small\n",
"19 348560 Small\n",
"20 510890 Small"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 10,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Named groups over the fixed-range edges in bins2. Values above the top edge\n",
"# fall outside every interval and stay NaN.\n",
"# include_lowest=True keeps LandArea == 0 in the first group instead of NaN.\n",
"land_area_groups2 = pd.cut(\n",
"    countries[\"LandArea\"], list(bins2), labels=labels, include_lowest=True\n",
")\n",
"pd.concat([countries[\"LandArea\"], land_area_groups2], axis=1).head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разделение данных на 5 групп с установкой собственных интервалов (0 – 1 000, 1 000 – 100 000, 100 000 – 500 000, 500 000 – 3 000 000, более 3 000 000)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 11,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n",
" array([52, 77, 56, 44, 6]))"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 11,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
"# Hand-picked interval edges; np.inf leaves the last interval open-ended.\n",
"custom_edges = [0, 1000, 100000, 500000, 3000000, np.inf]\n",
"land_area_filled = countries[\"LandArea\"].fillna(countries[\"LandArea\"].median())\n",
"\n",
"# np.histogram returns (counts, edges); the edges come back as an array.\n",
"hist3, bins3 = np.histogram(land_area_filled, bins=custom_edges)\n",
"\n",
"bins3, hist3"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 12,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>(3000000.0, inf]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>(3000000.0, inf]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>(3000000.0, inf]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>(100000.0, 500000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>(3000000.0, inf]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>(100000.0, 500000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>(100000.0, 500000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>(100000.0, 500000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>(100000.0, 500000.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>(500000.0, 3000000.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 (3000000.0, inf]\n",
"2 2973190 (500000.0, 3000000.0]\n",
"3 9147420 (3000000.0, inf]\n",
"4 1811570 (500000.0, 3000000.0]\n",
"5 770880 (500000.0, 3000000.0]\n",
"6 8358140 (3000000.0, inf]\n",
"7 910770 (500000.0, 3000000.0]\n",
"8 130170 (100000.0, 500000.0]\n",
"9 16376870 (3000000.0, inf]\n",
"10 1943950 (500000.0, 3000000.0]\n",
"11 364555 (100000.0, 500000.0]\n",
"12 1000000 (500000.0, 3000000.0]\n",
"13 298170 (100000.0, 500000.0]\n",
"14 995450 (500000.0, 3000000.0]\n",
"15 310070 (100000.0, 500000.0]\n",
"16 2267050 (500000.0, 3000000.0]\n",
"17 769630 (500000.0, 3000000.0]\n",
"18 1628550 (500000.0, 3000000.0]\n",
"19 348560 (100000.0, 500000.0]\n",
"20 510890 (500000.0, 3000000.0]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 12,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Intervals from the custom edges in bins3.\n",
"# include_lowest=True closes the first interval on the left, so LandArea == 0\n",
"# is binned instead of becoming NaN, in line with the np.histogram counts.\n",
"# pandas displays that first interval with a slightly lowered left edge.\n",
"land_area_bins3 = pd.cut(countries[\"LandArea\"], list(bins3), include_lowest=True)\n",
"pd.concat([countries[\"LandArea\"], land_area_bins3], axis=1).head(20)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 13,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 Giant\n",
"2 2973190 Big\n",
"3 9147420 Giant\n",
"4 1811570 Big\n",
"5 770880 Big\n",
"6 8358140 Giant\n",
"7 910770 Big\n",
"8 130170 Middle\n",
"9 16376870 Giant\n",
"10 1943950 Big\n",
"11 364555 Middle\n",
"12 1000000 Big\n",
"13 298170 Middle\n",
"14 995450 Big\n",
"15 310070 Middle\n",
"16 2267050 Big\n",
"17 769630 Big\n",
"18 1628550 Big\n",
"19 348560 Middle\n",
"20 510890 Big"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 13,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Named groups (labels2) over the custom edges in bins3.\n",
"# include_lowest=True keeps LandArea == 0 in the first group instead of NaN.\n",
"land_area_groups3 = pd.cut(\n",
"    countries[\"LandArea\"], list(bins3), labels=labels2, include_lowest=True\n",
")\n",
"pd.concat([countries[\"LandArea\"], land_area_groups3], axis=1).head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Квантильное разделение данных на 5 групп\n"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 14,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 4\n",
"2 2973190 4\n",
"3 9147420 4\n",
"4 1811570 4\n",
"5 770880 4\n",
"6 8358140 4\n",
"7 910770 4\n",
"8 130170 2\n",
"9 16376870 4\n",
"10 1943950 4\n",
"11 364555 3\n",
"12 1000000 4\n",
"13 298170 3\n",
"14 995450 4\n",
"15 310070 3\n",
"16 2267050 4\n",
"17 769630 4\n",
"18 1628550 4\n",
"19 348560 3\n",
"20 510890 3"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 14,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quantile-based split into 5 groups; labels=False makes qcut return integer bin indices.\n",
"pd.concat(\n",
"    objs=[\n",
"        countries[\"LandArea\"],\n",
"        pd.qcut(countries[\"LandArea\"], q=5, labels=False),\n",
"    ],\n",
"    axis=\"columns\",\n",
").head(20)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 15,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LandArea</th>\n",
" <th>LandArea</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9388211</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2973190</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9147420</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1811570</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>770880</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>8358140</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>910770</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>130170</td>\n",
" <td>Middle</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>16376870</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1943950</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>364555</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1000000</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>298170</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>995450</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>310070</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2267050</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>769630</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1628550</td>\n",
" <td>Giant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>348560</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>510890</td>\n",
" <td>Big</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LandArea LandArea\n",
"no \n",
"1 9388211 Giant\n",
"2 2973190 Giant\n",
"3 9147420 Giant\n",
"4 1811570 Giant\n",
"5 770880 Giant\n",
"6 8358140 Giant\n",
"7 910770 Giant\n",
"8 130170 Middle\n",
"9 16376870 Giant\n",
"10 1943950 Giant\n",
"11 364555 Big\n",
"12 1000000 Giant\n",
"13 298170 Big\n",
"14 995450 Giant\n",
"15 310070 Big\n",
"16 2267050 Giant\n",
"17 769630 Giant\n",
"18 1628550 Giant\n",
"19 348560 Big\n",
"20 510890 Big"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 15,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same quantile-based split into 5 groups, but with the names from labels2 instead of indices.\n",
"pd.concat(\n",
"    objs=[\n",
"        countries[\"LandArea\"],\n",
"        pd.qcut(countries[\"LandArea\"], q=5, labels=labels2),\n",
"    ],\n",
"    axis=\"columns\",\n",
").head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Пример конструирования признаков на основе существующих\n",
"\n",
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
"\n",
"Is_married - замужняя ли женщина\n",
"\n",
"Cabin_type - палуба (тип каюты)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 16,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [],
"source": [
"# titanic_cl = titanic.drop(\n",
"# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n",
"# )\n",
"# titanic_cl = titanic_cl.dropna()\n",
"\n",
"# titanic_cl[\"Title\"] = [\n",
"# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n",
"# ]\n",
"\n",
"# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n",
"\n",
"# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n",
"\n",
"# titanic_cl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
"\n",
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-08 22:37:34 +04:00
"#### Загрузка данных"
2024-10-23 13:43:55 +04:00
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 17,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
2024-11-08 22:37:34 +04:00
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1379: SyntaxWarning: invalid escape sequence '\\l'\n",
" columns_string = \"\\l\".join(column_typing_info) # noqa: W605\n",
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1381: SyntaxWarning: invalid escape sequence '\\l'\n",
" label = \"{%s (%d row%s)|%s\\l}\" % ( # noqa: W605\n"
]
},
2024-10-23 13:43:55 +04:00
{
"data": {
"text/plain": [
"( no Country Population2020 Yearly NetChange Density \\\n",
" 0 1 China 1439323776 0.39 5540090 153 \n",
" 1 2 India 1380004385 0.99 13586631 464 \n",
" 2 3 United States 331002651 0.59 1937734 36 \n",
" 3 4 Indonesia 273523615 1.07 2898047 151 \n",
" 4 5 Pakistan 220892340 2.00 4327022 287 \n",
" .. ... ... ... ... ... ... \n",
" 230 231 Montserrat 4992 0.06 3 50 \n",
" 231 232 Falkland Islands 3480 3.05 103 0 \n",
" 232 233 Niue 1626 0.68 11 6 \n",
" 233 234 Tokelau 1357 1.27 17 136 \n",
" 234 235 Holy See 801 0.25 2 2,003 \n",
" \n",
" LandArea \n",
" 0 9388211 \n",
" 1 2973190 \n",
" 2 9147420 \n",
" 3 1811570 \n",
" 4 770880 \n",
" .. ... \n",
" 230 100 \n",
" 231 12170 \n",
" 232 260 \n",
" 233 10 \n",
" 234 0 \n",
" \n",
" [235 rows x 7 columns],\n",
" Year Population YearlyPer Yearly Median Fertility Density\n",
" 0 2020 7794798739 1.10 83000320 31 2.47 52\n",
" 1 2025 8184437460 0.98 77927744 32 2.54 55\n",
" 2 2030 8548487400 0.87 72809988 33 2.62 57\n",
" 3 2035 8887524213 0.78 67807363 34 2.70 60\n",
" 4 2040 9198847240 0.69 62264605 35 2.77 62\n",
" 5 2045 9481803274 0.61 56591207 35 2.85 64\n",
" 6 2050 9735033990 0.53 50646143 36 2.95 65,\n",
" Country Capital Continent\n",
" 0 Afghanistan Kabul Asia\n",
" 1 Albania Tirana Europe\n",
" 2 Algeria Algiers Africa\n",
" 3 American Samoa Pago Pago Oceania\n",
" 4 Andorra Andorra la Vella Europe\n",
" .. ... ... ...\n",
" 229 Wallis and Futuna Mata-Utu Oceania\n",
" 230 Western Sahara El Aai?�n Africa\n",
" 231 Yemen Sanaa Asia\n",
" 232 Zambia Lusaka Africa\n",
" 233 Zimbabwe Harare Africa\n",
" \n",
" [234 rows x 3 columns])"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 17,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import featuretools as ft\n",
"from woodwork.logical_types import Categorical, Datetime\n",
"\n",
"\n",
"def _parse_int(text):\n",
"    \"\"\"Turn a string with comma thousands separators (e.g. '1,234,567') into an int.\"\"\"\n",
"    return int(text.replace(\",\", \"\"))\n",
"\n",
"\n",
"def _parse_percent(text):\n",
"    \"\"\"Turn a number string with an optional trailing '%' (e.g. '0.39%') into a float.\"\"\"\n",
"    return float(text.rstrip(\"%\"))\n",
"\n",
"\n",
"info = pd.read_csv(\"data/world-population-by-country-2020.csv\")\n",
"forcast = pd.read_csv(\"data/world-population-forcast-2020-2050.csv\")\n",
"# NOTE(review): one capital name renders garbled in this cell's output, so the file\n",
"# may not really be ISO-8859-1 encoded - TODO confirm.\n",
"capitals = pd.read_csv(\"data/countries-continents-capitals.csv\", encoding=\"ISO-8859-1\")\n",
"\n",
"# These columns arrive as formatted text, so convert them to real numbers.\n",
"forcast = forcast.assign(\n",
"    Population=forcast[\"Population\"].apply(_parse_int),\n",
"    YearlyPer=forcast[\"YearlyPer\"].apply(_parse_percent),\n",
"    Yearly=forcast[\"Yearly\"].apply(_parse_int),\n",
")\n",
"\n",
"# NOTE(review): Density is left as text (values like '2,003'); convert it with\n",
"# _parse_int as well if it is meant to be a numeric feature - TODO confirm.\n",
"info = info.drop(\n",
"    columns=[\"Migrants\", \"FertRate\", \"MedAge\", \"UrbanPop\", \"WorldShare\"]\n",
").assign(\n",
"    Population2020=lambda frame: frame[\"Population2020\"].apply(_parse_int),\n",
"    Yearly=lambda frame: frame[\"Yearly\"].apply(_parse_percent),\n",
"    NetChange=lambda frame: frame[\"NetChange\"].apply(_parse_int),\n",
"    LandArea=lambda frame: frame[\"LandArea\"].apply(_parse_int),\n",
")\n",
"\n",
"info, forcast, capitals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Создание сущностей в featuretools\n",
"\n",
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 18,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n"
]
},
{
"data": {
"text/plain": [
"Entityset: countries\n",
" DataFrames:\n",
" countries [Rows: 235, Columns: 7]\n",
" capitals [Rows: 234, Columns: 3]\n",
" forcast [Rows: 7, Columns: 8]\n",
" Relationships:\n",
" No relationships"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 18,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"es = ft.EntitySet(id=\"countries\")\n",
"\n",
"# Per-country statistics, keyed by the \"no\" column.\n",
"es = es.add_dataframe(\n",
"    dataframe_name=\"countries\",\n",
"    dataframe=info,\n",
"    index=\"no\",\n",
"    logical_types={\"Country\": Categorical},\n",
")\n",
"# Capital and continent per country, keyed by the country name.\n",
"es = es.add_dataframe(\n",
"    dataframe_name=\"capitals\",\n",
"    dataframe=capitals,\n",
"    index=\"Country\",\n",
"    logical_types={\n",
"        \"Country\": Categorical,\n",
"        \"Capital\": Categorical,\n",
"        \"Continent\": Categorical,\n",
"    },\n",
")\n",
"# \"forcast_id\" is not a column of the data; make_index=True creates it as the key.\n",
"es = es.add_dataframe(\n",
"    dataframe_name=\"forcast\",\n",
"    dataframe=forcast,\n",
"    index=\"forcast_id\",\n",
"    make_index=True,\n",
"    logical_types={\n",
"        # Year holds bare years such as 2020, so state the format explicitly\n",
"        # instead of leaving the datetime parsing to format inference.\n",
"        \"Year\": Datetime(datetime_format=\"%Y\"),\n",
"    },\n",
")\n",
"\n",
"es"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Настройка связей между сущностями featuretools\n",
"\n",
"Настройка связей между таблицами на уровне ключей\n",
"\n",
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 19,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Entityset: countries\n",
" DataFrames:\n",
" countries [Rows: 235, Columns: 7]\n",
" capitals [Rows: 234, Columns: 3]\n",
" forcast [Rows: 7, Columns: 8]\n",
" Relationships:\n",
" countries.Country -> capitals.Country"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 19,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Link the two tables on the country name: capitals is the parent, countries the child.\n",
"es = es.add_relationship(\n",
"    parent_dataframe_name=\"capitals\",\n",
"    parent_column_name=\"Country\",\n",
"    child_dataframe_name=\"countries\",\n",
"    child_column_name=\"Country\",\n",
")\n",
"\n",
"es"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Автоматическое конструирование признаков с помощью featuretools\n",
"\n",
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n",
"\n",
"Результат помещается в Dataframe feature_matrix"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 20,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country</th>\n",
" <th>Population2020</th>\n",
" <th>Yearly</th>\n",
" <th>NetChange</th>\n",
" <th>LandArea</th>\n",
" <th>capitals.Capital</th>\n",
" <th>capitals.Continent</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>China</td>\n",
" <td>1439323776</td>\n",
" <td>0.39</td>\n",
" <td>5540090</td>\n",
" <td>9388211</td>\n",
" <td>Beijing</td>\n",
" <td>Asia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>India</td>\n",
" <td>1380004385</td>\n",
" <td>0.99</td>\n",
" <td>13586631</td>\n",
" <td>2973190</td>\n",
" <td>New Delhi</td>\n",
" <td>Asia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>United States</td>\n",
" <td>331002651</td>\n",
" <td>0.59</td>\n",
" <td>1937734</td>\n",
" <td>9147420</td>\n",
" <td>Washington, D.C.</td>\n",
" <td>North America</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Indonesia</td>\n",
" <td>273523615</td>\n",
" <td>1.07</td>\n",
" <td>2898047</td>\n",
" <td>1811570</td>\n",
" <td>Jakarta</td>\n",
" <td>Asia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Pakistan</td>\n",
" <td>220892340</td>\n",
" <td>2.00</td>\n",
" <td>4327022</td>\n",
" <td>770880</td>\n",
" <td>Islamabad</td>\n",
" <td>Asia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>231</th>\n",
" <td>Montserrat</td>\n",
" <td>4992</td>\n",
" <td>0.06</td>\n",
" <td>3</td>\n",
" <td>100</td>\n",
" <td>Brades</td>\n",
" <td>North America</td>\n",
" </tr>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>Falkland Islands</td>\n",
" <td>3480</td>\n",
" <td>3.05</td>\n",
" <td>103</td>\n",
" <td>12170</td>\n",
" <td>Stanley</td>\n",
" <td>South America</td>\n",
" </tr>\n",
" <tr>\n",
" <th>233</th>\n",
" <td>Niue</td>\n",
" <td>1626</td>\n",
" <td>0.68</td>\n",
" <td>11</td>\n",
" <td>260</td>\n",
" <td>Alofi</td>\n",
" <td>Oceania</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>Tokelau</td>\n",
" <td>1357</td>\n",
" <td>1.27</td>\n",
" <td>17</td>\n",
" <td>10</td>\n",
" <td>Nukunonu</td>\n",
" <td>Oceania</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>Holy See</td>\n",
" <td>801</td>\n",
" <td>0.25</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>235 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" Country Population2020 Yearly NetChange LandArea \\\n",
"no \n",
"1 China 1439323776 0.39 5540090 9388211 \n",
"2 India 1380004385 0.99 13586631 2973190 \n",
"3 United States 331002651 0.59 1937734 9147420 \n",
"4 Indonesia 273523615 1.07 2898047 1811570 \n",
"5 Pakistan 220892340 2.00 4327022 770880 \n",
".. ... ... ... ... ... \n",
"231 Montserrat 4992 0.06 3 100 \n",
"232 Falkland Islands 3480 3.05 103 12170 \n",
"233 Niue 1626 0.68 11 260 \n",
"234 Tokelau 1357 1.27 17 10 \n",
"235 Holy See 801 0.25 2 0 \n",
"\n",
" capitals.Capital capitals.Continent \n",
"no \n",
"1 Beijing Asia \n",
"2 New Delhi Asia \n",
"3 Washington, D.C. North America \n",
"4 Jakarta Asia \n",
"5 Islamabad Asia \n",
".. ... ... \n",
"231 Brades North America \n",
"232 Stanley South America \n",
"233 Alofi Oceania \n",
"234 Nukunonu Oceania \n",
"235 NaN NaN \n",
"\n",
"[235 rows x 7 columns]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 20,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Deep Feature Synthesis with the countries table as the target, limited to depth 1.\n",
"feature_matrix, feature_defs = ft.dfs(\n",
"    target_dataframe_name=\"countries\",\n",
"    entityset=es,\n",
"    max_depth=1,\n",
")\n",
"\n",
"feature_matrix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Полученные признаки\n",
"\n",
"Список колонок полученного dataframe'а"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 21,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Feature: Country>,\n",
" <Feature: Population2020>,\n",
" <Feature: Yearly>,\n",
" <Feature: NetChange>,\n",
" <Feature: LandArea>,\n",
" <Feature: capitals.Capital>,\n",
" <Feature: capitals.Continent>]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 21,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_defs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Отсечение значений признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определение выбросов с помощью boxplot"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 22,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 22,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
2024-11-08 22:37:34 +04:00
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGsCAYAAAAPJKchAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAvfklEQVR4nO3df1hVZb7//9dmAxsoQU0FNQyL0rokZXQknEhMfqTFxHhVTnbUPOX51tS5LPKcwknNLDFF08oOx2a0PFNqmZLHzORQKiXlEaN0Jk1TYzJBraNbwWC7Wd8//LBHApStuG/ZPB/X5XWx7nWvtd575lp7v1rrXuu2WZZlCQAAwJAA0wUAAIC2jTACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjGpVYWTTpk3KyMhQt27dZLPZlJ+f7/U+3n77bfXr109hYWG66qqrNHv27JYvFAAANFurCiOVlZXq27evFixYcF7bf/DBB7rvvvv00EMPaceOHXr11Vf14osv6pVXXmnhSgEAQHPZWutEeTabTatWrVJmZqanrbq6Wn/84x+1dOlSHT16VH369NELL7yg5ORkSdKoUaPkcrn0zjvveLZ5+eWXNWvWLJWVlclms/n4UwAAgFZ1ZeRcHn30URUXF2vZsmX66quvdPfdd+u2227T7t27JZ0OKyEhIfW2CQ0N1ffff6/vvvvORMkAALR5fhNGysrKtHjxYr3zzjtKSkrSNddco4kTJ+rmm2/W4sWLJUnp6elauXKlCgsLVVtbq2+++UZz5syRJB08eNBk+QAAtFmBpgtoKdu3b5fb7dZ1111Xr726ulpXXHGFJGn8+PH69ttvdccdd8jlcik8PFwTJkzQM888o4AAv8llAAC0Kn4TRk6cOCG73a6SkhLZ7fZ66y6//HJJp8eZvPDCC5oxY4bKy8vVuXNnFRYWSpKuvvpqn9cMAAD8KIzEx8fL7Xbr0KFDSkpKOmtfu92u7t27S5KWLl2qxMREde7c2RdlAgCAX2hVYeTEiRPas2ePZ3nfvn0qLS1Vx44ddd111+m+++7TmDFjNGfOHMXHx+vw4cMqLCzUjTfeqNtvv11HjhzRihUrlJycrJ9//tkzxmTjxo0GPxUAAG1bq3q0d8OGDRoyZEiD9rFjx+r111+Xy+XSc889pyVLlujAgQPq1KmTbrrpJk2bNk1xcXE6cuSIMjIytH37dlmWpcTERD3//PNKSEgw8GkAAIDUysIIAADwPzxCAgAAjCKMAAAAo1rFANba2lr98MMPateuHa9sBwCglbAsS8ePH1e3bt3O+j6vVhFGfvjhB0VHR5suAwAAnIe///3vuvLKK5tc3yrCSLt27SSd/jDh4eGGqwHQklwul9avX6+0tDQFBQWZLgdAC3I6nYqOjvb8jjelVYSRulsz4eHhhBHAz7hcLoWFhSk8PJwwAvipcw2xYAArAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCABj3G63Nm7cqE2bNmnjxo1yu92mSwJgAGEEgBErV65UbGysUlNTNXfuXKWmpio2NlYrV640XRoAHyOMAPC5lStX6q677lJcXJyKioq0dOlSFRUVKS4uTnfddReBBGhjbJZlWaaLOBen06mIiAgdO3aMuWmAVs7tdis2NlZxcXHKz8+X2+3W2rVrNXz4cNntdmVmZmrHjh3avXu37Ha76XIBXIDm/n5zZQSATxUVFWn//v2aNGmSAgLqfwUFBAQoOztb+/btU1FRkaEKAfgaYQSATx08eFCS1KdPn0bX17XX9QPg/wgjAHyqa9eukqQdO3Y0ur6uva4fAP9HGAHgU0lJSYqJidGMGTNUW1tbb11tba1ycnLUs2dPJSUlGaoQgK8RRgD4lN1u15w5c7RmzRplZmbqs88+08mTJ/XZZ58pMzNTa9asUW5uLo
NXgTYk0HQBANqeESNGaMWKFXriiSd0yy23eNp79uypFStWaMSIEQarA+BrPNoLwBi3262PP/5YH3zwgYYNG6YhQ4ZwRQTwIxft0d5NmzYpIyND3bp1k81mU35+frO3/fTTTxUYGKh+/fp5e1gAfshut2vw4MG65ZZbNHjwYIII0EZ5HUYqKyvVt29fLViwwKvtjh49qjFjxmjo0KHeHhIAAPgxr8eMDBs2TMOGDfP6QA899JBGjRolu93u1dUUAADg33wygHXx4sXau3ev/vKXv+i55547Z//q6mpVV1d7lp1OpyTJ5XLJ5XJdtDoB+Jbb7daGDRu0adMmORwOJScnc6sG8CPN/c2+6GFk9+7deuqpp1RUVKTAwOYdLicnR9OmTWvQvn79eoWFhbV0iQAMKC4u1uLFi3Xo0CFJ0ty5c9WlSxeNGzdOiYmJhqsD0BKqqqqa1e+ihhG3261Ro0Zp2rRpuu6665q9XXZ2trKysjzLTqdT0dHRSktL42kawA+sWrVKs2bN0vDhwzVx4kSVl5crKipKubm5mjVrlpYtW6bf/e53pssEcIHq7mycywU92muz2bRq1SplZmY2uv7o0aPq0KFDvcuutbW1sixLdrtd69ev16233nrO4/BoL+A/mLUXaDua+/t9Ua+MhIeHa/v27fXaXn31VX300UdasWKFevbseTEPD+ASVDdr79KlSxUQECC32+1ZVzdr76BBg1RUVKTk5GRzhQLwGa/DyIkTJ7Rnzx7P8r59+1RaWqqOHTuqR48eys7O1oEDB7RkyRIFBAQ0mJmzS5cuCgkJaXLGTgD+jVl7AfyS1+8Z2bp1q+Lj4xUfHy9JysrKUnx8vKZMmSLp9BdIWVlZy1YJwG8way+AX+J18AB8ijEjQNtx0V4HDwAXgll7AfwSs/YC8Dlm7QVwJm7TADCGWXsB/3ZJPNoLAGdTN2tvZWUls/YCbRhjRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAY5XUY2bRpkzIyMtStWzfZbDbl5+eftf/KlSuVmpqqzp07Kzw8XImJifrwww/Pt14AAOBnvA4jlZWV6tu3rxYsWNCs/ps2bVJqaqrWrl2rkpISDRkyRBkZGfriiy+8LhYAAPifQG83GDZsmIYNG9bs/vPmzau3PGPGDL333nv67//+b8XHx3t7eAAA4Ge8DiMXqra2VsePH1fHjh2b7FNdXa3q6mrPstPplCS5XC65XK6LXiMA36k7pzm3Af/T3PPa52EkNzdXJ06c0D333NNkn5ycHE2bNq1B+/r16xUWFnYxywNgSEFBgekSALSwqqqqZvWzWZZlne9BbDabVq1apczMzGb1f+uttzR+/Hi99957SklJabJfY1dGoqOjdeTIEYWHh59vuQAuQS6XSwUFBUpNTVVQUJDpcgC0IKfTqU6dOunYsWNn/f322ZWRZcuW6cEHH9Q777xz1iAiSQ6HQw6Ho0F7UFAQX1aAn+L8BvxPc89pn7xnZOnSpRo3bpyWLl2q22+/3ReHBAAArYTXV0ZOnDihPXv2eJb37dun0tJSdezYUT169FB2drYOHDigJUuWSDp9a2bs2LGaP3++EhISVF5eLkkKDQ1VREREC30MAADQWnl9ZWTr1q2Kj4/3PJablZWl+Ph4TZkyRZJ08OBBlZWVefovXLhQp06d0iOPPKKuXbt6/k2YMKGFPgIAAGjNvL4ykpycrLONeX399dfrLW/YsMHbQwAAgDaEuWkAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGE
UYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAA
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
2024-10-23 13:43:55 +04:00
}
],
"source": [
"# Box plot of Population2020; points drawn beyond the whiskers are the outlier candidates.\n",
"ax = countries.boxplot(column=\"Population2020\")\n",
"# The trailing semicolon keeps the return value of ax.set out of the cell output.\n",
"ax.set(title=\"Population2020 distribution\", ylabel=\"Population\");"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-08 22:37:34 +04:00
"Отсечение значений признака Население, превышающих 50000000\n"
2024-10-23 13:43:55 +04:00
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 23,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country</th>\n",
" <th>Population2020</th>\n",
" <th>PopulationClip</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>China</td>\n",
" <td>1439323776</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>India</td>\n",
" <td>1380004385</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>United States</td>\n",
" <td>331002651</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Indonesia</td>\n",
" <td>273523615</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Pakistan</td>\n",
" <td>220892340</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Brazil</td>\n",
" <td>212559417</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Nigeria</td>\n",
" <td>206139589</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Bangladesh</td>\n",
" <td>164689383</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Russia</td>\n",
" <td>145934462</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Mexico</td>\n",
" <td>128932753</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Japan</td>\n",
" <td>126476461</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Ethiopia</td>\n",
" <td>114963588</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Philippines</td>\n",
" <td>109581078</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Egypt</td>\n",
" <td>102334404</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Vietnam</td>\n",
" <td>97338579</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>DR Congo</td>\n",
" <td>89561403</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Turkey</td>\n",
" <td>84339067</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Iran</td>\n",
" <td>83992949</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Germany</td>\n",
" <td>83783942</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Thailand</td>\n",
" <td>69799978</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>United Kingdom</td>\n",
" <td>67886011</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>France</td>\n",
" <td>65273511</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Italy</td>\n",
" <td>60461826</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Tanzania</td>\n",
" <td>59734218</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>South Africa</td>\n",
" <td>59308690</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>Myanmar</td>\n",
" <td>54409800</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Kenya</td>\n",
" <td>53771296</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>South Korea</td>\n",
" <td>51269185</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Colombia</td>\n",
" <td>50882891</td>\n",
" <td>50000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Country Population2020 PopulationClip\n",
"no \n",
"1 China 1439323776 50000000\n",
"2 India 1380004385 50000000\n",
"3 United States 331002651 50000000\n",
"4 Indonesia 273523615 50000000\n",
"5 Pakistan 220892340 50000000\n",
"6 Brazil 212559417 50000000\n",
"7 Nigeria 206139589 50000000\n",
"8 Bangladesh 164689383 50000000\n",
"9 Russia 145934462 50000000\n",
"10 Mexico 128932753 50000000\n",
"11 Japan 126476461 50000000\n",
"12 Ethiopia 114963588 50000000\n",
"13 Philippines 109581078 50000000\n",
"14 Egypt 102334404 50000000\n",
"15 Vietnam 97338579 50000000\n",
"16 DR Congo 89561403 50000000\n",
"17 Turkey 84339067 50000000\n",
"18 Iran 83992949 50000000\n",
"19 Germany 83783942 50000000\n",
"20 Thailand 69799978 50000000\n",
"21 United Kingdom 67886011 50000000\n",
"22 France 65273511 50000000\n",
"23 Italy 60461826 50000000\n",
"24 Tanzania 59734218 50000000\n",
"25 South Africa 59308690 50000000\n",
"26 Myanmar 54409800 50000000\n",
"27 Kenya 53771296 50000000\n",
"28 South Korea 51269185 50000000\n",
"29 Colombia 50882891 50000000"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 23,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"countries_norm = countries.copy()\n",
"\n",
2024-11-08 22:37:34 +04:00
"countries_norm[\"PopulationClip\"] = countries_norm[\"Population2020\"].clip(0, 50000000)\n",
2024-10-23 13:43:55 +04:00
"\n",
"countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
" [\"Country\", \"Population2020\", \"PopulationClip\"]\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Винсоризация признака Население"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 24,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"111195830.99999991\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country</th>\n",
" <th>Population2020</th>\n",
" <th>PopulationWinsorized</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>China</td>\n",
" <td>1439323776</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>India</td>\n",
" <td>1380004385</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>United States</td>\n",
" <td>331002651</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Indonesia</td>\n",
" <td>273523615</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Pakistan</td>\n",
" <td>220892340</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Brazil</td>\n",
" <td>212559417</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Nigeria</td>\n",
" <td>206139589</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Bangladesh</td>\n",
" <td>164689383</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Russia</td>\n",
" <td>145934462</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Mexico</td>\n",
" <td>128932753</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Japan</td>\n",
" <td>126476461</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Ethiopia</td>\n",
" <td>114963588</td>\n",
" <td>114963588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Philippines</td>\n",
" <td>109581078</td>\n",
" <td>109581078</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Egypt</td>\n",
" <td>102334404</td>\n",
" <td>102334404</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Vietnam</td>\n",
" <td>97338579</td>\n",
" <td>97338579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>DR Congo</td>\n",
" <td>89561403</td>\n",
" <td>89561403</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Turkey</td>\n",
" <td>84339067</td>\n",
" <td>84339067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Iran</td>\n",
" <td>83992949</td>\n",
" <td>83992949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Germany</td>\n",
" <td>83783942</td>\n",
" <td>83783942</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Thailand</td>\n",
" <td>69799978</td>\n",
" <td>69799978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>United Kingdom</td>\n",
" <td>67886011</td>\n",
" <td>67886011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>France</td>\n",
" <td>65273511</td>\n",
" <td>65273511</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Italy</td>\n",
" <td>60461826</td>\n",
" <td>60461826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Tanzania</td>\n",
" <td>59734218</td>\n",
" <td>59734218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>South Africa</td>\n",
" <td>59308690</td>\n",
" <td>59308690</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>Myanmar</td>\n",
" <td>54409800</td>\n",
" <td>54409800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Kenya</td>\n",
" <td>53771296</td>\n",
" <td>53771296</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>South Korea</td>\n",
" <td>51269185</td>\n",
" <td>51269185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Colombia</td>\n",
" <td>50882891</td>\n",
" <td>50882891</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Country Population2020 PopulationWinsorized\n",
"no \n",
"1 China 1439323776 114963588\n",
"2 India 1380004385 114963588\n",
"3 United States 331002651 114963588\n",
"4 Indonesia 273523615 114963588\n",
"5 Pakistan 220892340 114963588\n",
"6 Brazil 212559417 114963588\n",
"7 Nigeria 206139589 114963588\n",
"8 Bangladesh 164689383 114963588\n",
"9 Russia 145934462 114963588\n",
"10 Mexico 128932753 114963588\n",
"11 Japan 126476461 114963588\n",
"12 Ethiopia 114963588 114963588\n",
"13 Philippines 109581078 109581078\n",
"14 Egypt 102334404 102334404\n",
"15 Vietnam 97338579 97338579\n",
"16 DR Congo 89561403 89561403\n",
"17 Turkey 84339067 84339067\n",
"18 Iran 83992949 83992949\n",
"19 Germany 83783942 83783942\n",
"20 Thailand 69799978 69799978\n",
"21 United Kingdom 67886011 67886011\n",
"22 France 65273511 65273511\n",
"23 Italy 60461826 60461826\n",
"24 Tanzania 59734218 59734218\n",
"25 South Africa 59308690 59308690\n",
"26 Myanmar 54409800 54409800\n",
"27 Kenya 53771296 53771296\n",
"28 South Korea 51269185 51269185\n",
"29 Colombia 50882891 50882891"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 24,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.stats.mstats import winsorize\n",
"\n",
"population = countries_norm[\"Population2020\"]\n",
"\n",
"# Print the 95th percentile of the raw column for reference.\n",
"print(population.quantile(q=0.95))\n",
"\n",
"# Winsorize the upper tail only: limits=(0, 0.05) leaves the low end untouched\n",
"# and caps the top 5% of values. Missing entries are mean-imputed beforehand.\n",
"countries_norm[\"PopulationWinsorized\"] = winsorize(\n",
"    population.fillna(population.mean()), limits=(0, 0.05), inplace=False\n",
")\n",
"\n",
"# Show the most populous countries, where the effect of the cap is visible.\n",
"countries_norm.loc[\n",
"    population > 50000000,\n",
"    [\"Country\", \"Population2020\", \"PopulationWinsorized\"],\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Нормализация значений"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 25,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country</th>\n",
" <th>Population2020</th>\n",
" <th>PopulationNorm</th>\n",
" <th>PopulationClipNorm</th>\n",
" <th>PopulationWinsorizedNorm</th>\n",
" <th>PopulationWinsorizedNorm2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>China</td>\n",
" <td>1439323776</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>India</td>\n",
" <td>1380004385</td>\n",
" <td>9.587866e-01</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>United States</td>\n",
" <td>331002651</td>\n",
" <td>2.299705e-01</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Indonesia</td>\n",
" <td>273523615</td>\n",
" <td>1.900357e-01</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Pakistan</td>\n",
" <td>220892340</td>\n",
" <td>1.534691e-01</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>231</th>\n",
" <td>Montserrat</td>\n",
" <td>4992</td>\n",
" <td>2.911786e-06</td>\n",
" <td>0.000084</td>\n",
" <td>0.000036</td>\n",
" <td>-0.999927</td>\n",
" </tr>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>Falkland Islands</td>\n",
" <td>3480</td>\n",
" <td>1.861292e-06</td>\n",
" <td>0.000054</td>\n",
" <td>0.000023</td>\n",
" <td>-0.999953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>233</th>\n",
" <td>Niue</td>\n",
" <td>1626</td>\n",
" <td>5.731862e-07</td>\n",
" <td>0.000017</td>\n",
" <td>0.000007</td>\n",
" <td>-0.999986</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>Tokelau</td>\n",
" <td>1357</td>\n",
" <td>3.862927e-07</td>\n",
" <td>0.000011</td>\n",
" <td>0.000005</td>\n",
" <td>-0.999990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>Holy See</td>\n",
" <td>801</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>-1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>235 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" Country Population2020 PopulationNorm PopulationClipNorm \\\n",
"no \n",
"1 China 1439323776 1.000000e+00 1.000000 \n",
"2 India 1380004385 9.587866e-01 1.000000 \n",
"3 United States 331002651 2.299705e-01 1.000000 \n",
"4 Indonesia 273523615 1.900357e-01 1.000000 \n",
"5 Pakistan 220892340 1.534691e-01 1.000000 \n",
".. ... ... ... ... \n",
"231 Montserrat 4992 2.911786e-06 0.000084 \n",
"232 Falkland Islands 3480 1.861292e-06 0.000054 \n",
"233 Niue 1626 5.731862e-07 0.000017 \n",
"234 Tokelau 1357 3.862927e-07 0.000011 \n",
"235 Holy See 801 0.000000e+00 0.000000 \n",
"\n",
" PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n",
"no \n",
"1 1.000000 1.000000 \n",
"2 1.000000 1.000000 \n",
"3 1.000000 1.000000 \n",
"4 1.000000 1.000000 \n",
"5 1.000000 1.000000 \n",
".. ... ... \n",
"231 0.000036 -0.999927 \n",
"232 0.000023 -0.999953 \n",
"233 0.000007 -0.999986 \n",
"234 0.000005 -0.999990 \n",
"235 0.000000 -1.000000 \n",
"\n",
"[235 rows x 6 columns]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 25,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"min_max_scaler = preprocessing.MinMaxScaler()\n",
"\n",
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
"\n",
"# (new column, source column, scaler). fit_transform re-fits the scaler on\n",
"# every call, so one scaler object can be shared between several columns.\n",
"normalization_plan = [\n",
"    (\"PopulationNorm\", \"Population2020\", min_max_scaler),\n",
"    (\"PopulationClipNorm\", \"PopulationClip\", min_max_scaler),\n",
"    (\"PopulationWinsorizedNorm\", \"PopulationWinsorized\", min_max_scaler),\n",
"    (\"PopulationWinsorizedNorm2\", \"PopulationWinsorized\", min_max_scaler_2),\n",
"]\n",
"\n",
"for norm_column, source_column, scaler in normalization_plan:\n",
"    # scikit-learn scalers take a 2-D (n_samples, n_features) array, so the\n",
"    # column is reshaped to (n, 1) and the result is flattened back to 1-D.\n",
"    countries_norm[norm_column] = scaler.fit_transform(\n",
"        countries_norm[source_column].to_numpy().reshape(-1, 1)\n",
"    ).ravel()\n",
"\n",
"# Raw population next to every normalized variant, in the order defined above.\n",
"countries_norm[\n",
"    [\"Country\", \"Population2020\"] + [column for column, _, _ in normalization_plan]\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Стандартизация значений"
]
},
{
"cell_type": "code",
2024-11-08 22:37:34 +04:00
"execution_count": 27,
2024-10-23 13:43:55 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Country</th>\n",
" <th>Population2020</th>\n",
" <th>PopulationStand</th>\n",
" <th>PopulationClipStand</th>\n",
" <th>PopulationWinsorizedStand</th>\n",
" </tr>\n",
" <tr>\n",
" <th>no</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>China</td>\n",
" <td>1439323776</td>\n",
" <td>10.427597</td>\n",
" <td>2.073933</td>\n",
" <td>3.171659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>India</td>\n",
" <td>1380004385</td>\n",
" <td>9.987702</td>\n",
" <td>2.073933</td>\n",
" <td>3.171659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>United States</td>\n",
" <td>331002651</td>\n",
" <td>2.208627</td>\n",
" <td>2.073933</td>\n",
" <td>3.171659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Indonesia</td>\n",
" <td>273523615</td>\n",
" <td>1.782380</td>\n",
" <td>2.073933</td>\n",
" <td>3.171659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Pakistan</td>\n",
" <td>220892340</td>\n",
" <td>1.392082</td>\n",
" <td>2.073933</td>\n",
" <td>3.171659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>231</th>\n",
" <td>Montserrat</td>\n",
" <td>4992</td>\n",
" <td>-0.245950</td>\n",
" <td>-0.795071</td>\n",
" <td>-0.621969</td>\n",
" </tr>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>Falkland Islands</td>\n",
" <td>3480</td>\n",
" <td>-0.245962</td>\n",
" <td>-0.795158</td>\n",
" <td>-0.622019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>233</th>\n",
" <td>Niue</td>\n",
" <td>1626</td>\n",
" <td>-0.245975</td>\n",
" <td>-0.795265</td>\n",
" <td>-0.622080</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>Tokelau</td>\n",
" <td>1357</td>\n",
" <td>-0.245977</td>\n",
" <td>-0.795280</td>\n",
" <td>-0.622089</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>Holy See</td>\n",
" <td>801</td>\n",
" <td>-0.245982</td>\n",
" <td>-0.795312</td>\n",
" <td>-0.622107</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>235 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Country Population2020 PopulationStand PopulationClipStand \\\n",
"no \n",
"1 China 1439323776 10.427597 2.073933 \n",
"2 India 1380004385 9.987702 2.073933 \n",
"3 United States 331002651 2.208627 2.073933 \n",
"4 Indonesia 273523615 1.782380 2.073933 \n",
"5 Pakistan 220892340 1.392082 2.073933 \n",
".. ... ... ... ... \n",
"231 Montserrat 4992 -0.245950 -0.795071 \n",
"232 Falkland Islands 3480 -0.245962 -0.795158 \n",
"233 Niue 1626 -0.245975 -0.795265 \n",
"234 Tokelau 1357 -0.245977 -0.795280 \n",
"235 Holy See 801 -0.245982 -0.795312 \n",
"\n",
" PopulationWinsorizedStand \n",
"no \n",
"1 3.171659 \n",
"2 3.171659 \n",
"3 3.171659 \n",
"4 3.171659 \n",
"5 3.171659 \n",
".. ... \n",
"231 -0.621969 \n",
"232 -0.622019 \n",
"233 -0.622080 \n",
"234 -0.622089 \n",
"235 -0.622107 \n",
"\n",
"[235 rows x 5 columns]"
]
},
2024-11-08 22:37:34 +04:00
"execution_count": 27,
2024-10-23 13:43:55 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"standard_scaler = preprocessing.StandardScaler()\n",
"\n",
"# Standardized column -> source column. fit_transform re-fits the scaler on\n",
"# every call, so each column is standardized with its own mean and variance.\n",
"standardization_plan = {\n",
"    \"PopulationStand\": \"Population2020\",\n",
"    \"PopulationClipStand\": \"PopulationClip\",\n",
"    \"PopulationWinsorizedStand\": \"PopulationWinsorized\",\n",
"}\n",
"\n",
"for stand_column, source_column in standardization_plan.items():\n",
"    # scikit-learn scalers take a 2-D (n_samples, n_features) array, so the\n",
"    # column is reshaped to (n, 1) and the result is flattened back to 1-D.\n",
"    countries_norm[stand_column] = standard_scaler.fit_transform(\n",
"        countries_norm[source_column].to_numpy().reshape(-1, 1)\n",
"    ).ravel()\n",
"\n",
"# Raw population next to every standardized variant, in the order defined above.\n",
"countries_norm[[\"Country\", \"Population2020\", *standardization_plan]]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}